From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:19 -0700 Subject: [PATCH 0001/1562] perf: Add branch stack counters Currently, the additional information of a branch entry is stored in a u64 space. With more and more information added, the space is running out. For example, the information of occurrences of events will be added for each branch. Two places were suggested to append the counters. https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/ One place is right after the flags of each branch entry. It changes the existing struct perf_branch_entry. The later ARCH specific implementation has to be really careful to consistently pick the right struct. The other place is right after the entire struct perf_branch_stack. The disadvantage is that the pointer of the extra space has to be recorded. The common interface perf_sample_save_brstack() has to be updated. The latter is much straightforward, and should be easily understood and maintained. It is implemented in the patch. Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate the event which is recorded in the branch info. The "u64 counters" may store the occurrences of several events. The information regarding the number of events/counters and the width of each counter should be exposed via sysfs as a reference for the perf tool. Define the branch_counter_nr and branch_counter_width ABI here. The support will be implemented later in the Intel-specific patch. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com --- .../testing/sysfs-bus-event_source-devices-caps | 6 ++++++ arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/amd/core.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- include/linux/perf_event.h | 17 ++++++++++++++++- include/uapi/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 8 ++++++++ 9 files changed, 46 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps index 8757dcf41c08..a5f506f7d481 100644 --- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps @@ -16,3 +16,9 @@ Description: Example output in powerpc: grep . /sys/bus/event_source/devices/cpu/caps/* /sys/bus/event_source/devices/cpu/caps/pmu_name:POWER9 + + The "branch_counter_nr" in the supported platform exposes the + maximum number of counters which can be shown in the u64 counters + of PERF_SAMPLE_BRANCH_COUNTERS, while the "branch_counter_width" + exposes the width of each counter. Both of them can be used by + the perf tool to parse the logged counters in each branch. diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 8c1f7def596e..3c14596bbfaf 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2313,7 +2313,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, struct cpu_hw_events *cpuhw; cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); - perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack); + perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack, NULL); } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e24976593a29..4ee6390b45c9 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -940,7 +940,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) continue; if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 40ad1425ffa2..40c9af124128 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1702,7 +1702,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a08f794a0e79..41a164764a84 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3047,7 +3047,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bf97ab904d40..cb3f329f8fa4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, setup_pebs_time(event, data, pebs->tsc); if (has_branch_stack(event)) - perf_sample_save_brstack(data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); - perf_sample_save_brstack(data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0367d748fae0..7897ef066027 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1139,6 +1139,10 @@ static inline bool branch_sample_priv(const struct perf_event *event) return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } +static inline bool branch_sample_counters(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; +} struct perf_sample_data { /* @@ -1173,6 +1177,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; + u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; @@ -1250,7 +1255,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, - struct perf_branch_stack *brs) + struct perf_branch_stack *brs, + u64 *brs_cntr) { int size = sizeof(u64); /* nr */ @@ -1258,7 +1264,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); + /* + * The extension space for counters is appended after the + * struct perf_branch_stack. It is used to store the occurrences + * of events of each branch. + */ + if (brs_cntr) + size += brs->nr * sizeof(u64); + data->br_stack = brs; + data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 39c6a250dd1b..4461f380425b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -235,6 +237,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -982,6 +986,12 @@ enum perf_event_type { * { u64 nr; * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi diff --git a/kernel/events/core.c b/kernel/events/core.c index 3eb26c2c6e65..d27ffd80ed67 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7341,6 +7341,14 @@ void perf_output_sample(struct perf_output_handle *handle, if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); + /* + * Add the extension space which is appended + * right after the struct perf_branch_stack. + */ + if (data->br_stack_cntr) { + size = data->br_stack->nr * sizeof(u64); + perf_output_copy(handle, data->br_stack_cntr, size); + } } else { /* * we always store at least the value of nr From 85846b27072defc7ab3dcee7ff36563a040079dc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:20 -0700 Subject: [PATCH 0002/1562] perf/x86: Add PERF_X86_EVENT_NEEDS_BRANCH_STACK flag Currently, branch_sample_type !=0 is used to check whether a branch stack setup is required. But it doesn't check the sample type, unnecessary branch stack setup may be done for a counting event. E.g., perf record -e "{branch-instructions,branch-misses}:S" -j any Also, the event only with the new PERF_SAMPLE_BRANCH_COUNTERS branch sample type may not require a branch stack setup either. Add a new flag NEEDS_BRANCH_STACK to indicate whether the event requires a branch stack setup. Replace the needs_branch_stack() by checking the new flag. The counting event check is implemented here. The later patch will take the new PERF_SAMPLE_BRANCH_COUNTERS into account. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-2-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 14 +++++++++++--- arch/x86/events/perf_event_flags.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 41a164764a84..a99449c0d77c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2527,9 +2527,14 @@ static void intel_pmu_assign_event(struct perf_event *event, int idx) perf_report_aux_output_id(event, idx); } +static __always_inline bool intel_pmu_needs_branch_stack(struct perf_event *event) +{ + return event->hw.flags & PERF_X86_EVENT_NEEDS_BRANCH_STACK; +} + static void intel_pmu_del_event(struct perf_event *event) { - if (needs_branch_stack(event)) + if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); @@ -2820,7 +2825,7 @@ static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); - if (needs_branch_stack(event)) + if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); } @@ -3897,7 +3902,10 @@ static int intel_pmu_hw_config(struct perf_event *event) x86_pmu.pebs_aliases(event); } - if (needs_branch_stack(event)) { + if (needs_branch_stack(event) && is_sampling_event(event)) + event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK; + + if (intel_pmu_needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1dc19b9b4426..a1685981c520 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -20,3 +20,4 @@ PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ From 1f2376cd03dd3b965d130ed46a7c92769d614ba1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:21 -0700 Subject: [PATCH 0003/1562] perf: Add branch_sample_call_stack Add a helper function to check call stack sample type. The later patch will invoke the function in several places. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-3-kan.liang@linux.intel.com --- arch/x86/events/core.c | 2 +- include/linux/perf_event.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 40c9af124128..09050641ce5d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -601,7 +601,7 @@ int x86_pmu_hw_config(struct perf_event *event) } } - if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7897ef066027..ac1a59c1f252 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1144,6 +1144,11 @@ static inline bool branch_sample_counters(const struct perf_event *event) return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } +static inline bool branch_sample_call_stack(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; +} + struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, From 318c4985911245508f7e0bab5265e208a38b5f18 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:22 -0700 Subject: [PATCH 0004/1562] perf/x86/intel: Reorganize attrs and is_visible Some attrs and is_visible implementations are rather far away from one another which makes the whole thing hard to interpret. There are only two attribute groups which have both .attrs and .is_visible, group_default and group_caps_lbr. Move them together. No functional changes. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-4-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a99449c0d77c..584b58df7bf6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5540,6 +5540,12 @@ static struct attribute *lbr_attrs[] = { NULL }; +static umode_t +lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + static char pmu_name_str[30]; static ssize_t pmu_name_show(struct device *cdev, @@ -5566,6 +5572,15 @@ static struct attribute *intel_pmu_attrs[] = { NULL, }; +static umode_t +default_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + if (attr == &dev_attr_allow_tsx_force_abort.attr) + return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0; + + return attr->mode; +} + static umode_t tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -5587,27 +5602,12 @@ mem_is_visible(struct kobject *kobj, struct attribute *attr, int i) return pebs_is_visible(kobj, attr, i); } -static umode_t -lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - return x86_pmu.lbr_nr ? attr->mode : 0; -} - static umode_t exra_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.version >= 2 ? attr->mode : 0; } -static umode_t -default_is_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - if (attr == &dev_attr_allow_tsx_force_abort.attr) - return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0; - - return attr->mode; -} - static struct attribute_group group_events_td = { .name = "events", }; From 33744916196b4ed7a50f6f47af7c3ad46b730ce6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:23 -0700 Subject: [PATCH 0005/1562] perf/x86/intel: Support branch counters logging The branch counters logging (A.K.A LBR event logging) introduces a per-counter indication of precise event occurrences in LBRs. It can provide a means to attribute exposed retirement latency to combinations of events across a block of instructions. It also provides a means of attributing Timed LBR latencies to events. The feature is first introduced on SRF/GRR. It is an enhancement of the ARCH LBR. It adds new fields in the LBR_INFO MSRs to log the occurrences of events on the GP counters. The information is displayed by the order of counters. The design proposed in this patch requires that the events which are logged must be in a group with the event that has LBR. If there are more than one LBR group, the counters logging information only from the current group (overflowed) are stored for the perf tool, otherwise the perf tool cannot know which and when other groups are scheduled especially when multiplexing is triggered. The user can ensure it uses the maximum number of counters that support LBR info (4 by now) by making the group large enough. The HW only logs events by the order of counters. The order may be different from the order of enabling which the perf tool can understand. When parsing the information of each branch entry, convert the counter order to the enabled order, and store the enabled order in the extension space. Unconditionally reset LBRs for an LBR event group when it's deleted. The logged counter information is only valid for the current LBR group. If another LBR group is scheduled later, the information from the stale LBRs would be otherwise wrongly interpreted. Add a sanity check in intel_pmu_hw_config(). Disable the feature if other counter filters (inv, cmask, edge, in_tx) are set or LBR call stack mode is enabled. (For the LBR call stack mode, we cannot simply flush the LBR, since it will break the call stack. Also, there is no obvious usage with the call stack mode for now.) Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't require any branch stack setup. Expose the maximum number of supported counters and the width of the counters into the sysfs. The perf tool can use the information to parse the logged counters in each branch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-5-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 103 +++++++++++++++++++++++++++-- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/intel/lbr.c | 85 +++++++++++++++++++++++- arch/x86/events/perf_event.h | 12 ++++ arch/x86/events/perf_event_flags.h | 1 + arch/x86/include/asm/msr-index.h | 5 ++ arch/x86/include/asm/perf_event.h | 4 ++ include/uapi/linux/perf_event.h | 3 + 8 files changed, 207 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 584b58df7bf6..e068a96aeb54 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2792,6 +2792,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event) static void intel_pmu_enable_event(struct perf_event *event) { + u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -2800,8 +2801,10 @@ static void intel_pmu_enable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: + if (branch_sample_counters(event)) + enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: @@ -3052,7 +3055,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + intel_pmu_lbr_save_brstack(&data, cpuc, event); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -3617,6 +3620,13 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); + /* Not all counters support the branch counter feature. */ + if (branch_sample_counters(event)) { + c2 = dyn_constraint(cpuc, c2, idx); + c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->weight = hweight64(c2->idxmsk64); + } + return c2; } @@ -3905,6 +3915,58 @@ static int intel_pmu_hw_config(struct perf_event *event) if (needs_branch_stack(event) && is_sampling_event(event)) event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK; + if (branch_sample_counters(event)) { + struct perf_event *leader, *sibling; + int num = 0; + + if (!(x86_pmu.flags & PMU_FL_BR_CNTR) || + (event->attr.config & ~INTEL_ARCH_EVENT_MASK)) + return -EINVAL; + + /* + * The branch counter logging is not supported in the call stack + * mode yet, since we cannot simply flush the LBR during e.g., + * multiplexing. Also, there is no obvious usage with the call + * stack mode. Simply forbids it for now. + * + * If any events in the group enable the branch counter logging + * feature, the group is treated as a branch counter logging + * group, which requires the extra space to store the counters. + */ + leader = event->group_leader; + if (branch_sample_call_stack(leader)) + return -EINVAL; + if (branch_sample_counters(leader)) + num++; + leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; + + for_each_sibling_event(sibling, leader) { + if (branch_sample_call_stack(sibling)) + return -EINVAL; + if (branch_sample_counters(sibling)) + num++; + } + + if (num > fls(x86_pmu.lbr_counters)) + return -EINVAL; + /* + * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't + * require any branch stack setup. + * Clear the bit to avoid unnecessary branch stack setup. + */ + if (0 == (event->attr.branch_sample_type & + ~(PERF_SAMPLE_BRANCH_PLM_ALL | + PERF_SAMPLE_BRANCH_COUNTERS))) + event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK; + + /* + * Force the leader to be a LBR event. So LBRs can be reset + * with the leader event. See intel_pmu_lbr_del() for details. + */ + if (!intel_pmu_needs_branch_stack(leader)) + return -EINVAL; + } + if (intel_pmu_needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) @@ -4383,8 +4445,13 @@ cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, */ if (event->attr.precise_ip == 3) { /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */ - if (constraint_match(&fixed0_constraint, event->hw.config)) - return &fixed0_counter0_1_constraint; + if (constraint_match(&fixed0_constraint, event->hw.config)) { + /* The fixed counter 0 doesn't support LBR event logging. */ + if (branch_sample_counters(event)) + return &counter0_1_constraint; + else + return &fixed0_counter0_1_constraint; + } switch (c->idxmsk64 & 0x3ull) { case 0x1: @@ -4563,7 +4630,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -5535,15 +5602,39 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); +static ssize_t branch_counter_nr_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", fls(x86_pmu.lbr_counters)); +} + +static DEVICE_ATTR_RO(branch_counter_nr); + +static ssize_t branch_counter_width_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", LBR_INFO_BR_CNTR_BITS); +} + +static DEVICE_ATTR_RO(branch_counter_width); + static struct attribute *lbr_attrs[] = { &dev_attr_branches.attr, + &dev_attr_branch_counter_nr.attr, + &dev_attr_branch_counter_width.attr, NULL }; static umode_t lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.lbr_nr ? attr->mode : 0; + /* branches */ + if (i == 0) + return x86_pmu.lbr_nr ? attr->mode : 0; + + return (x86_pmu.flags & PMU_FL_BR_CNTR) ? attr->mode : 0; } static char pmu_name_str[30]; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cb3f329f8fa4..d49d661ec0a7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); + intel_pmu_lbr_save_brstack(data, cpuc, event); } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index c3b0d15a9841..78cd5084104e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -676,6 +676,25 @@ void intel_pmu_lbr_del(struct perf_event *event) WARN_ON_ONCE(cpuc->lbr_users < 0); WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); perf_sched_cb_dec(event->pmu); + + /* + * The logged occurrences information is only valid for the + * current LBR group. If another LBR group is scheduled in + * later, the information from the stale LBRs will be wrongly + * interpreted. Reset the LBRs here. + * + * Only clear once for a branch counter group with the leader + * event. Because + * - Cannot simply reset the LBRs with the !cpuc->lbr_users. + * Because it's possible that the last LBR user is not in a + * branch counter group, e.g., a branch_counters group + + * several normal LBR events. + * - The LBR reset can be done with any one of the events in a + * branch counter group, since they are always scheduled together. + * It's easy to force the leader event an LBR event. + */ + if (is_branch_counters_group(event) && event == event->group_leader) + intel_pmu_lbr_reset(); } static inline bool vlbr_exclude_host(void) @@ -866,6 +885,8 @@ static __always_inline u16 get_lbr_cycles(u64 info) return cycles; } +static_assert((64 - PERF_BRANCH_ENTRY_INFO_BITS_MAX) > LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS); + static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, struct lbr_entry *entries) { @@ -898,11 +919,67 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); + + /* + * Leverage the reserved field of cpuc->lbr_entries[i] to + * temporarily store the branch counters information. + * The later code will decide what content can be disclosed + * to the perf tool. Pleae see intel_pmu_lbr_counters_reorder(). + */ + e->reserved = (info >> LBR_INFO_BR_CNTR_OFFSET) & LBR_INFO_BR_CNTR_FULL_MASK; } cpuc->lbr_stack.nr = i; } +/* + * The enabled order may be different from the counter order. + * Update the lbr_counters with the enabled order. + */ +static void intel_pmu_lbr_counters_reorder(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + int i, j, pos = 0, order[X86_PMC_IDX_MAX]; + struct perf_event *leader, *sibling; + u64 src, dst, cnt; + + leader = event->group_leader; + if (branch_sample_counters(leader)) + order[pos++] = leader->hw.idx; + + for_each_sibling_event(sibling, leader) { + if (!branch_sample_counters(sibling)) + continue; + order[pos++] = sibling->hw.idx; + } + + WARN_ON_ONCE(!pos); + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + src = cpuc->lbr_entries[i].reserved; + dst = 0; + for (j = 0; j < pos; j++) { + cnt = (src >> (order[j] * LBR_INFO_BR_CNTR_BITS)) & LBR_INFO_BR_CNTR_MASK; + dst |= cnt << j * LBR_INFO_BR_CNTR_BITS; + } + cpuc->lbr_counters[i] = dst; + cpuc->lbr_entries[i].reserved = 0; + } +} + +void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, + struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_branch_counters_group(event)) { + intel_pmu_lbr_counters_reorder(cpuc, event); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, cpuc->lbr_counters); + return; + } + + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); +} + static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) { intel_pmu_store_lbr(cpuc, NULL); @@ -1173,8 +1250,10 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) for (i = 0; i < cpuc->lbr_stack.nr; ) { if (!cpuc->lbr_entries[i].from) { j = i; - while (++j < cpuc->lbr_stack.nr) + while (++j < cpuc->lbr_stack.nr) { cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_counters[j-1] = cpuc->lbr_counters[j]; + } cpuc->lbr_stack.nr--; if (!cpuc->lbr_entries[i].from) continue; @@ -1525,8 +1604,12 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_mispred = ecx.split.lbr_mispred; x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_counters = ecx.split.lbr_counters; x86_pmu.lbr_nr = lbr_nr; + if (!!x86_pmu.lbr_counters) + x86_pmu.flags |= PMU_FL_BR_CNTR; + if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); if (x86_pmu.lbr_timed_lbr) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 53dd5d495ba6..fb56518356ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -110,6 +110,11 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +static inline bool is_branch_counters_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -283,6 +288,7 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + u64 lbr_counters[MAX_LBR_ENTRIES]; /* branch stack extra */ union { struct er_account *lbr_sel; struct er_account *lbr_ctl; @@ -888,6 +894,7 @@ struct x86_pmu { unsigned int lbr_mispred:1; unsigned int lbr_timed_lbr:1; unsigned int lbr_br_type:1; + unsigned int lbr_counters:4; void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); @@ -1012,6 +1019,7 @@ do { \ #define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ +#define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1552,6 +1560,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); +void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, + struct cpu_hw_events *cpuc, + struct perf_event *event); + void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index a1685981c520..6c977c19f2cd 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -21,3 +21,4 @@ PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f8b502867dd1..a5b0a19ccdf2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -236,6 +236,11 @@ #define LBR_INFO_CYCLES 0xffff #define LBR_INFO_BR_TYPE_OFFSET 56 #define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) +#define LBR_INFO_BR_CNTR_OFFSET 32 +#define LBR_INFO_BR_CNTR_NUM 4 +#define LBR_INFO_BR_CNTR_BITS 2 +#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0) +#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0) #define MSR_ARCH_LBR_CTL 0x000014ce #define ARCH_LBR_CTL_LBREN BIT(0) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2618ec7c3d1d..3736b8a46c04 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -31,6 +31,7 @@ #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) #define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 @@ -223,6 +224,9 @@ union cpuid28_ecx { unsigned int lbr_timed_lbr:1; /* Branch Type Field Supported */ unsigned int lbr_br_type:1; + unsigned int reserved:13; + /* Branch counters (Event Logging) Supported */ + unsigned int lbr_counters:4; } split; unsigned int full; }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4461f380425b..3a64499b0f5d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1437,6 +1437,9 @@ struct perf_branch_entry { reserved:31; }; +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + union perf_sample_weight { __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) From 96a2b48e5e1df6698f504969f0f51dc34e52ff3d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:28 +0000 Subject: [PATCH 0006/1562] cgroup: Remove unnecessary list_empty() The root hasn't been removed from the root_list, so the list can't be NULL. However, if it had been removed, attempting to destroy it once more is not possible. Let's replace this with WARN_ON_ONCE() for clarity. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9..3a436e4f0da1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + WARN_ON_ONCE(list_empty(&root->root_list)); + list_del(&root->root_list); + cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:29 +0000 Subject: [PATCH 0007/1562] cgroup: Make operations on the cgroup root_list RCU safe At present, when we perform operations on the cgroup root_list, we must hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, we can make operations on this list RCU-safe, eliminating the need to hold the cgroup_mutex during traversal. Modifications to the list only occur in the cgroup root setup and destroy paths, which should be infrequent in a production environment. In contrast, traversal may occur frequently. Therefore, making it RCU-safe would be beneficial. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a6b6b77ccb6..4caab0c6b361 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -563,6 +563,7 @@ struct cgroup_root { /* A list running through the active hierarchies */ struct list_head root_list; + struct rcu_head rcu; /* Hierarchy-specific flags */ unsigned int flags; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2..5e17f01ced9f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3a436e4f0da1..19784d44d615 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&root->root_list)); - list_del(&root->root_list); + list_del_rcu(&root->root_list); cgroup_root_count--; if (!have_favordynmods) @@ -1389,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; } @@ -1448,7 +1456,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); @@ -1456,7 +1463,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -2031,7 +2040,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* From 9067d90006df089b9a1da0d74f0cad232a5d726a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:30 +0000 Subject: [PATCH 0008/1562] cgroup: Eliminate the need for cgroup_mutex in proc_cgroup_show() The cgroup root_list is already RCU-safe. Therefore, we can replace the cgroup_mutex with the RCU read lock in some particular paths. This change will be particularly beneficial for frequent operations, such as `cat /proc/self/cgroup`, in a cgroup1-based container environment. I did stress tests with this change, as outlined below (with CONFIG_PROVE_RCU_LIST enabled): - Continuously mounting and unmounting named cgroups in some tasks, for example: cgrp_name=$1 while true do mount -t cgroup -o none,name=$cgrp_name none /$cgrp_name umount /$cgrp_name done - Continuously triggering proc_cgroup_show() in some tasks concurrently, for example: while true; do cat /proc/self/cgroup > /dev/null; done They can ran successfully after implementing this change, with no RCU warnings in dmesg. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 19784d44d615..9bb255e41cf2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6285,7 +6285,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - cgroup_lock(); + rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -6296,6 +6296,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; + cgrp = task_cgroup_from_root(tsk, root); + /* The root has already been unmounted. */ + if (!cgrp) + continue; + seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) @@ -6306,9 +6311,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, @@ -6340,7 +6342,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - cgroup_unlock(); + rcu_read_unlock(); kfree(buf); out: return retval; From 0008454e8fd30ed0017a9a35b8dd708f168931b8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:31 +0000 Subject: [PATCH 0009/1562] cgroup: Add annotation for holding namespace_sem in current_cgns_cgroup_from_root() When I initially examined the function current_cgns_cgroup_from_root(), I was perplexed by its lack of holding cgroup_mutex. However, after Michal explained the reason[0] to me, I realized that it already holds the namespace_sem. I believe this intricacy could also confuse others, so it would be advisable to include an annotation for clarification. After we replace the cgroup_mutex with RCU read lock, if current doesn't hold the namespace_sem, the root cgroup will be NULL. So let's add a WARN_ON_ONCE() for it. [0]. https://lore.kernel.org/bpf/afdnpo3jz2ic2ampud7swd6so5carkilts2mkygcaw67vbw6yh@5b5mncf7qyet Signed-off-by: Yafang Shao Cc: Michal Koutny Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9bb255e41cf2..4e610863cc37 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1420,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); + /* + * The namespace_sem is held by current, so the root cgroup can't + * be umounted. Therefore, we can ensure that the res is non-NULL. + */ + WARN_ON_ONCE(!res); return res; } From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:32 +0000 Subject: [PATCH 0010/1562] cgroup: Add a new helper for cgroup1 hierarchy A new helper is added for cgroup1 hierarchy: - task_get_cgroup1 Acquires the associated cgroup of a task within a specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID. This helper function is added to facilitate the tracing of tasks within a particular container or cgroup dir in BPF programs. It's important to note that this helper is designed specifically for cgroup1 only. tj: Use irsqsave/restore as suggested by Hou Tao . Suggested-by: Tejun Heo Signed-off-by: Yafang Shao Cc: Hou Tao Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 +++- kernel/cgroup/cgroup-internal.h | 1 - kernel/cgroup/cgroup-v1.c | 34 +++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0ef0af66080e..34aaf0e87def 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -69,6 +69,7 @@ struct css_task_iter { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include @@ -386,7 +387,6 @@ static inline void cgroup_unlock(void) * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU -extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ @@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); + #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5e17f01ced9f..520b90dd97ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,7 +164,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 76db6c67e39a..04d11a7dd95f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc) return ret; } +/** + * task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @tsk: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returned. On failure, ERR_PTR is returned. + * We limit it to cgroup1 only. + */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) +{ + struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup_root *root; + unsigned long flags; + + rcu_read_lock(); + for_each_root(root) { + /* cgroup1 only*/ + if (root == &cgrp_dfl_root) + continue; + if (root->hierarchy_id != hierarchy_id) + continue; + spin_lock_irqsave(&css_set_lock, flags); + cgrp = task_cgroup_from_root(tsk, root); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + spin_unlock_irqrestore(&css_set_lock, flags); + break; + } + rcu_read_unlock(); + return cgrp; +} + static int __init cgroup1_wq_init(void) { /* From 421fc858023b839106a9c0dc42cd8947cf981d83 Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Mon, 6 Nov 2023 23:40:34 +0530 Subject: [PATCH 0011/1562] selftests: cgroup: Fixes a typo in a comment Signed-off-by: Atul Kumar Pant Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index ff519029f6f4..8845353aca53 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -740,7 +740,7 @@ static int test_cgfreezer_ptraced(const char *root) /* * cg_check_frozen(cgroup, true) will fail here, - * because the task in in the TRACEd state. + * because the task is in the TRACEd state. */ if (cg_freeze_wait(cgroup, false)) goto cleanup; From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:52 -0400 Subject: [PATCH 0012/1562] workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask When the "isolcpus" boot command line option is used to add a set of isolated CPUs, those CPUs will be excluded automatically from wq_unbound_cpumask to avoid running work functions from unbound workqueues. Recently cpuset has been extended to allow the creation of partitions of isolated CPUs dynamically. To make it closer to the "isolcpus" in functionality, the CPUs in those isolated cpuset partitions should be excluded from wq_unbound_cpumask as well. This can be done currently by explicitly writing to the workqueue's cpumask sysfs file after creating the isolated partitions. However, this process can be error prone. Ideally, the cpuset code should be allowed to request the workqueue code to exclude those isolated CPUs from wq_unbound_cpumask so that this operation can be done automatically and the isolated CPUs will be returned back to wq_unbound_cpumask after the destructions of the isolated cpuset partitions. This patch adds a new workqueue_unbound_exclude_cpumask() function to enable that. This new function will exclude the specified isolated CPUs from wq_unbound_cpumask. To be able to restore those isolated CPUs back after the destruction of isolated cpuset partitions, a new wq_requested_unbound_cpumask is added to store the user provided unbound cpumask either from the boot command line options or from writing to the cpumask sysfs file. This new cpumask provides the basis for CPU exclusion. To enable users to understand how the wq_unbound_cpumask is being modified internally, this patch also exposes the newly introduced wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to store the cpumask to be excluded from wq_unbound_cpumask as read-only sysfs files. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 91 +++++++++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..b0b9604b76b8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..bd9d34eacd78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; @@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; @@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; @@ -5864,6 +5871,44 @@ out_unlock: return ret; } +/** + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask + * + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. + */ +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) +{ + cpumask_var_t cpumask; + int ret = 0; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + + /* + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. + */ + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); + return ret; +} + static int parse_affn_scope(const char *val) { int i; @@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { @@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void) dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { - err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } put_device(dev_root); } return err; @@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); From 14060dfc481a309e3f236127b0a77abbf249648f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:53 -0400 Subject: [PATCH 0013/1562] selftests/cgroup: Minor code cleanup and reorganization of test_cpuset_prs.sh Minor cleanup of test matrix and relocation of test_isolated() function to prepare for the next patch. There is no functional change. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- .../selftests/cgroup/test_cpuset_prs.sh | 142 +++++++++--------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index a6e9848189d6..2b825019f806 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -146,71 +146,6 @@ test_add_proc() echo $$ > $CGROUP2/cgroup.procs # Move out the task } -# -# Testing the new "isolated" partition root type -# -test_isolated() -{ - cd $CGROUP2/test - echo 2-3 > cpuset.cpus - TYPE=$(cat cpuset.cpus.partition) - [[ $TYPE = member ]] || echo member > cpuset.cpus.partition - - console_msg "Change from member to root" - test_partition root - - console_msg "Change from root to isolated" - test_partition isolated - - console_msg "Change from isolated to member" - test_partition member - - console_msg "Change from member to isolated" - test_partition isolated - - console_msg "Change from isolated to root" - test_partition root - - console_msg "Change from root to member" - test_partition member - - # - # Testing partition root with no cpu - # - console_msg "Distribute all cpus to child partition" - echo +cpuset > cgroup.subtree_control - test_partition root - - mkdir A1 - cd A1 - echo 2-3 > cpuset.cpus - test_partition root - test_effective_cpus 2-3 - cd .. - test_effective_cpus "" - - console_msg "Moving task to partition test" - test_add_proc "No space left" - cd A1 - test_add_proc "" - cd .. - - console_msg "Shrink and expand child partition" - cd A1 - echo 2 > cpuset.cpus - cd .. - test_effective_cpus 3 - cd A1 - echo 2-3 > cpuset.cpus - cd .. - test_effective_cpus "" - - # Cleaning up - console_msg "Cleaning up" - echo $$ > $CGROUP2/cgroup.procs - [[ -d A1 ]] && rmdir A1 -} - # # Cpuset controller state transition test matrix. # @@ -304,7 +239,7 @@ TEST_MATRIX=( A1:P0,A2:P2,A3:P1 2-4" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ - A1:P0,A2:P-2,A3:P-1 ." + A1:P0,A2:P-2,A3:P-1" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . . X1 . 0 A1:0-1,A2:2-4,A3:2-4 \ A1:P0,A2:P2,A3:P-1 2-4" @@ -347,10 +282,10 @@ TEST_MATRIX=( # cpus_allowed/exclusive_cpus update tests " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \ - A1:P0,A3:P-2 ." + A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . X1 . P2 . 0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \ - A1:P0,A3:P-2 ." + A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . . C3 P2 . 0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \ A1:P0,A3:P2 3" @@ -359,13 +294,13 @@ TEST_MATRIX=( A1:P0,A3:P2 3" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ . . X3 . . 0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \ - A1:P0,A3:P-2 ." + A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ . . C3 . . 0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \ - A1:P0,A3:P-2 ." + A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ . C4 . . . 0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3 \ - A1:P0,A3:P-2 ." + A1:P0,A3:P-2" # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- @@ -804,6 +739,71 @@ run_state_test() echo "All $I tests of $TEST PASSED." } +# +# Testing the new "isolated" partition root type +# +test_isolated() +{ + cd $CGROUP2/test + echo 2-3 > cpuset.cpus + TYPE=$(cat cpuset.cpus.partition) + [[ $TYPE = member ]] || echo member > cpuset.cpus.partition + + console_msg "Change from member to root" + test_partition root + + console_msg "Change from root to isolated" + test_partition isolated + + console_msg "Change from isolated to member" + test_partition member + + console_msg "Change from member to isolated" + test_partition isolated + + console_msg "Change from isolated to root" + test_partition root + + console_msg "Change from root to member" + test_partition member + + # + # Testing partition root with no cpu + # + console_msg "Distribute all cpus to child partition" + echo +cpuset > cgroup.subtree_control + test_partition root + + mkdir A1 + cd A1 + echo 2-3 > cpuset.cpus + test_partition root + test_effective_cpus 2-3 + cd .. + test_effective_cpus "" + + console_msg "Moving task to partition test" + test_add_proc "No space left" + cd A1 + test_add_proc "" + cd .. + + console_msg "Shrink and expand child partition" + cd A1 + echo 2 > cpuset.cpus + cd .. + test_effective_cpus 3 + cd A1 + echo 2-3 > cpuset.cpus + cd .. + test_effective_cpus "" + + # Cleaning up + console_msg "Cleaning up" + echo $$ > $CGROUP2/cgroup.procs + [[ -d A1 ]] && rmdir A1 +} + # # Wait for inotify event for the given file and read it # $1: cgroup file to wait for From 11e5f407b64a8fa09d1a4b336d15bd285a434c1f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:54 -0400 Subject: [PATCH 0014/1562] cgroup/cpuset: Keep track of CPUs in isolated partitions Add a new internal isolated_cpus mask to keep track of the CPUs that are in isolated partitions. Expose that new cpumask as a new root-only control file ".cpuset.cpus.isolated". tj: Updated patch description to reflect dropping __DEBUG__ prefix. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 190 +++++++++++++++++++++++++++-------------- 1 file changed, 127 insertions(+), 63 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 615daaf87f1f..19c8779798fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -204,6 +204,11 @@ struct cpuset { */ static cpumask_var_t subpartitions_cpus; +/* + * Exclusive CPUs in isolated partitions + */ +static cpumask_var_t isolated_cpus; + /* List of remote partition root children */ static struct list_head remote_children; @@ -1317,6 +1322,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, */ enum partition_cmd { partcmd_enable, /* Enable partition root */ + partcmd_enablei, /* Enable isolated partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's effective_cpus */ partcmd_invalidate, /* Make partition invalid */ @@ -1418,6 +1424,74 @@ static void reset_partition_data(struct cpuset *cs) } } +/* + * partition_xcpus_newstate - Exclusive CPUs state change + * @old_prs: old partition_root_state + * @new_prs: new partition_root_state + * @xcpus: exclusive CPUs with state change + */ +static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs == new_prs); + if (new_prs == PRS_ISOLATED) + cpumask_or(isolated_cpus, isolated_cpus, xcpus); + else + cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); +} + +/* + * partition_xcpus_add - Add new exclusive CPUs to partition + * @new_prs: new partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be added + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_add(int new_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(new_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (new_prs != parent->partition_root_state) + partition_xcpus_newstate(parent->partition_root_state, new_prs, + xcpus); + + cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); +} + +/* + * partition_xcpus_del - Remove exclusive CPUs from partition + * @old_prs: old partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be removed + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_del(int old_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (old_prs != parent->partition_root_state) + partition_xcpus_newstate(old_prs, parent->partition_root_state, + xcpus); + + cpumask_and(xcpus, xcpus, cpu_active_mask); + cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); +} + /* * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset @@ -1456,13 +1530,15 @@ static inline bool is_local_partition(struct cpuset *cs) /* * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update + * @new_prs: new partition_root_state * @tmp: temparary masks * Return: 1 if successful, 0 if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. */ -static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) +static int remote_partition_enable(struct cpuset *cs, int new_prs, + struct tmpmasks *tmp) { /* * The user must have sysadmin privilege. @@ -1485,18 +1561,14 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) return 0; spin_lock_irq(&callback_lock); - cpumask_andnot(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->new_cpus); - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->new_cpus); - + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); cs->use_parent_ecpus = false; parent->child_ecpus_count--; } - list_add(&cs->remote_sibling, &remote_children); spin_unlock_irq(&callback_lock); /* @@ -1524,13 +1596,8 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->new_cpus); - cpumask_and(tmp->new_cpus, - tmp->new_cpus, cpu_active_mask); - cpumask_or(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->new_cpus); list_del_init(&cs->remote_sibling); + partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; @@ -1557,6 +1624,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, struct tmpmasks *tmp) { bool adding, deleting; + int prs = cs->partition_root_state; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1580,20 +1648,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, goto invalidate; spin_lock_irq(&callback_lock); - if (adding) { - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->addmask); - cpumask_andnot(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->addmask); - } - if (deleting) { - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->delmask); - cpumask_and(tmp->delmask, - tmp->delmask, cpu_active_mask); - cpumask_or(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->delmask); - } + if (adding) + partition_xcpus_add(prs, NULL, tmp->addmask); + if (deleting) + partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); /* @@ -1676,11 +1734,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * - * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus - * not set) mask of the given cpuset will be taken away from parent's - * effective_cpus. The function will return 0 if all the CPUs listed in - * effective_xcpus can be granted or an error code will be returned. + * For partcmd_enable*, the cpuset is being transformed from a non-partition + * root to a partition root. The effective_xcpus (cpus_allowed if + * effective_xcpus not set) mask of the given cpuset will be taken away from + * parent's effective_cpus. The function will return 0 if all the CPUs listed + * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in effective_xcpus will be @@ -1695,7 +1753,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * * For partcmd_invalidate, the current partition will be made invalid. * - * The partcmd_enable and partcmd_disable commands are used by + * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * @@ -1760,7 +1818,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, nocpu = tasks_nocpu_error(parent, cs, xcpus); - if (cmd == partcmd_enable) { + if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* * Enabling partition root is not allowed if its * effective_xcpus is empty or doesn't overlap with @@ -1783,6 +1841,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, cpumask_copy(tmp->delmask, xcpus); deleting = true; subparts_delta++; + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus to parent's effective_cpus for @@ -1792,6 +1851,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (adding) subparts_delta--; + new_prs = PRS_MEMBER; } else if (newmask) { /* * Empty cpumask is not allowed @@ -1940,37 +2000,24 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (adding) { - if (parent == &top_cpuset) - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->addmask); - /* - * Some of the CPUs in effective_xcpus might have been offlined. - */ - cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); - cpumask_and(parent->effective_cpus, - parent->effective_cpus, cpu_active_mask); - } - if (deleting) { - if (parent == &top_cpuset) - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->delmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); - } - - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } - if (old_prs != new_prs) { cs->partition_root_state = new_prs; if (new_prs <= 0) cs->nr_subparts = 0; } + /* + * Adding to parent's effective_cpus means deletion CPUs from cs + * and vice versa. + */ + if (adding) + partition_xcpus_del(old_prs, parent, tmp->addmask); + if (deleting) + partition_xcpus_add(new_prs, parent, tmp->delmask); + if (is_partition_valid(parent)) { + parent->nr_subparts += subparts_delta; + WARN_ON_ONCE(parent->nr_subparts < 0); + } spin_unlock_irq(&callback_lock); if ((old_prs != new_prs) && (cmd == partcmd_update)) @@ -2948,6 +2995,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; + bool new_xcpus_state = false; if (old_prs == new_prs) return 0; @@ -2977,6 +3025,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + /* * cpus_allowed cannot be empty. */ @@ -2985,19 +3036,18 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_effective_cpumask(cs, partcmd_enable, - NULL, &tmpmask); + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* * If an attempt to become local partition root fails, * try to become a remote partition root instead. */ - if (err && remote_partition_enable(cs, &tmpmask)) + if (err && remote_partition_enable(cs, new_prs, &tmpmask)) err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - ; + new_xcpus_state = true; } else { /* * Switching back to member is always allowed even if it @@ -3029,6 +3079,8 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); + else if (new_xcpus_state) + partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); /* Force update if switching back to member */ @@ -3386,6 +3438,7 @@ typedef enum { FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -3582,6 +3635,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; + case FILE_ISOLATED_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); + break; default: ret = -EINVAL; } @@ -3875,6 +3931,13 @@ static struct cftype dfl_files[] = { .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, + { + .name = "cpus.isolated", + .seq_show = cpuset_common_seq_show, + .private = FILE_ISOLATED_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + }, + { } /* terminate */ }; @@ -4194,6 +4257,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); From 72c6303acfa1008c542e093bc9f9916fb99e0323 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:55 -0400 Subject: [PATCH 0015/1562] cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask To make CPUs in isolated cpuset partition closer in isolation to the boot time isolated CPUs specified in the "isolcpus" boot command line option, we need to take those CPUs out of the workqueue unbound cpumask so that work functions from the unbound workqueues won't run on those CPUs. Otherwise, they will interfere the user tasks running on those isolated CPUs. With the introduction of the workqueue_unbound_exclude_cpumask() helper function in an earlier commit, those isolated CPUs can now be taken out from the workqueue unbound cpumask. This patch also updates cgroup-v2.rst to mention that isolated CPUs will be excluded from unbound workqueue cpumask as well as updating test_cpuset_prs.sh to verify the correctness of the new *cpuset.cpus.isolated file, if available via cgroup_debug option. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 10 +- kernel/cgroup/cpuset.c | 116 +++++++++++++++--- .../selftests/cgroup/test_cpuset_prs.sh | 74 +++++++++-- 3 files changed, 166 insertions(+), 34 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3f85254f3cef..cf5651a11df8 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2358,11 +2358,11 @@ Cpuset Interface Files partition or scheduling domain. The set of exclusive CPUs is determined by the value of its "cpuset.cpus.exclusive.effective". - When set to "isolated", the CPUs in that partition will - be in an isolated state without any load balancing from the - scheduler. Tasks placed in such a partition with multiple - CPUs should be carefully distributed and bound to each of the - individual CPUs for optimal performance. + When set to "isolated", the CPUs in that partition will be in + an isolated state without any load balancing from the scheduler + and excluded from the unbound workqueues. Tasks placed in such + a partition with multiple CPUs should be carefully distributed + and bound to each of the individual CPUs for optimal performance. A partition root ("root" or "isolated") can be in one of the two possible states - valid or invalid. An invalid partition diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 19c8779798fd..1bad4007ff4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -1444,25 +1446,31 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_add(int new_prs, struct cpuset *parent, +static bool partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; + if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - if (new_prs != parent->partition_root_state) + isolcpus_updated = (new_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; } /* @@ -1470,12 +1478,15 @@ static void partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_del(int old_prs, struct cpuset *parent, +static bool partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - if (old_prs != parent->partition_root_state) + isolcpus_updated = (old_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; +} + +static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +{ + int ret; + + lockdep_assert_cpus_held(); + + if (!isolcpus_updated) + return; + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); } /* @@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { + bool isolcpus_updated; + /* * The user must have sysadmin privilege. */ @@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, return 0; spin_lock_irq(&callback_lock); - partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); @@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, parent->child_ecpus_count--; } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; } @@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { + bool isolcpus_updated; + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); - partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_del(cs->partition_root_state, + NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; reset_partition_data(cs); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, { bool adding, deleting; int prs = cs->partition_root_state; + int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, spin_lock_irq(&callback_lock); if (adding) - partition_xcpus_add(prs, NULL, tmp->addmask); + isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - partition_xcpus_del(prs, NULL, tmp->delmask); + isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; struct cpumask *xcpus; /* cs effective_xcpus */ + int isolcpus_updated = 0; bool nocpu; lockdep_assert_held(&cpuset_mutex); @@ -2010,15 +2045,18 @@ write_error: * and vice versa. */ if (adding) - partition_xcpus_del(old_prs, parent, tmp->addmask); + isolcpus_updated += partition_xcpus_del(old_prs, parent, + tmp->addmask); if (deleting) - partition_xcpus_add(new_prs, parent, tmp->delmask); + isolcpus_updated += partition_xcpus_add(new_prs, parent, + tmp->delmask); if (is_partition_valid(parent)) { parent->nr_subparts += subparts_delta; WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive(cs, new_prs); @@ -3082,6 +3120,7 @@ out: else if (new_xcpus_state) partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); @@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void) force_rebuild = true; } +/* + * Attempt to acquire a cpus_read_lock while a hotplug operation may be in + * progress. + * Return: true if successful, false otherwise + * + * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ +static bool cpuset_hotplug_cpus_read_trylock(void) +{ + int retries = 0; + + while (!cpus_read_trylock()) { + /* + * CPU hotplug still in progress. Retry 5 times + * with a 10ms wait before bailing out. + */ + if (++retries > 5) + return false; + msleep(10); + } + return true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) bool cpus_updated; bool mems_updated; bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -4417,11 +4481,13 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL)) { + partition_is_populated(cs, NULL) && + cpuset_hotplug_cpus_read_trylock()) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); + cpus_read_unlock(); } /* @@ -4432,18 +4498,28 @@ retry: * partitions. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || - tasks_nocpu_error(parent, cs, &new_cpus))) { - update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); - compute_effective_cpumask(&new_cpus, cs, parent); - cpuset_force_rebuild(); - } + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) { - update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp); - if (is_partition_valid(cs)) { + else if (is_partition_valid(parent) && is_partition_invalid(cs)) + partcmd = partcmd_update; + + /* + * cpus_read_lock needs to be held before calling + * update_parent_effective_cpumask(). To avoid circular lock + * dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ + if (partcmd >= 0) { + if (!cpuset_hotplug_cpus_read_trylock()) + goto update_tasks; + + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + cpus_read_unlock(); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 2b825019f806..7b7c4c2b6d85 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -232,11 +232,11 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \ A1:P0,A2:P1,A3:P2,B1:P1 2-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3" " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,3" " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \ - A1:P0,A2:P2,A3:P1 2-4" + A1:P0,A2:P2,A3:P1 2-4,2-3" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ A1:P0,A2:P-2,A3:P-1" @@ -248,7 +248,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3" - " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3" + " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3," # An invalidated remote partition cannot self-recover from hotplug " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2" @@ -376,7 +376,7 @@ write_cpu_online() } fi echo $VAL > $CPUFILE - pause 0.01 + pause 0.05 } # @@ -508,12 +508,14 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions + ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" [[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)" [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" [[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)" + [[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)" done } @@ -591,11 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and compare that with the expected value. +# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# if available, and compare that with the expected value. # -# Note that a sched domain of just 1 CPU will be considered isolated. +# Note that isolated CPUs from the sched/domains context include offline +# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may +# not be included in the *cpuset.cpus.isolated control file which contains +# only CPUs in isolated partitions. # -# $1 - expected isolated cpu list +# $1 - expected isolated cpu list(s) {,} +# - expected sched/domains value +# - *cpuset.cpus.isolated value = if not defined # check_isolcpus() { @@ -603,8 +611,38 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains + ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + if [[ $EXPECT_VAL = . ]] + then + EXPECT_VAL= + EXPECT_VAL2= + elif [[ $(expr $EXPECT_VAL : ".*,.*") > 0 ]] + then + set -- $(echo $EXPECT_VAL | sed -e "s/,/ /g") + EXPECT_VAL=$1 + EXPECT_VAL2=$2 + else + EXPECT_VAL2=$EXPECT_VAL + fi + + # + # Check the debug isolated cpumask, if present + # + [[ -f $ISCPUS ]] && { + ISOLCPUS=$(cat $ISCPUS) + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 + ISOLCPUS=$(cat $ISCPUS) + } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= + } + + # + # Use the sched domain in debugfs to check isolated CPUs, if available + # [[ -d $SCHED_DOMAINS ]] || return 0 - [[ $EXPECT_VAL = . ]] && EXPECT_VAL= for ((CPU=0; CPU < $NR_CPUS; CPU++)) do @@ -648,6 +686,22 @@ test_fail() exit 1 } +# +# Check to see if there are unexpected isolated CPUs left +# +null_isolcpus_check() +{ + [[ $VERBOSE -gt 0 ]] || return 0 + pause 0.02 + check_isolcpus "." + if [[ $? -ne 0 ]] + then + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 + fi +} + # # Run cpuset state transition test # $1 - test matrix name @@ -733,6 +787,7 @@ run_state_test() echo "Effective cpus changed to $NEWLIST after test $I!" exit 1 } + null_isolcpus_check [[ $VERBOSE -gt 0 ]] && echo "Test $I done." ((I++)) done @@ -802,6 +857,7 @@ test_isolated() console_msg "Cleaning up" echo $$ > $CGROUP2/cgroup.procs [[ -d A1 ]] && rmdir A1 + null_isolcpus_check } # From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 3 Nov 2023 23:13:01 -0400 Subject: [PATCH 0016/1562] cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked() When cgroup_rstat_updated() isn't being called concurrently with cgroup_rstat_flush_locked(), its run time is pretty short. When both are called concurrently, the cgroup_rstat_updated() run time can spike to a pretty high value due to high cpu_lock hold time in cgroup_rstat_flush_locked(). This can be problematic if the task calling cgroup_rstat_updated() is a realtime task running on an isolated CPU with a strict latency requirement. The cgroup_rstat_updated() call can happen when there is a page fault even though the task is running in user space most of the time. The percpu cpu_lock is used to protect the update tree - updated_next and updated_children. This protection is only needed when cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing operation which can take a much longer time does not need that protection as it is already protected by cgroup_rstat_lock. To reduce the cpu_lock hold time, we need to perform all the cgroup_rstat_cpu_pop_updated() calls up front with the lock released afterward before doing any flushing. This patch adds a new cgroup_rstat_updated_list() function to return a singly linked list of cgroups to be flushed. Some instrumentation code are added to measure the cpu_lock hold time right after lock acquisition to after releasing the lock. Parallel kernel build on a 2-socket x86-64 server is used as the benchmarking tool for measuring the lock hold time. The maximum cpu_lock hold time before and after the patch are 100us and 29us respectively. So the worst case time is reduced to about 30% of the original. However, there may be some OS or hardware noises like NMI or SMI in the test system that can worsen the worst case value. Those noises are usually tuned out in a real production environment to get a better result. OTOH, the lock hold time frequency distribution should give a better idea of the performance benefit of the patch. Below were the frequency distribution before and after the patch: Hold time Before patch After patch --------- ------------ ----------- 0-01 us 804,139 13,738,708 01-05 us 9,772,767 1,177,194 05-10 us 4,595,028 4,984 10-15 us 303,481 3,562 15-20 us 78,971 1,314 20-25 us 24,583 18 25-30 us 6,908 12 30-40 us 8,015 40-50 us 2,192 50-60 us 316 60-70 us 43 70-80 us 7 80-90 us 2 >90 us 3 Signed-off-by: Waiman Long Reviewed-by: Yosry Ahmed Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 ++++++ kernel/cgroup/rstat.c | 43 ++++++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4caab0c6b361..37518436cfe7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,13 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * A singly-linked list of cgroup structures to be rstat flushed. + * This is a scratch field to be used exclusively by + * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + */ + struct cgroup *rstat_flush_next; + /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d80d7a608141..1f300bf4dc40 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -145,6 +145,32 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* Return a list of updated cgroups to be flushed */ +static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup *head, *tail, *next; + unsigned long flags; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + raw_spin_lock_irqsave(cpu_lock, flags); + head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu); + while (tail) { + next = cgroup_rstat_cpu_pop_updated(tail, root, cpu); + tail->rstat_flush_next = next; + tail = next; + } + raw_spin_unlock_irqrestore(cpu_lock, flags); + return head; +} + /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for cgroup_rstat_updated() and @@ -179,21 +205,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, - cpu); - struct cgroup *pos = NULL; - unsigned long flags; + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); - while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); @@ -205,7 +219,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock_irqrestore(cpu_lock, flags); /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { From 02e3564a344064aca49f147e8a4eecbe5d3459fc Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 16:30:38 -0400 Subject: [PATCH 0017/1562] regmap: ram: support noinc semantics Support noinc semantics in RAM backed regmaps, for testing purposes. Add a new callback that selects registers which should have noinc behavior. Bulk writes to a noinc register will cause the last value in the buffer to be assigned to the register, while bulk reads will copy the same value repeatedly into the buffer. This patch only adds support to regmap-raw-ram, since regmap-ram does not support bulk operations. Signed-off-by: Ben Wolsieffer Link: https://lore.kernel.org/r/20231102203039.3069305-1-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 1 + drivers/base/regmap/regmap-raw-ram.c | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 9a9ea514c2d8..583dd5d7d46b 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -318,6 +318,7 @@ struct regmap_ram_data { bool *read; bool *written; enum regmap_endian reg_endian; + bool (*noinc_reg)(struct regmap_ram_data *data, unsigned int reg); }; /* diff --git a/drivers/base/regmap/regmap-raw-ram.c b/drivers/base/regmap/regmap-raw-ram.c index c9b800885f3b..463adafa9532 100644 --- a/drivers/base/regmap/regmap-raw-ram.c +++ b/drivers/base/regmap/regmap-raw-ram.c @@ -41,10 +41,15 @@ static int regmap_raw_ram_gather_write(void *context, return -EINVAL; r = decode_reg(data->reg_endian, reg); - memcpy(&our_buf[r], val, val_len); + if (data->noinc_reg && data->noinc_reg(data, r)) { + memcpy(&our_buf[r], val + val_len - 2, 2); + data->written[r] = true; + } else { + memcpy(&our_buf[r], val, val_len); - for (i = 0; i < val_len / 2; i++) - data->written[r + i] = true; + for (i = 0; i < val_len / 2; i++) + data->written[r + i] = true; + } return 0; } @@ -70,10 +75,16 @@ static int regmap_raw_ram_read(void *context, return -EINVAL; r = decode_reg(data->reg_endian, reg); - memcpy(val, &our_buf[r], val_len); + if (data->noinc_reg && data->noinc_reg(data, r)) { + for (i = 0; i < val_len; i += 2) + memcpy(val + i, &our_buf[r], 2); + data->read[r] = true; + } else { + memcpy(val, &our_buf[r], val_len); - for (i = 0; i < val_len / 2; i++) - data->read[r + i] = true; + for (i = 0; i < val_len / 2; i++) + data->read[r + i] = true; + } return 0; } From d958d97848a6604d024221920d300d07869715a2 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 16:30:39 -0400 Subject: [PATCH 0018/1562] regmap: kunit: add noinc write test Add a test for writing to a noinc register, which verifies that the write does not touch adjacent registers. This test succeeds with [1] applied and fails without it. [1] 984a4afdc87a ("regmap: prevent noinc writes from clobbering cache") Signed-off-by: Ben Wolsieffer Link: https://lore.kernel.org/r/20231102203039.3069305-2-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-kunit.c | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c index e14cc03a17f6..026bdcb45127 100644 --- a/drivers/base/regmap/regmap-kunit.c +++ b/drivers/base/regmap/regmap-kunit.c @@ -1186,6 +1186,65 @@ static void raw_write(struct kunit *test) regmap_exit(map); } +static bool reg_zero(struct device *dev, unsigned int reg) +{ + return reg == 0; +} + +static bool ram_reg_zero(struct regmap_ram_data *data, unsigned int reg) +{ + return reg == 0; +} + +static void raw_noinc_write(struct kunit *test) +{ + struct raw_test_types *t = (struct raw_test_types *)test->param_value; + struct regmap *map; + struct regmap_config config; + struct regmap_ram_data *data; + unsigned int val, val_test, val_last; + u16 val_array[BLOCK_TEST_SIZE]; + + config = raw_regmap_config; + config.volatile_reg = reg_zero; + config.writeable_noinc_reg = reg_zero; + config.readable_noinc_reg = reg_zero; + + map = gen_raw_regmap(&config, t, &data); + KUNIT_ASSERT_FALSE(test, IS_ERR(map)); + if (IS_ERR(map)) + return; + + data->noinc_reg = ram_reg_zero; + + get_random_bytes(&val_array, sizeof(val_array)); + + if (config.val_format_endian == REGMAP_ENDIAN_BIG) { + val_test = be16_to_cpu(val_array[1]) + 100; + val_last = be16_to_cpu(val_array[BLOCK_TEST_SIZE - 1]); + } else { + val_test = le16_to_cpu(val_array[1]) + 100; + val_last = le16_to_cpu(val_array[BLOCK_TEST_SIZE - 1]); + } + + /* Put some data into the register following the noinc register */ + KUNIT_EXPECT_EQ(test, 0, regmap_write(map, 1, val_test)); + + /* Write some data to the noinc register */ + KUNIT_EXPECT_EQ(test, 0, regmap_noinc_write(map, 0, val_array, + sizeof(val_array))); + + /* We should read back the last value written */ + KUNIT_EXPECT_EQ(test, 0, regmap_read(map, 0, &val)); + KUNIT_ASSERT_EQ(test, val_last, val); + + /* Make sure we didn't touch the register after the noinc register */ + KUNIT_EXPECT_EQ(test, 0, regmap_read(map, 1, &val)); + KUNIT_ASSERT_EQ(test, val_test, val); + + regmap_exit(map); +} + static void raw_sync(struct kunit *test) { struct raw_test_types *t = (struct raw_test_types *)test->param_value; @@ -1284,6 +1343,7 @@ static struct kunit_case regmap_test_cases[] = { KUNIT_CASE_PARAM(raw_read_defaults, raw_test_types_gen_params), KUNIT_CASE_PARAM(raw_write_read_single, raw_test_types_gen_params), KUNIT_CASE_PARAM(raw_write, raw_test_types_gen_params), + KUNIT_CASE_PARAM(raw_noinc_write, raw_test_types_gen_params), KUNIT_CASE_PARAM(raw_sync, raw_test_cache_types_gen_params), {} }; From c0d6b2acf78e3195a6b100a236210f2e6e42b0c0 Mon Sep 17 00:00:00 2001 From: Dang Huynh Date: Mon, 6 Nov 2023 19:08:31 +0700 Subject: [PATCH 0019/1562] regulator: qcom_spmi: Add PM8937 SPMI regulator The PM8937 has 4 HFSMPS, 2 FTSMPS2.5 (for controlling APC voltage) and 23 LDO regulators. Add the configuration for this chip. Signed-off-by: Dang Huynh Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231106-pm8937-v1-3-ec51d9eeec53@riseup.net Signed-off-by: Mark Brown --- drivers/regulator/qcom_spmi-regulator.c | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/regulator/qcom_spmi-regulator.c b/drivers/regulator/qcom_spmi-regulator.c index 94f9092b29ef..9a9fa20dcd95 100644 --- a/drivers/regulator/qcom_spmi-regulator.c +++ b/drivers/regulator/qcom_spmi-regulator.c @@ -2239,6 +2239,39 @@ static const struct spmi_regulator_data pm8916_regulators[] = { { } }; +static const struct spmi_regulator_data pm8937_regulators[] = { + { "s1", 0x1400, "vdd_s1", }, + { "s2", 0x1700, "vdd_s2", }, + { "s3", 0x1a00, "vdd_s3", }, + { "s4", 0x1d00, "vdd_s4", }, + { "s5", 0x2000, "vdd_s5", }, + { "s6", 0x2300, "vdd_s6", }, + { "l1", 0x4000, "vdd_l1_l19", }, + { "l2", 0x4100, "vdd_l2_l23", }, + { "l3", 0x4200, "vdd_l3", }, + { "l4", 0x4300, "vdd_l4_l5_l6_l7_l16", }, + { "l5", 0x4400, "vdd_l4_l5_l6_l7_l16", }, + { "l6", 0x4500, "vdd_l4_l5_l6_l7_l16", }, + { "l7", 0x4600, "vdd_l4_l5_l6_l7_l16", }, + { "l8", 0x4700, "vdd_l8_l11_l12_l17_l22", }, + { "l9", 0x4800, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l10", 0x4900, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l11", 0x4a00, "vdd_l8_l11_l12_l17_l22", }, + { "l12", 0x4b00, "vdd_l8_l11_l12_l17_l22", }, + { "l13", 0x4c00, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l14", 0x4d00, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l15", 0x4e00, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l16", 0x4f00, "vdd_l4_l5_l6_l7_l16", }, + { "l17", 0x5000, "vdd_l8_l11_l12_l17_l22", }, + { "l18", 0x5100, "vdd_l9_l10_l13_l14_l15_l18", }, + { "l19", 0x5200, "vdd_l1_l19", }, + { "l20", 0x5300, "vdd_l20_l21", }, + { "l21", 0x5400, "vdd_l21_l21", }, + { "l22", 0x5500, "vdd_l8_l11_l12_l17_l22", }, + { "l23", 0x5600, "vdd_l2_l23", }, + { } +}; + static const struct spmi_regulator_data pm8941_regulators[] = { { "s1", 0x1400, "vdd_s1", }, { "s2", 0x1700, "vdd_s2", }, @@ -2453,6 +2486,7 @@ static const struct of_device_id qcom_spmi_regulator_match[] = { { .compatible = "qcom,pm8841-regulators", .data = &pm8841_regulators }, { .compatible = "qcom,pm8909-regulators", .data = &pm8909_regulators }, { .compatible = "qcom,pm8916-regulators", .data = &pm8916_regulators }, + { .compatible = "qcom,pm8937-regulators", .data = &pm8937_regulators }, { .compatible = "qcom,pm8941-regulators", .data = &pm8941_regulators }, { .compatible = "qcom,pm8950-regulators", .data = &pm8950_regulators }, { .compatible = "qcom,pm8994-regulators", .data = &pm8994_regulators }, From f2b003c8235e0afed60ed426e891e41dab131821 Mon Sep 17 00:00:00 2001 From: Dang Huynh Date: Mon, 6 Nov 2023 19:08:32 +0700 Subject: [PATCH 0020/1562] dt-bindings: regulator: qcom,spmi-regulator: Document PM8937 PMIC Add support for qcom,pm8937-regulators compatible string and add relevant supplies in QCOM's SPMI regulator documentation. Signed-off-by: Dang Huynh Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231106-pm8937-v1-4-ec51d9eeec53@riseup.net Signed-off-by: Mark Brown --- .../regulator/qcom,spmi-regulator.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.yaml index 7a1b7d2abbd4..aea849e8eadf 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.yaml @@ -22,6 +22,7 @@ properties: - qcom,pm8841-regulators - qcom,pm8909-regulators - qcom,pm8916-regulators + - qcom,pm8937-regulators - qcom,pm8941-regulators - qcom,pm8950-regulators - qcom,pm8994-regulators @@ -291,6 +292,24 @@ allOf: patternProperties: "^vdd_s[1-3]-supply$": true + - if: + properties: + compatible: + contains: + enum: + - qcom,pm8937-regulators + then: + properties: + vdd_l1_l19-supply: true + vdd_l20_l21-supply: true + vdd_l2_l23-supply: true + vdd_l3-supply: true + vdd_l4_l5_l6_l7_l16-supply: true + vdd_l8_l11_l12_l17_l22-supply: true + vdd_l9_l10_l13_l14_l15_l18-supply: true + patternProperties: + "^vdd_s[1-6]-supply$": true + - if: properties: compatible: From 18cc1cd011131d878be2619b56eff7bc2a278bdf Mon Sep 17 00:00:00 2001 From: Dang Huynh Date: Mon, 6 Nov 2023 19:08:33 +0700 Subject: [PATCH 0021/1562] regulator: qcom_smd: Add PM8937 regulators The PM8937 is found on boards with MSM8917, MSM8937, MSM8940 SoCs and APQ variants. It provides 6 SMPS (two are controlled by SPMI) and 23 LDO regulators. Signed-off-by: Dang Huynh Reviewed-by: Stephan Gerhold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231106-pm8937-v1-5-ec51d9eeec53@riseup.net Signed-off-by: Mark Brown --- drivers/regulator/qcom_smd-regulator.c | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/regulator/qcom_smd-regulator.c b/drivers/regulator/qcom_smd-regulator.c index f53ada076252..09c471a0ba2e 100644 --- a/drivers/regulator/qcom_smd-regulator.c +++ b/drivers/regulator/qcom_smd-regulator.c @@ -1012,6 +1012,39 @@ static const struct rpm_regulator_data rpm_pm8916_regulators[] = { {} }; +static const struct rpm_regulator_data rpm_pm8937_regulators[] = { + { "s1", QCOM_SMD_RPM_SMPA, 1, &pm8994_hfsmps, "vdd_s1" }, + { "s2", QCOM_SMD_RPM_SMPA, 2, &pm8994_hfsmps, "vdd_s2" }, + { "s3", QCOM_SMD_RPM_SMPA, 3, &pm8994_hfsmps, "vdd_s3" }, + { "s4", QCOM_SMD_RPM_SMPA, 4, &pm8994_hfsmps, "vdd_s4" }, + /* S5 - S6 are managed by SPMI */ + + { "l1", QCOM_SMD_RPM_LDOA, 1, &pm8953_ult_nldo, "vdd_l1_l19" }, + { "l2", QCOM_SMD_RPM_LDOA, 2, &pm8953_ult_nldo, "vdd_l2_l23" }, + { "l3", QCOM_SMD_RPM_LDOA, 3, &pm8953_ult_nldo, "vdd_l3" }, + { "l4", QCOM_SMD_RPM_LDOA, 4, &pm8950_ult_pldo, "vdd_l4_l5_l6_l7_l16" }, + { "l5", QCOM_SMD_RPM_LDOA, 5, &pm8950_ult_pldo, "vdd_l4_l5_l6_l7_l16" }, + { "l6", QCOM_SMD_RPM_LDOA, 6, &pm8950_ult_pldo, "vdd_l4_l5_l6_l7_l16" }, + { "l7", QCOM_SMD_RPM_LDOA, 7, &pm8950_ult_pldo, "vdd_l4_l5_l6_l7_l16" }, + { "l8", QCOM_SMD_RPM_LDOA, 8, &pm8950_ult_pldo, "vdd_l8_l11_l12_l17_l22" }, + { "l9", QCOM_SMD_RPM_LDOA, 9, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18" }, + { "l10", QCOM_SMD_RPM_LDOA, 10, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18"}, + { "l11", QCOM_SMD_RPM_LDOA, 11, &pm8950_ult_pldo, "vdd_l8_l11_l12_l17_l22" }, + { "l12", QCOM_SMD_RPM_LDOA, 12, &pm8950_ult_pldo, "vdd_l8_l11_l12_l17_l22" }, + { "l13", QCOM_SMD_RPM_LDOA, 13, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18" }, + { "l14", QCOM_SMD_RPM_LDOA, 14, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18" }, + { "l15", QCOM_SMD_RPM_LDOA, 15, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18" }, + { "l16", QCOM_SMD_RPM_LDOA, 16, &pm8950_ult_pldo, "vdd_l4_l5_l6_l7_l16" }, + { "l17", QCOM_SMD_RPM_LDOA, 17, &pm8950_ult_pldo, "vdd_l8_l11_l12_l17_l22" }, + { "l18", QCOM_SMD_RPM_LDOA, 18, &pm8950_ult_pldo, "vdd_l9_l10_l13_l14_l15_l18" }, + { "l19", QCOM_SMD_RPM_LDOA, 19, &pm8953_ult_nldo, "vdd_l1_l19" }, + { "l20", QCOM_SMD_RPM_LDOA, 20, &pm8953_lnldo, "vdd_l20_l21" }, + { "l21", QCOM_SMD_RPM_LDOA, 21, &pm8953_lnldo, "vdd_l20_l21" }, + { "l22", QCOM_SMD_RPM_LDOA, 22, &pm8950_ult_pldo, "vdd_l8_l11_l12_l17_l22" }, + { "l23", QCOM_SMD_RPM_LDOA, 23, &pm8994_nldo, "vdd_l2_l23" }, + {} +}; + static const struct rpm_regulator_data rpm_pm8941_regulators[] = { { "s1", QCOM_SMD_RPM_SMPA, 1, &pm8x41_hfsmps, "vdd_s1" }, { "s2", QCOM_SMD_RPM_SMPA, 2, &pm8x41_hfsmps, "vdd_s2" }, @@ -1329,6 +1362,7 @@ static const struct of_device_id rpm_of_match[] = { { .compatible = "qcom,rpm-pm8841-regulators", .data = &rpm_pm8841_regulators }, { .compatible = "qcom,rpm-pm8909-regulators", .data = &rpm_pm8909_regulators }, { .compatible = "qcom,rpm-pm8916-regulators", .data = &rpm_pm8916_regulators }, + { .compatible = "qcom,rpm-pm8937-regulators", .data = &rpm_pm8937_regulators }, { .compatible = "qcom,rpm-pm8941-regulators", .data = &rpm_pm8941_regulators }, { .compatible = "qcom,rpm-pm8950-regulators", .data = &rpm_pm8950_regulators }, { .compatible = "qcom,rpm-pm8953-regulators", .data = &rpm_pm8953_regulators }, From 40e13ae67c6fc2897b49398d6f804b5d1ec63fff Mon Sep 17 00:00:00 2001 From: Dang Huynh Date: Mon, 6 Nov 2023 19:08:34 +0700 Subject: [PATCH 0022/1562] dt-bindings: regulator: qcom,smd-rpm-regulator: Document PM8937 IC Document the pm8937 compatible string and available regulators in the QCOM SMD RPM regulator documentation. Signed-off-by: Dang Huynh Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231106-pm8937-v1-6-ec51d9eeec53@riseup.net Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml index 9ea8ac0786ac..f2fd2df68a9e 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml @@ -47,6 +47,9 @@ description: For pm8916, s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18 + For pm8937, s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, + l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, l21, l22, l23 + For pm8941, s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, lvs1, lvs2, lvs3, 5vs1, 5vs2 @@ -92,6 +95,7 @@ properties: - qcom,rpm-pm8841-regulators - qcom,rpm-pm8909-regulators - qcom,rpm-pm8916-regulators + - qcom,rpm-pm8937-regulators - qcom,rpm-pm8941-regulators - qcom,rpm-pm8950-regulators - qcom,rpm-pm8953-regulators From 0e1c8dcbdecefea93dee19419b2f67dca591dd42 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:20 +0200 Subject: [PATCH 0023/1562] regulator: dt-bindings: Add system-critical-regulator property Introduce a new Device Tree property 'system-critical-regulator' for marking a regulator as crucial for system stability or functionality. This helps in distinguishing regulators that are vital for system operations and may require special handling in under-voltage scenarios. Signed-off-by: Oleksij Rempel Acked-by: Rob Herring Link: https://lore.kernel.org/r/20231026144824.4065145-2-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/regulator.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/regulator.yaml b/Documentation/devicetree/bindings/regulator/regulator.yaml index 9daf0fc2465f..5b8d55f7c43b 100644 --- a/Documentation/devicetree/bindings/regulator/regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/regulator.yaml @@ -114,6 +114,11 @@ properties: description: Enable pull down resistor when the regulator is disabled. type: boolean + system-critical-regulator: + description: Set if the regulator is critical to system stability or + functionality. + type: boolean + regulator-over-current-protection: description: Enable over current protection. type: boolean From 8156c7dd47b92fc4a70c9ea58e7a9e88c8bc32be Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:21 +0200 Subject: [PATCH 0024/1562] regulator: Introduce handling for system-critical under-voltage events Handle under-voltage events for crucial regulators to maintain system stability and avoid issues during power drops. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20231026144824.4065145-3-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/core.c | 38 +++++++++++++++++++++++++++++++ drivers/regulator/of_regulator.c | 2 ++ include/linux/regulator/machine.h | 10 ++++++++ 3 files changed, 50 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 3137e40fcd3e..a072f721f288 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -5061,6 +5062,41 @@ void regulator_bulk_free(int num_consumers, } EXPORT_SYMBOL_GPL(regulator_bulk_free); +/** + * regulator_handle_critical - Handle events for system-critical regulators. + * @rdev: The regulator device. + * @event: The event being handled. + * + * This function handles critical events such as under-voltage, over-current, + * and unknown errors for regulators deemed system-critical. On detecting such + * events, it triggers a hardware protection shutdown with a defined timeout. + */ +static void regulator_handle_critical(struct regulator_dev *rdev, + unsigned long event) +{ + const char *reason = NULL; + + if (!rdev->constraints->system_critical) + return; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + reason = "System critical regulator: voltage drop detected"; + break; + case REGULATOR_EVENT_OVER_CURRENT: + reason = "System critical regulator: over-current detected"; + break; + case REGULATOR_EVENT_FAIL: + reason = "System critical regulator: unknown error"; + } + + if (!reason) + return; + + hw_protection_shutdown(reason, + REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS); +} + /** * regulator_notifier_call_chain - call regulator event notifier * @rdev: regulator source @@ -5073,6 +5109,8 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free); int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { + regulator_handle_critical(rdev, event); + _notifier_call_chain(rdev, event, data); return NOTIFY_DONE; diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 1b65e5e4e40f..3bdd6f1919a4 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -131,6 +131,8 @@ static int of_get_regulation_constraints(struct device *dev, constraints->valid_ops_mask |= REGULATOR_CHANGE_STATUS; constraints->pull_down = of_property_read_bool(np, "regulator-pull-down"); + constraints->system_critical = of_property_read_bool(np, + "system-critical-regulator"); if (of_property_read_bool(np, "regulator-allow-bypass")) constraints->valid_ops_mask |= REGULATOR_CHANGE_BYPASS; diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 621b7f4a3639..e0ddfb5593c9 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -49,6 +49,13 @@ struct regulator; #define DISABLE_IN_SUSPEND 1 #define ENABLE_IN_SUSPEND 2 +/* + * Default time window (in milliseconds) following a critical under-voltage + * event during which less critical actions can be safely carried out by the + * system. + */ +#define REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS 10 + /* Regulator active discharge flags */ enum regulator_active_discharge { REGULATOR_ACTIVE_DISCHARGE_DEFAULT, @@ -127,6 +134,8 @@ struct notification_limit { * @ramp_disable: Disable ramp delay when initialising or when setting voltage. * @soft_start: Enable soft start so that voltage ramps slowly. * @pull_down: Enable pull down when regulator is disabled. + * @system_critical: Set if the regulator is critical to system stability or + * functionality. * @over_current_protection: Auto disable on over current event. * * @over_current_detection: Configure over current limits. @@ -214,6 +223,7 @@ struct regulation_constraints { unsigned ramp_disable:1; /* disable ramp delay */ unsigned soft_start:1; /* ramp voltage slowly */ unsigned pull_down:1; /* pull down resistor when regulator off */ + unsigned system_critical:1; /* critical to system stability */ unsigned over_current_protection:1; /* auto disable on over current */ unsigned over_current_detection:1; /* notify on over current */ unsigned over_voltage_detection:1; /* notify on over voltage */ From 633cd1c0a9de7609f97c0c86e3ac81153e8263b0 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:22 +0200 Subject: [PATCH 0025/1562] regulator: dt-bindings: Allow system-critical marking for fixed-regulator In certain projects, the main system regulator, composed of simple components including an under-voltage detector and capacitors, can be aptly described as a fixed regulator in the device tree. To cater to such use cases, this patch extends the fixed regulator binding to support the 'system-critical-regulator' property. This property signifies that the fixed-regulator is vital for system stability. Signed-off-by: Oleksij Rempel Acked-by: Rob Herring Link: https://lore.kernel.org/r/20231026144824.4065145-4-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/fixed-regulator.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml index ce7751b9129c..9ff9abf2691a 100644 --- a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml @@ -105,6 +105,8 @@ properties: description: Interrupt signaling a critical under-voltage event. + system-critical-regulator: true + required: - compatible - regulator-name From 759e2bd96971763db1cfaf6cafc07654b12aa21e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:23 +0200 Subject: [PATCH 0026/1562] regulator: dt-bindings: Add 'regulator-uv-less-critical-window-ms' property Introduces a new devicetree property to specifies the time window (in milliseconds) following a critical under-voltage (UV) event during which less critical actions can be safely carried out by the system. Less Critical Actions: - Logging the under-voltage event for later analysis. - Saving less critical data that may be useful for diagnosing issues or for audit purposes. More Critical Actions (post the less critical window): - Initiating procedures to properly shutdown hardware to prevent damage. The 'regulator-uv-less-critical-window-ms' property is crucial for conveying board-specific hardware characteristics, not for enforcing a certain policy. The time window represented by this property is derived from the physical attributes of the hardware like the capacity of on-board capacitors, the power consumption of the components, and the time needed to safely shut down hardware to prevent damage. These attributes can significantly vary between different boards, making it a board-specific property rather than a policy directive. By providing a precise representation of the time available for less critical actions post an under-voltage event, this property enables the kernel to make informed decisions on action prioritization, ensuring that essential preventative measures are taken to avoid hardware damage while also allowing for data capture and analysis. Signed-off-by: Oleksij Rempel Acked-by: Rob Herring Link: https://lore.kernel.org/r/20231026144824.4065145-5-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/regulator.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/regulator.yaml b/Documentation/devicetree/bindings/regulator/regulator.yaml index 5b8d55f7c43b..1ef380d1515e 100644 --- a/Documentation/devicetree/bindings/regulator/regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/regulator.yaml @@ -186,6 +186,14 @@ properties: be enabled but limit setting can be omitted. Limit is given as microvolt offset from voltage set to regulator. + regulator-uv-less-critical-window-ms: + description: Specifies the time window (in milliseconds) following a + critical under-voltage event during which the system can continue to + operate safely while performing less critical operations. This property + provides a defined duration before a more severe reaction to the + under-voltage event is needed, allowing for certain non-urgent actions to + be carried out in preparation for potential power loss. + regulator-temp-protection-kelvin: description: Set over temperature protection limit. This is a limit where hardware performs emergency shutdown. Zero can be passed to disable From 1e22152aa59d793743fc53051dd7a042f362aecb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:24 +0200 Subject: [PATCH 0027/1562] regulator: Implement uv_survival_time for handling under-voltage events Add 'uv_survival_time' field to regulation_constraints for specifying survival time post critical under-voltage event. Update the regulator notifier call chain and Device Tree property parsing to use this new field, allowing a configurable timeout before emergency shutdown. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20231026144824.4065145-6-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/core.c | 2 +- drivers/regulator/of_regulator.c | 7 +++++++ include/linux/regulator/machine.h | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index a072f721f288..a6cb84af989e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -5094,7 +5094,7 @@ static void regulator_handle_critical(struct regulator_dev *rdev, return; hw_protection_shutdown(reason, - REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS); + rdev->constraints->uv_less_critical_window_ms); } /** diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 3bdd6f1919a4..03afc160fc72 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -175,6 +175,13 @@ static int of_get_regulation_constraints(struct device *dev, if (!ret) constraints->enable_time = pval; + ret = of_property_read_u32(np, "regulator-uv-survival-time-ms", &pval); + if (!ret) + constraints->uv_less_critical_window_ms = pval; + else + constraints->uv_less_critical_window_ms = + REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS; + constraints->soft_start = of_property_read_bool(np, "regulator-soft-start"); ret = of_property_read_u32(np, "regulator-active-discharge", &pval); diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index e0ddfb5593c9..0cd76d264727 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -162,6 +162,13 @@ struct notification_limit { * regulator_active_discharge values are used for * initialisation. * @enable_time: Turn-on time of the rails (unit: microseconds) + * @uv_less_critical_window_ms: Specifies the time window (in milliseconds) + * following a critical under-voltage (UV) event + * during which less critical actions can be + * safely carried out by the system (for example + * logging). After this time window more critical + * actions should be done (for example prevent + * HW damage). */ struct regulation_constraints { @@ -213,6 +220,7 @@ struct regulation_constraints { unsigned int settling_time_up; unsigned int settling_time_down; unsigned int enable_time; + unsigned int uv_less_critical_window_ms; unsigned int active_discharge; From c986968fe92f20f2db26fa6bce27795b2e9ebe22 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 7 Nov 2023 20:09:18 +0100 Subject: [PATCH 0028/1562] regulator: core: Add option to prevent disabling unused regulators This may be useful for debugging and develompent purposes, when there are drivers that depend on regulators to be enabled but do not request them. It is inspired from the clk_ignore_unused and pd_ignore_unused parameters, that are used to keep firmware-enabled clocks and power domains on even if these are not used by drivers. The parameter is not expected to be used in normal cases and should not be needed on a platform with proper driver support. Signed-off-by: Javier Martinez Canillas Reviewed-by: Brian Masney Acked-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20231107190926.1185326-1-javierm@redhat.com Signed-off-by: Mark Brown --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ drivers/regulator/core.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3f..825159394da2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5544,6 +5544,13 @@ print every Nth verbose statement, where N is the value specified. + regulator_ignore_unused + [REGULATOR] + Prevents regulator framework from disabling regulators + that are unused, due no driver claiming them. This may + be useful for debug and development, but should not be + needed on a platform with proper driver support. + relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/admin-guide/cgroup-v1/cpusets.rst. diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 3137e40fcd3e..79777495cc3a 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -6234,6 +6234,14 @@ unlock: return 0; } +static bool regulator_ignore_unused; +static int __init regulator_ignore_unused_setup(char *__unused) +{ + regulator_ignore_unused = true; + return 1; +} +__setup("regulator_ignore_unused", regulator_ignore_unused_setup); + static void regulator_init_complete_work_function(struct work_struct *work) { /* @@ -6246,6 +6254,15 @@ static void regulator_init_complete_work_function(struct work_struct *work) class_for_each_device(®ulator_class, NULL, NULL, regulator_register_resolve_supply); + /* + * For debugging purposes, it may be useful to prevent unused + * regulators from being disabled. + */ + if (regulator_ignore_unused) { + pr_warn("regulator: Not disabling unused regulators\n"); + return; + } + /* If we have a full configuration then disable any regulators * we have permission to change the status for and which are * not in use or always_on. This is effectively the default From 7993d3a9c34f609c02171e115fd12c10e2105ff4 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Fri, 3 Nov 2023 15:42:31 +0800 Subject: [PATCH 0029/1562] regulator: core: Only increment use_count when enable_count changes The use_count of a regulator should only be incremented when the enable_count changes from 0 to 1. Similarly, the use_count should only be decremented when the enable_count changes from 1 to 0. In the previous implementation, use_count was sometimes decremented to 0 when some consumer called unbalanced disable, leading to unexpected disable even the regulator is enabled by other consumers. With this change, the use_count accurately reflects the number of users which the regulator is enabled. This should make things more robust in the case where a consumer does leak references. Signed-off-by: Rui Zhang Link: https://lore.kernel.org/r/20231103074231.8031-1-zr.zhang@vivo.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 52 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 79777495cc3a..00221ff369c2 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2918,7 +2918,8 @@ static int _regulator_enable(struct regulator *regulator) /* Fallthrough on positive return values - already enabled */ } - rdev->use_count++; + if (regulator->enable_count == 1) + rdev->use_count++; return 0; @@ -2993,37 +2994,40 @@ static int _regulator_disable(struct regulator *regulator) lockdep_assert_held_once(&rdev->mutex.base); - if (WARN(rdev->use_count <= 0, + if (WARN(regulator->enable_count == 0, "unbalanced disables for %s\n", rdev_get_name(rdev))) return -EIO; - /* are we the last user and permitted to disable ? */ - if (rdev->use_count == 1 && - (rdev->constraints && !rdev->constraints->always_on)) { + if (regulator->enable_count == 1) { + /* disabling last enable_count from this regulator */ + /* are we the last user and permitted to disable ? */ + if (rdev->use_count == 1 && + (rdev->constraints && !rdev->constraints->always_on)) { - /* we are last user */ - if (regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { - ret = _notifier_call_chain(rdev, - REGULATOR_EVENT_PRE_DISABLE, - NULL); - if (ret & NOTIFY_STOP_MASK) - return -EINVAL; + /* we are last user */ + if (regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { + ret = _notifier_call_chain(rdev, + REGULATOR_EVENT_PRE_DISABLE, + NULL); + if (ret & NOTIFY_STOP_MASK) + return -EINVAL; - ret = _regulator_do_disable(rdev); - if (ret < 0) { - rdev_err(rdev, "failed to disable: %pe\n", ERR_PTR(ret)); - _notifier_call_chain(rdev, - REGULATOR_EVENT_ABORT_DISABLE, + ret = _regulator_do_disable(rdev); + if (ret < 0) { + rdev_err(rdev, "failed to disable: %pe\n", ERR_PTR(ret)); + _notifier_call_chain(rdev, + REGULATOR_EVENT_ABORT_DISABLE, + NULL); + return ret; + } + _notifier_call_chain(rdev, REGULATOR_EVENT_DISABLE, NULL); - return ret; } - _notifier_call_chain(rdev, REGULATOR_EVENT_DISABLE, - NULL); - } - rdev->use_count = 0; - } else if (rdev->use_count > 1) { - rdev->use_count--; + rdev->use_count = 0; + } else if (rdev->use_count > 1) { + rdev->use_count--; + } } if (ret == 0) From 1fc2e768ff28f096e9fb6438f0d01c3851c7cd68 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 11 Nov 2023 19:53:30 +0000 Subject: [PATCH 0030/1562] regulator: palmas: remove redundant initialization of pointer pdata Pointer pdata is being initialized with a value that is never read. It is being re-assigned later on with the return from a devm_kzalloc call. Remove the redundant initialization, cleans up clang scan build warning: drivers/regulator/palmas-regulator.c:1597:36: warning: Value stored to 'pdata' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20231111195330.338324-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/palmas-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/palmas-regulator.c b/drivers/regulator/palmas-regulator.c index e0dc033aae0f..60656a815b9e 100644 --- a/drivers/regulator/palmas-regulator.c +++ b/drivers/regulator/palmas-regulator.c @@ -1594,7 +1594,7 @@ static const struct of_device_id of_palmas_match_tbl[] = { static int palmas_regulators_probe(struct platform_device *pdev) { struct palmas *palmas = dev_get_drvdata(pdev->dev.parent); - struct palmas_pmic_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct palmas_pmic_platform_data *pdata; struct device_node *node = pdev->dev.of_node; struct palmas_pmic_driver_data *driver_data; struct regulator_config config = { }; From e1eb745006ac484427fca14feb27d79a71c3770d Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 9 Nov 2023 15:39:25 +0800 Subject: [PATCH 0031/1562] regulator: stpmic1: Fix kernel-doc notation warnings No functional modification involved. drivers/regulator/stpmic1_regulator.c:31: warning: expecting prototype for struct stpmic1. Prototype was for struct stpmic1_regulator_cfg instead. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7206 Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20231109073925.98783-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Mark Brown --- drivers/regulator/stpmic1_regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/stpmic1_regulator.c b/drivers/regulator/stpmic1_regulator.c index 79d1a3eb18d4..a498df7cb016 100644 --- a/drivers/regulator/stpmic1_regulator.c +++ b/drivers/regulator/stpmic1_regulator.c @@ -15,7 +15,7 @@ #include /** - * struct stpmic1 regulator description: this structure is used as driver data + * struct stpmic1_regulator_cfg - this structure is used as driver data * @desc: regulator framework description * @mask_reset_reg: mask reset register address * @mask_reset_mask: mask rank and mask reset register mask From adde8a55daf640515edd78b7ac5f3293c3960b8e Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 15:37:18 -0400 Subject: [PATCH 0032/1562] spi: stm32: rename stm32f4_* to stm32fx_* The STM32F4 and STM32F7 SPI peripherals are very similar, therefore most of the driver can be shared between the two. In preparation for adding support for the F7, change all functions and defines to use a generic stm32fx prefix, except for code and registers that differ between the two devices. Signed-off-by: Ben Wolsieffer Link: https://lore.kernel.org/r/20231102193722.3042245-2-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 282 ++++++++++++++++++++-------------------- 1 file changed, 141 insertions(+), 141 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index e6e3e4ea29f9..02d1409d7229 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -22,58 +22,58 @@ #define DRIVER_NAME "spi_stm32" -/* STM32F4 SPI registers */ -#define STM32F4_SPI_CR1 0x00 -#define STM32F4_SPI_CR2 0x04 -#define STM32F4_SPI_SR 0x08 -#define STM32F4_SPI_DR 0x0C -#define STM32F4_SPI_I2SCFGR 0x1C +/* STM32F4/7 SPI registers */ +#define STM32FX_SPI_CR1 0x00 +#define STM32FX_SPI_CR2 0x04 +#define STM32FX_SPI_SR 0x08 +#define STM32FX_SPI_DR 0x0C +#define STM32FX_SPI_I2SCFGR 0x1C -/* STM32F4_SPI_CR1 bit fields */ -#define STM32F4_SPI_CR1_CPHA BIT(0) -#define STM32F4_SPI_CR1_CPOL BIT(1) -#define STM32F4_SPI_CR1_MSTR BIT(2) -#define STM32F4_SPI_CR1_BR_SHIFT 3 -#define STM32F4_SPI_CR1_BR GENMASK(5, 3) -#define STM32F4_SPI_CR1_SPE BIT(6) -#define STM32F4_SPI_CR1_LSBFRST BIT(7) -#define STM32F4_SPI_CR1_SSI BIT(8) -#define STM32F4_SPI_CR1_SSM BIT(9) -#define STM32F4_SPI_CR1_RXONLY BIT(10) +/* STM32FX_SPI_CR1 bit fields */ +#define STM32FX_SPI_CR1_CPHA BIT(0) +#define STM32FX_SPI_CR1_CPOL BIT(1) +#define STM32FX_SPI_CR1_MSTR BIT(2) +#define STM32FX_SPI_CR1_BR_SHIFT 3 +#define STM32FX_SPI_CR1_BR GENMASK(5, 3) +#define STM32FX_SPI_CR1_SPE BIT(6) +#define STM32FX_SPI_CR1_LSBFRST BIT(7) +#define STM32FX_SPI_CR1_SSI BIT(8) +#define STM32FX_SPI_CR1_SSM BIT(9) +#define STM32FX_SPI_CR1_RXONLY BIT(10) #define STM32F4_SPI_CR1_DFF BIT(11) -#define STM32F4_SPI_CR1_CRCNEXT BIT(12) -#define STM32F4_SPI_CR1_CRCEN BIT(13) -#define STM32F4_SPI_CR1_BIDIOE BIT(14) -#define STM32F4_SPI_CR1_BIDIMODE BIT(15) -#define STM32F4_SPI_CR1_BR_MIN 0 -#define STM32F4_SPI_CR1_BR_MAX (GENMASK(5, 3) >> 3) +#define STM32FX_SPI_CR1_CRCNEXT BIT(12) +#define STM32FX_SPI_CR1_CRCEN BIT(13) +#define STM32FX_SPI_CR1_BIDIOE BIT(14) +#define STM32FX_SPI_CR1_BIDIMODE BIT(15) +#define STM32FX_SPI_CR1_BR_MIN 0 +#define STM32FX_SPI_CR1_BR_MAX (GENMASK(5, 3) >> 3) -/* STM32F4_SPI_CR2 bit fields */ -#define STM32F4_SPI_CR2_RXDMAEN BIT(0) -#define STM32F4_SPI_CR2_TXDMAEN BIT(1) -#define STM32F4_SPI_CR2_SSOE BIT(2) -#define STM32F4_SPI_CR2_FRF BIT(4) -#define STM32F4_SPI_CR2_ERRIE BIT(5) -#define STM32F4_SPI_CR2_RXNEIE BIT(6) -#define STM32F4_SPI_CR2_TXEIE BIT(7) +/* STM32FX_SPI_CR2 bit fields */ +#define STM32FX_SPI_CR2_RXDMAEN BIT(0) +#define STM32FX_SPI_CR2_TXDMAEN BIT(1) +#define STM32FX_SPI_CR2_SSOE BIT(2) +#define STM32FX_SPI_CR2_FRF BIT(4) +#define STM32FX_SPI_CR2_ERRIE BIT(5) +#define STM32FX_SPI_CR2_RXNEIE BIT(6) +#define STM32FX_SPI_CR2_TXEIE BIT(7) -/* STM32F4_SPI_SR bit fields */ -#define STM32F4_SPI_SR_RXNE BIT(0) -#define STM32F4_SPI_SR_TXE BIT(1) -#define STM32F4_SPI_SR_CHSIDE BIT(2) -#define STM32F4_SPI_SR_UDR BIT(3) -#define STM32F4_SPI_SR_CRCERR BIT(4) -#define STM32F4_SPI_SR_MODF BIT(5) -#define STM32F4_SPI_SR_OVR BIT(6) -#define STM32F4_SPI_SR_BSY BIT(7) -#define STM32F4_SPI_SR_FRE BIT(8) +/* STM32FX_SPI_SR bit fields */ +#define STM32FX_SPI_SR_RXNE BIT(0) +#define STM32FX_SPI_SR_TXE BIT(1) +#define STM32FX_SPI_SR_CHSIDE BIT(2) +#define STM32FX_SPI_SR_UDR BIT(3) +#define STM32FX_SPI_SR_CRCERR BIT(4) +#define STM32FX_SPI_SR_MODF BIT(5) +#define STM32FX_SPI_SR_OVR BIT(6) +#define STM32FX_SPI_SR_BSY BIT(7) +#define STM32FX_SPI_SR_FRE BIT(8) -/* STM32F4_SPI_I2SCFGR bit fields */ -#define STM32F4_SPI_I2SCFGR_I2SMOD BIT(11) +/* STM32FX_SPI_I2SCFGR bit fields */ +#define STM32FX_SPI_I2SCFGR_I2SMOD BIT(11) /* STM32F4 SPI Baud Rate min/max divisor */ -#define STM32F4_SPI_BR_DIV_MIN (2 << STM32F4_SPI_CR1_BR_MIN) -#define STM32F4_SPI_BR_DIV_MAX (2 << STM32F4_SPI_CR1_BR_MAX) +#define STM32FX_SPI_BR_DIV_MIN (2 << STM32FX_SPI_CR1_BR_MIN) +#define STM32FX_SPI_BR_DIV_MAX (2 << STM32FX_SPI_CR1_BR_MAX) /* STM32H7 SPI registers */ #define STM32H7_SPI_CR1 0x00 @@ -324,20 +324,20 @@ struct stm32_spi { bool device_mode; }; -static const struct stm32_spi_regspec stm32f4_spi_regspec = { - .en = { STM32F4_SPI_CR1, STM32F4_SPI_CR1_SPE }, +static const struct stm32_spi_regspec stm32fx_spi_regspec = { + .en = { STM32FX_SPI_CR1, STM32FX_SPI_CR1_SPE }, - .dma_rx_en = { STM32F4_SPI_CR2, STM32F4_SPI_CR2_RXDMAEN }, - .dma_tx_en = { STM32F4_SPI_CR2, STM32F4_SPI_CR2_TXDMAEN }, + .dma_rx_en = { STM32FX_SPI_CR2, STM32FX_SPI_CR2_RXDMAEN }, + .dma_tx_en = { STM32FX_SPI_CR2, STM32FX_SPI_CR2_TXDMAEN }, - .cpol = { STM32F4_SPI_CR1, STM32F4_SPI_CR1_CPOL }, - .cpha = { STM32F4_SPI_CR1, STM32F4_SPI_CR1_CPHA }, - .lsb_first = { STM32F4_SPI_CR1, STM32F4_SPI_CR1_LSBFRST }, + .cpol = { STM32FX_SPI_CR1, STM32FX_SPI_CR1_CPOL }, + .cpha = { STM32FX_SPI_CR1, STM32FX_SPI_CR1_CPHA }, + .lsb_first = { STM32FX_SPI_CR1, STM32FX_SPI_CR1_LSBFRST }, .cs_high = {}, - .br = { STM32F4_SPI_CR1, STM32F4_SPI_CR1_BR, STM32F4_SPI_CR1_BR_SHIFT }, + .br = { STM32FX_SPI_CR1, STM32FX_SPI_CR1_BR, STM32FX_SPI_CR1_BR_SHIFT }, - .rx = { STM32F4_SPI_DR }, - .tx = { STM32F4_SPI_DR }, + .rx = { STM32FX_SPI_DR }, + .tx = { STM32FX_SPI_DR }, }; static const struct stm32_spi_regspec stm32h7_spi_regspec = { @@ -502,19 +502,19 @@ static u32 stm32h7_spi_prepare_fthlv(struct stm32_spi *spi, u32 xfer_len) */ static void stm32f4_spi_write_tx(struct stm32_spi *spi) { - if ((spi->tx_len > 0) && (readl_relaxed(spi->base + STM32F4_SPI_SR) & - STM32F4_SPI_SR_TXE)) { + if ((spi->tx_len > 0) && (readl_relaxed(spi->base + STM32FX_SPI_SR) & + STM32FX_SPI_SR_TXE)) { u32 offs = spi->cur_xferlen - spi->tx_len; if (spi->cur_bpw == 16) { const u16 *tx_buf16 = (const u16 *)(spi->tx_buf + offs); - writew_relaxed(*tx_buf16, spi->base + STM32F4_SPI_DR); + writew_relaxed(*tx_buf16, spi->base + STM32FX_SPI_DR); spi->tx_len -= sizeof(u16); } else { const u8 *tx_buf8 = (const u8 *)(spi->tx_buf + offs); - writeb_relaxed(*tx_buf8, spi->base + STM32F4_SPI_DR); + writeb_relaxed(*tx_buf8, spi->base + STM32FX_SPI_DR); spi->tx_len -= sizeof(u8); } } @@ -566,19 +566,19 @@ static void stm32h7_spi_write_txfifo(struct stm32_spi *spi) */ static void stm32f4_spi_read_rx(struct stm32_spi *spi) { - if ((spi->rx_len > 0) && (readl_relaxed(spi->base + STM32F4_SPI_SR) & - STM32F4_SPI_SR_RXNE)) { + if ((spi->rx_len > 0) && (readl_relaxed(spi->base + STM32FX_SPI_SR) & + STM32FX_SPI_SR_RXNE)) { u32 offs = spi->cur_xferlen - spi->rx_len; if (spi->cur_bpw == 16) { u16 *rx_buf16 = (u16 *)(spi->rx_buf + offs); - *rx_buf16 = readw_relaxed(spi->base + STM32F4_SPI_DR); + *rx_buf16 = readw_relaxed(spi->base + STM32FX_SPI_DR); spi->rx_len -= sizeof(u16); } else { u8 *rx_buf8 = (u8 *)(spi->rx_buf + offs); - *rx_buf8 = readb_relaxed(spi->base + STM32F4_SPI_DR); + *rx_buf8 = readb_relaxed(spi->base + STM32FX_SPI_DR); spi->rx_len -= sizeof(u8); } } @@ -645,10 +645,10 @@ static void stm32_spi_enable(struct stm32_spi *spi) } /** - * stm32f4_spi_disable - Disable SPI controller + * stm32fx_spi_disable - Disable SPI controller * @spi: pointer to the spi controller data structure */ -static void stm32f4_spi_disable(struct stm32_spi *spi) +static void stm32fx_spi_disable(struct stm32_spi *spi) { unsigned long flags; u32 sr; @@ -657,20 +657,20 @@ static void stm32f4_spi_disable(struct stm32_spi *spi) spin_lock_irqsave(&spi->lock, flags); - if (!(readl_relaxed(spi->base + STM32F4_SPI_CR1) & - STM32F4_SPI_CR1_SPE)) { + if (!(readl_relaxed(spi->base + STM32FX_SPI_CR1) & + STM32FX_SPI_CR1_SPE)) { spin_unlock_irqrestore(&spi->lock, flags); return; } /* Disable interrupts */ - stm32_spi_clr_bits(spi, STM32F4_SPI_CR2, STM32F4_SPI_CR2_TXEIE | - STM32F4_SPI_CR2_RXNEIE | - STM32F4_SPI_CR2_ERRIE); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_TXEIE | + STM32FX_SPI_CR2_RXNEIE | + STM32FX_SPI_CR2_ERRIE); /* Wait until BSY = 0 */ - if (readl_relaxed_poll_timeout_atomic(spi->base + STM32F4_SPI_SR, - sr, !(sr & STM32F4_SPI_SR_BSY), + if (readl_relaxed_poll_timeout_atomic(spi->base + STM32FX_SPI_SR, + sr, !(sr & STM32FX_SPI_SR_BSY), 10, 100000) < 0) { dev_warn(spi->dev, "disabling condition timeout\n"); } @@ -680,14 +680,14 @@ static void stm32f4_spi_disable(struct stm32_spi *spi) if (spi->cur_usedma && spi->dma_rx) dmaengine_terminate_async(spi->dma_rx); - stm32_spi_clr_bits(spi, STM32F4_SPI_CR1, STM32F4_SPI_CR1_SPE); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR1, STM32FX_SPI_CR1_SPE); - stm32_spi_clr_bits(spi, STM32F4_SPI_CR2, STM32F4_SPI_CR2_TXDMAEN | - STM32F4_SPI_CR2_RXDMAEN); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_TXDMAEN | + STM32FX_SPI_CR2_RXDMAEN); /* Sequence to clear OVR flag */ - readl_relaxed(spi->base + STM32F4_SPI_DR); - readl_relaxed(spi->base + STM32F4_SPI_SR); + readl_relaxed(spi->base + STM32FX_SPI_DR); + readl_relaxed(spi->base + STM32FX_SPI_SR); spin_unlock_irqrestore(&spi->lock, flags); } @@ -763,11 +763,11 @@ static bool stm32_spi_can_dma(struct spi_controller *ctrl, } /** - * stm32f4_spi_irq_event - Interrupt handler for SPI controller events + * stm32fx_spi_irq_event - Interrupt handler for SPI controller events * @irq: interrupt line * @dev_id: SPI controller ctrl interface */ -static irqreturn_t stm32f4_spi_irq_event(int irq, void *dev_id) +static irqreturn_t stm32fx_spi_irq_event(int irq, void *dev_id) { struct spi_controller *ctrl = dev_id; struct stm32_spi *spi = spi_controller_get_devdata(ctrl); @@ -776,26 +776,26 @@ static irqreturn_t stm32f4_spi_irq_event(int irq, void *dev_id) spin_lock(&spi->lock); - sr = readl_relaxed(spi->base + STM32F4_SPI_SR); + sr = readl_relaxed(spi->base + STM32FX_SPI_SR); /* * BSY flag is not handled in interrupt but it is normal behavior when * this flag is set. */ - sr &= ~STM32F4_SPI_SR_BSY; + sr &= ~STM32FX_SPI_SR_BSY; if (!spi->cur_usedma && (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX)) { /* OVR flag shouldn't be handled for TX only mode */ - sr &= ~(STM32F4_SPI_SR_OVR | STM32F4_SPI_SR_RXNE); - mask |= STM32F4_SPI_SR_TXE; + sr &= ~(STM32FX_SPI_SR_OVR | STM32FX_SPI_SR_RXNE); + mask |= STM32FX_SPI_SR_TXE; } if (!spi->cur_usedma && (spi->cur_comm == SPI_FULL_DUPLEX || spi->cur_comm == SPI_SIMPLEX_RX || spi->cur_comm == SPI_3WIRE_RX)) { /* TXE flag is set and is handled when RXNE flag occurs */ - sr &= ~STM32F4_SPI_SR_TXE; - mask |= STM32F4_SPI_SR_RXNE | STM32F4_SPI_SR_OVR; + sr &= ~STM32FX_SPI_SR_TXE; + mask |= STM32FX_SPI_SR_RXNE | STM32FX_SPI_SR_OVR; } if (!(sr & mask)) { @@ -804,12 +804,12 @@ static irqreturn_t stm32f4_spi_irq_event(int irq, void *dev_id) return IRQ_NONE; } - if (sr & STM32F4_SPI_SR_OVR) { + if (sr & STM32FX_SPI_SR_OVR) { dev_warn(spi->dev, "Overrun: received value discarded\n"); /* Sequence to clear OVR flag */ - readl_relaxed(spi->base + STM32F4_SPI_DR); - readl_relaxed(spi->base + STM32F4_SPI_SR); + readl_relaxed(spi->base + STM32FX_SPI_DR); + readl_relaxed(spi->base + STM32FX_SPI_SR); /* * If overrun is detected, it means that something went wrong, @@ -820,14 +820,14 @@ static irqreturn_t stm32f4_spi_irq_event(int irq, void *dev_id) goto end_irq; } - if (sr & STM32F4_SPI_SR_TXE) { + if (sr & STM32FX_SPI_SR_TXE) { if (spi->tx_buf) stm32f4_spi_write_tx(spi); if (spi->tx_len == 0) end = true; } - if (sr & STM32F4_SPI_SR_RXNE) { + if (sr & STM32FX_SPI_SR_RXNE) { stm32f4_spi_read_rx(spi); if (spi->rx_len == 0) end = true; @@ -838,10 +838,10 @@ static irqreturn_t stm32f4_spi_irq_event(int irq, void *dev_id) end_irq: if (end) { /* Immediately disable interrupts to do not generate new one */ - stm32_spi_clr_bits(spi, STM32F4_SPI_CR2, - STM32F4_SPI_CR2_TXEIE | - STM32F4_SPI_CR2_RXNEIE | - STM32F4_SPI_CR2_ERRIE); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR2, + STM32FX_SPI_CR2_TXEIE | + STM32FX_SPI_CR2_RXNEIE | + STM32FX_SPI_CR2_ERRIE); spin_unlock(&spi->lock); return IRQ_WAKE_THREAD; } @@ -851,17 +851,17 @@ end_irq: } /** - * stm32f4_spi_irq_thread - Thread of interrupt handler for SPI controller + * stm32fx_spi_irq_thread - Thread of interrupt handler for SPI controller * @irq: interrupt line * @dev_id: SPI controller interface */ -static irqreturn_t stm32f4_spi_irq_thread(int irq, void *dev_id) +static irqreturn_t stm32fx_spi_irq_thread(int irq, void *dev_id) { struct spi_controller *ctrl = dev_id; struct stm32_spi *spi = spi_controller_get_devdata(ctrl); spi_finalize_current_transfer(ctrl); - stm32f4_spi_disable(spi); + stm32fx_spi_disable(spi); return IRQ_HANDLED; } @@ -1034,18 +1034,18 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, } /** - * stm32f4_spi_dma_tx_cb - dma callback + * stm32fx_spi_dma_tx_cb - dma callback * @data: pointer to the spi controller data structure * * DMA callback is called when the transfer is complete for DMA TX channel. */ -static void stm32f4_spi_dma_tx_cb(void *data) +static void stm32fx_spi_dma_tx_cb(void *data) { struct stm32_spi *spi = data; if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) { spi_finalize_current_transfer(spi->ctrl); - stm32f4_spi_disable(spi); + stm32fx_spi_disable(spi); } } @@ -1114,21 +1114,21 @@ static void stm32_spi_dma_config(struct stm32_spi *spi, } /** - * stm32f4_spi_transfer_one_irq - transfer a single spi_transfer using + * stm32fx_spi_transfer_one_irq - transfer a single spi_transfer using * interrupts * @spi: pointer to the spi controller data structure * * It must returns 0 if the transfer is finished or 1 if the transfer is still * in progress. */ -static int stm32f4_spi_transfer_one_irq(struct stm32_spi *spi) +static int stm32fx_spi_transfer_one_irq(struct stm32_spi *spi) { unsigned long flags; u32 cr2 = 0; /* Enable the interrupts relative to the current communication mode */ if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) { - cr2 |= STM32F4_SPI_CR2_TXEIE; + cr2 |= STM32FX_SPI_CR2_TXEIE; } else if (spi->cur_comm == SPI_FULL_DUPLEX || spi->cur_comm == SPI_SIMPLEX_RX || spi->cur_comm == SPI_3WIRE_RX) { @@ -1136,14 +1136,14 @@ static int stm32f4_spi_transfer_one_irq(struct stm32_spi *spi) * since the received data are never read. Therefore set OVR * interrupt only when rx buffer is available. */ - cr2 |= STM32F4_SPI_CR2_RXNEIE | STM32F4_SPI_CR2_ERRIE; + cr2 |= STM32FX_SPI_CR2_RXNEIE | STM32FX_SPI_CR2_ERRIE; } else { return -EINVAL; } spin_lock_irqsave(&spi->lock, flags); - stm32_spi_set_bits(spi, STM32F4_SPI_CR2, cr2); + stm32_spi_set_bits(spi, STM32FX_SPI_CR2, cr2); stm32_spi_enable(spi); @@ -1200,11 +1200,11 @@ static int stm32h7_spi_transfer_one_irq(struct stm32_spi *spi) } /** - * stm32f4_spi_transfer_one_dma_start - Set SPI driver registers to start + * stm32fx_spi_transfer_one_dma_start - Set SPI driver registers to start * transfer using DMA * @spi: pointer to the spi controller data structure */ -static void stm32f4_spi_transfer_one_dma_start(struct stm32_spi *spi) +static void stm32fx_spi_transfer_one_dma_start(struct stm32_spi *spi) { /* In DMA mode end of transfer is handled by DMA TX or RX callback. */ if (spi->cur_comm == SPI_SIMPLEX_RX || spi->cur_comm == SPI_3WIRE_RX || @@ -1214,7 +1214,7 @@ static void stm32f4_spi_transfer_one_dma_start(struct stm32_spi *spi) * since the received data are never read. Therefore set OVR * interrupt only when rx buffer is available. */ - stm32_spi_set_bits(spi, STM32F4_SPI_CR2, STM32F4_SPI_CR2_ERRIE); + stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_ERRIE); } stm32_spi_enable(spi); @@ -1353,9 +1353,9 @@ dma_desc_error: static void stm32f4_spi_set_bpw(struct stm32_spi *spi) { if (spi->cur_bpw == 16) - stm32_spi_set_bits(spi, STM32F4_SPI_CR1, STM32F4_SPI_CR1_DFF); + stm32_spi_set_bits(spi, STM32FX_SPI_CR1, STM32F4_SPI_CR1_DFF); else - stm32_spi_clr_bits(spi, STM32F4_SPI_CR1, STM32F4_SPI_CR1_DFF); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR1, STM32F4_SPI_CR1_DFF); } /** @@ -1433,26 +1433,26 @@ static unsigned int stm32_spi_communication_type(struct spi_device *spi_dev, } /** - * stm32f4_spi_set_mode - configure communication mode + * stm32fx_spi_set_mode - configure communication mode * @spi: pointer to the spi controller data structure * @comm_type: type of communication to configure */ -static int stm32f4_spi_set_mode(struct stm32_spi *spi, unsigned int comm_type) +static int stm32fx_spi_set_mode(struct stm32_spi *spi, unsigned int comm_type) { if (comm_type == SPI_3WIRE_TX || comm_type == SPI_SIMPLEX_TX) { - stm32_spi_set_bits(spi, STM32F4_SPI_CR1, - STM32F4_SPI_CR1_BIDIMODE | - STM32F4_SPI_CR1_BIDIOE); + stm32_spi_set_bits(spi, STM32FX_SPI_CR1, + STM32FX_SPI_CR1_BIDIMODE | + STM32FX_SPI_CR1_BIDIOE); } else if (comm_type == SPI_FULL_DUPLEX || comm_type == SPI_SIMPLEX_RX) { - stm32_spi_clr_bits(spi, STM32F4_SPI_CR1, - STM32F4_SPI_CR1_BIDIMODE | - STM32F4_SPI_CR1_BIDIOE); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR1, + STM32FX_SPI_CR1_BIDIMODE | + STM32FX_SPI_CR1_BIDIOE); } else if (comm_type == SPI_3WIRE_RX) { - stm32_spi_set_bits(spi, STM32F4_SPI_CR1, - STM32F4_SPI_CR1_BIDIMODE); - stm32_spi_clr_bits(spi, STM32F4_SPI_CR1, - STM32F4_SPI_CR1_BIDIOE); + stm32_spi_set_bits(spi, STM32FX_SPI_CR1, + STM32FX_SPI_CR1_BIDIMODE); + stm32_spi_clr_bits(spi, STM32FX_SPI_CR1, + STM32FX_SPI_CR1_BIDIOE); } else { return -EINVAL; } @@ -1672,18 +1672,18 @@ static int stm32_spi_unprepare_msg(struct spi_controller *ctrl, } /** - * stm32f4_spi_config - Configure SPI controller as SPI master + * stm32fx_spi_config - Configure SPI controller as SPI master * @spi: pointer to the spi controller data structure */ -static int stm32f4_spi_config(struct stm32_spi *spi) +static int stm32fx_spi_config(struct stm32_spi *spi) { unsigned long flags; spin_lock_irqsave(&spi->lock, flags); /* Ensure I2SMOD bit is kept cleared */ - stm32_spi_clr_bits(spi, STM32F4_SPI_I2SCFGR, - STM32F4_SPI_I2SCFGR_I2SMOD); + stm32_spi_clr_bits(spi, STM32FX_SPI_I2SCFGR, + STM32FX_SPI_I2SCFGR_I2SMOD); /* * - SS input value high @@ -1692,10 +1692,10 @@ static int stm32f4_spi_config(struct stm32_spi *spi) * - Consider 1 master/n slaves configuration and * SS input value is determined by the SSI bit */ - stm32_spi_set_bits(spi, STM32F4_SPI_CR1, STM32F4_SPI_CR1_SSI | - STM32F4_SPI_CR1_BIDIOE | - STM32F4_SPI_CR1_MSTR | - STM32F4_SPI_CR1_SSM); + stm32_spi_set_bits(spi, STM32FX_SPI_CR1, STM32FX_SPI_CR1_SSI | + STM32FX_SPI_CR1_BIDIOE | + STM32FX_SPI_CR1_MSTR | + STM32FX_SPI_CR1_SSM); spin_unlock_irqrestore(&spi->lock, flags); @@ -1746,20 +1746,20 @@ static int stm32h7_spi_config(struct stm32_spi *spi) } static const struct stm32_spi_cfg stm32f4_spi_cfg = { - .regs = &stm32f4_spi_regspec, + .regs = &stm32fx_spi_regspec, .get_bpw_mask = stm32f4_spi_get_bpw_mask, - .disable = stm32f4_spi_disable, - .config = stm32f4_spi_config, + .disable = stm32fx_spi_disable, + .config = stm32fx_spi_config, .set_bpw = stm32f4_spi_set_bpw, - .set_mode = stm32f4_spi_set_mode, - .transfer_one_dma_start = stm32f4_spi_transfer_one_dma_start, - .dma_tx_cb = stm32f4_spi_dma_tx_cb, + .set_mode = stm32fx_spi_set_mode, + .transfer_one_dma_start = stm32fx_spi_transfer_one_dma_start, + .dma_tx_cb = stm32fx_spi_dma_tx_cb, .dma_rx_cb = stm32_spi_dma_rx_cb, - .transfer_one_irq = stm32f4_spi_transfer_one_irq, - .irq_handler_event = stm32f4_spi_irq_event, - .irq_handler_thread = stm32f4_spi_irq_thread, - .baud_rate_div_min = STM32F4_SPI_BR_DIV_MIN, - .baud_rate_div_max = STM32F4_SPI_BR_DIV_MAX, + .transfer_one_irq = stm32fx_spi_transfer_one_irq, + .irq_handler_event = stm32fx_spi_irq_event, + .irq_handler_thread = stm32fx_spi_irq_thread, + .baud_rate_div_min = STM32FX_SPI_BR_DIV_MIN, + .baud_rate_div_max = STM32FX_SPI_BR_DIV_MAX, .has_fifo = false, .has_device_mode = false, .flags = SPI_CONTROLLER_MUST_TX, From 247ba5ea058290824862902f7ee64c20a744c461 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 15:37:19 -0400 Subject: [PATCH 0033/1562] spi: stm32: use callbacks for read_rx and write_tx The STM32F7 will require different read and write routines, so make these functions into configurable callbacks. Signed-off-by: Ben Wolsieffer Link: https://lore.kernel.org/r/20231102193722.3042245-3-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 02d1409d7229..427788d18532 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -229,6 +229,8 @@ struct stm32_spi; * time between frames (if driver has this functionality) * @set_number_of_data: optional routine to configure registers to desired * number of data (if driver has this functionality) + * @write_tx: routine to write to transmit register/FIFO + * @read_rx: routine to read from receive register/FIFO * @transfer_one_dma_start: routine to start transfer a single spi_transfer * using DMA * @dma_rx_cb: routine to call after DMA RX channel operation is complete @@ -252,6 +254,8 @@ struct stm32_spi_cfg { int (*set_mode)(struct stm32_spi *spi, unsigned int comm_type); void (*set_data_idleness)(struct stm32_spi *spi, u32 length); int (*set_number_of_data)(struct stm32_spi *spi, u32 length); + void (*write_tx)(struct stm32_spi *spi); + void (*read_rx)(struct stm32_spi *spi); void (*transfer_one_dma_start)(struct stm32_spi *spi); void (*dma_rx_cb)(void *data); void (*dma_tx_cb)(void *data); @@ -822,17 +826,17 @@ static irqreturn_t stm32fx_spi_irq_event(int irq, void *dev_id) if (sr & STM32FX_SPI_SR_TXE) { if (spi->tx_buf) - stm32f4_spi_write_tx(spi); + spi->cfg->write_tx(spi); if (spi->tx_len == 0) end = true; } if (sr & STM32FX_SPI_SR_RXNE) { - stm32f4_spi_read_rx(spi); + spi->cfg->read_rx(spi); if (spi->rx_len == 0) end = true; else if (spi->tx_buf)/* Load data for discontinuous mode */ - stm32f4_spi_write_tx(spi); + spi->cfg->write_tx(spi); } end_irq: @@ -1149,7 +1153,7 @@ static int stm32fx_spi_transfer_one_irq(struct stm32_spi *spi) /* starting data transfer when buffer is loaded */ if (spi->tx_buf) - stm32f4_spi_write_tx(spi); + spi->cfg->write_tx(spi); spin_unlock_irqrestore(&spi->lock, flags); @@ -1752,6 +1756,8 @@ static const struct stm32_spi_cfg stm32f4_spi_cfg = { .config = stm32fx_spi_config, .set_bpw = stm32f4_spi_set_bpw, .set_mode = stm32fx_spi_set_mode, + .write_tx = stm32f4_spi_write_tx, + .read_rx = stm32f4_spi_read_rx, .transfer_one_dma_start = stm32fx_spi_transfer_one_dma_start, .dma_tx_cb = stm32fx_spi_dma_tx_cb, .dma_rx_cb = stm32_spi_dma_rx_cb, @@ -1775,6 +1781,8 @@ static const struct stm32_spi_cfg stm32h7_spi_cfg = { .set_mode = stm32h7_spi_set_mode, .set_data_idleness = stm32h7_spi_data_idleness, .set_number_of_data = stm32h7_spi_number_of_data, + .write_tx = stm32h7_spi_write_txfifo, + .read_rx = stm32h7_spi_read_rxfifo, .transfer_one_dma_start = stm32h7_spi_transfer_one_dma_start, .dma_rx_cb = stm32_spi_dma_rx_cb, /* From a84dcb410b5f928899a53ba79ec71108700872d6 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 15:37:21 -0400 Subject: [PATCH 0034/1562] spi: stm32: add STM32F7 support The STM32F7 SPI peripheral is similar to the STM32F4, except it allows arbitrary word lengths between 4 and 16 bits, and has a small 32-bit FIFO that allows two 8-bit or smaller words to be transferred with a single 16-bit read/write. Signed-off-by: Ben Wolsieffer Link: https://lore.kernel.org/r/20231102193722.3042245-5-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 149 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 427788d18532..94df3836834c 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -41,6 +41,7 @@ #define STM32FX_SPI_CR1_SSM BIT(9) #define STM32FX_SPI_CR1_RXONLY BIT(10) #define STM32F4_SPI_CR1_DFF BIT(11) +#define STM32F7_SPI_CR1_CRCL BIT(11) #define STM32FX_SPI_CR1_CRCNEXT BIT(12) #define STM32FX_SPI_CR1_CRCEN BIT(13) #define STM32FX_SPI_CR1_BIDIOE BIT(14) @@ -56,6 +57,10 @@ #define STM32FX_SPI_CR2_ERRIE BIT(5) #define STM32FX_SPI_CR2_RXNEIE BIT(6) #define STM32FX_SPI_CR2_TXEIE BIT(7) +#define STM32F7_SPI_CR2_DS GENMASK(11, 8) +#define STM32F7_SPI_CR2_FRXTH BIT(12) +#define STM32F7_SPI_CR2_LDMA_RX BIT(13) +#define STM32F7_SPI_CR2_LDMA_TX BIT(14) /* STM32FX_SPI_SR bit fields */ #define STM32FX_SPI_SR_RXNE BIT(0) @@ -67,6 +72,8 @@ #define STM32FX_SPI_SR_OVR BIT(6) #define STM32FX_SPI_SR_BSY BIT(7) #define STM32FX_SPI_SR_FRE BIT(8) +#define STM32F7_SPI_SR_FRLVL GENMASK(10, 9) +#define STM32F7_SPI_SR_FTLVL GENMASK(12, 11) /* STM32FX_SPI_I2SCFGR bit fields */ #define STM32FX_SPI_I2SCFGR_I2SMOD BIT(11) @@ -413,6 +420,16 @@ static int stm32f4_spi_get_bpw_mask(struct stm32_spi *spi) return SPI_BPW_MASK(8) | SPI_BPW_MASK(16); } +/** + * stm32f7_spi_get_bpw_mask - Return bits per word mask + * @spi: pointer to the spi controller data structure + */ +static int stm32f7_spi_get_bpw_mask(struct stm32_spi *spi) +{ + dev_dbg(spi->dev, "16-bit maximum data frame\n"); + return SPI_BPW_RANGE_MASK(4, 16); +} + /** * stm32h7_spi_get_bpw_mask - Return bits per word mask * @spi: pointer to the spi controller data structure @@ -526,6 +543,35 @@ static void stm32f4_spi_write_tx(struct stm32_spi *spi) dev_dbg(spi->dev, "%s: %d bytes left\n", __func__, spi->tx_len); } +/** + * stm32f7_spi_write_tx - Write bytes to Transmit Data Register + * @spi: pointer to the spi controller data structure + * + * Read from tx_buf depends on remaining bytes to avoid to read beyond + * tx_buf end. + */ +static void stm32f7_spi_write_tx(struct stm32_spi *spi) +{ + if ((spi->tx_len > 0) && (readl_relaxed(spi->base + STM32FX_SPI_SR) & + STM32FX_SPI_SR_TXE)) { + u32 offs = spi->cur_xferlen - spi->tx_len; + + if (spi->tx_len >= sizeof(u16)) { + const u16 *tx_buf16 = (const u16 *)(spi->tx_buf + offs); + + writew_relaxed(*tx_buf16, spi->base + STM32FX_SPI_DR); + spi->tx_len -= sizeof(u16); + } else { + const u8 *tx_buf8 = (const u8 *)(spi->tx_buf + offs); + + writeb_relaxed(*tx_buf8, spi->base + STM32FX_SPI_DR); + spi->tx_len -= sizeof(u8); + } + } + + dev_dbg(spi->dev, "%s: %d bytes left\n", __func__, spi->tx_len); +} + /** * stm32h7_spi_write_txfifo - Write bytes in Transmit Data Register * @spi: pointer to the spi controller data structure @@ -590,6 +636,46 @@ static void stm32f4_spi_read_rx(struct stm32_spi *spi) dev_dbg(spi->dev, "%s: %d bytes left\n", __func__, spi->rx_len); } +/** + * stm32f7_spi_read_rx - Read bytes from Receive Data Register + * @spi: pointer to the spi controller data structure + * + * Write in rx_buf depends on remaining bytes to avoid to write beyond + * rx_buf end. + */ +static void stm32f7_spi_read_rx(struct stm32_spi *spi) +{ + u32 sr = readl_relaxed(spi->base + STM32FX_SPI_SR); + u32 frlvl = FIELD_GET(STM32F7_SPI_SR_FRLVL, sr); + + while ((spi->rx_len > 0) && (frlvl > 0)) { + u32 offs = spi->cur_xferlen - spi->rx_len; + + if ((spi->rx_len >= sizeof(u16)) && (frlvl >= 2)) { + u16 *rx_buf16 = (u16 *)(spi->rx_buf + offs); + + *rx_buf16 = readw_relaxed(spi->base + STM32FX_SPI_DR); + spi->rx_len -= sizeof(u16); + } else { + u8 *rx_buf8 = (u8 *)(spi->rx_buf + offs); + + *rx_buf8 = readb_relaxed(spi->base + STM32FX_SPI_DR); + spi->rx_len -= sizeof(u8); + } + + sr = readl_relaxed(spi->base + STM32FX_SPI_SR); + frlvl = FIELD_GET(STM32F7_SPI_SR_FRLVL, sr); + } + + if (spi->rx_len >= sizeof(u16)) + stm32_spi_clr_bits(spi, STM32FX_SPI_CR2, STM32F7_SPI_CR2_FRXTH); + else + stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32F7_SPI_CR2_FRXTH); + + dev_dbg(spi->dev, "%s: %d bytes left (sr=%08x)\n", + __func__, spi->rx_len, sr); +} + /** * stm32h7_spi_read_rxfifo - Read bytes in Receive Data Register * @spi: pointer to the spi controller data structure @@ -1224,6 +1310,22 @@ static void stm32fx_spi_transfer_one_dma_start(struct stm32_spi *spi) stm32_spi_enable(spi); } +/** + * stm32f7_spi_transfer_one_dma_start - Set SPI driver registers to start + * transfer using DMA + * @spi: pointer to the spi controller data structure + */ +static void stm32f7_spi_transfer_one_dma_start(struct stm32_spi *spi) +{ + /* Configure DMA request trigger threshold according to DMA width */ + if (spi->cur_bpw <= 8) + stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32F7_SPI_CR2_FRXTH); + else + stm32_spi_clr_bits(spi, STM32FX_SPI_CR2, STM32F7_SPI_CR2_FRXTH); + + stm32fx_spi_transfer_one_dma_start(spi); +} + /** * stm32h7_spi_transfer_one_dma_start - Set SPI driver registers to start * transfer using DMA @@ -1362,6 +1464,31 @@ static void stm32f4_spi_set_bpw(struct stm32_spi *spi) stm32_spi_clr_bits(spi, STM32FX_SPI_CR1, STM32F4_SPI_CR1_DFF); } +/** + * stm32f7_spi_set_bpw - Configure bits per word + * @spi: pointer to the spi controller data structure + */ +static void stm32f7_spi_set_bpw(struct stm32_spi *spi) +{ + u32 bpw; + u32 cr2_clrb = 0, cr2_setb = 0; + + bpw = spi->cur_bpw - 1; + + cr2_clrb |= STM32F7_SPI_CR2_DS; + cr2_setb |= FIELD_PREP(STM32F7_SPI_CR2_DS, bpw); + + if (spi->rx_len >= sizeof(u16)) + cr2_clrb |= STM32F7_SPI_CR2_FRXTH; + else + cr2_setb |= STM32F7_SPI_CR2_FRXTH; + + writel_relaxed( + (readl_relaxed(spi->base + STM32FX_SPI_CR2) & + ~cr2_clrb) | cr2_setb, + spi->base + STM32FX_SPI_CR2); +} + /** * stm32h7_spi_set_bpw - configure bits per word * @spi: pointer to the spi controller data structure @@ -1771,6 +1898,27 @@ static const struct stm32_spi_cfg stm32f4_spi_cfg = { .flags = SPI_CONTROLLER_MUST_TX, }; +static const struct stm32_spi_cfg stm32f7_spi_cfg = { + .regs = &stm32fx_spi_regspec, + .get_bpw_mask = stm32f7_spi_get_bpw_mask, + .disable = stm32fx_spi_disable, + .config = stm32fx_spi_config, + .set_bpw = stm32f7_spi_set_bpw, + .set_mode = stm32fx_spi_set_mode, + .write_tx = stm32f7_spi_write_tx, + .read_rx = stm32f7_spi_read_rx, + .transfer_one_dma_start = stm32f7_spi_transfer_one_dma_start, + .dma_tx_cb = stm32fx_spi_dma_tx_cb, + .dma_rx_cb = stm32_spi_dma_rx_cb, + .transfer_one_irq = stm32fx_spi_transfer_one_irq, + .irq_handler_event = stm32fx_spi_irq_event, + .irq_handler_thread = stm32fx_spi_irq_thread, + .baud_rate_div_min = STM32FX_SPI_BR_DIV_MIN, + .baud_rate_div_max = STM32FX_SPI_BR_DIV_MAX, + .has_fifo = false, + .flags = SPI_CONTROLLER_MUST_TX, +}; + static const struct stm32_spi_cfg stm32h7_spi_cfg = { .regs = &stm32h7_spi_regspec, .get_fifo_size = stm32h7_spi_get_fifo_size, @@ -1800,6 +1948,7 @@ static const struct stm32_spi_cfg stm32h7_spi_cfg = { static const struct of_device_id stm32_spi_of_match[] = { { .compatible = "st,stm32h7-spi", .data = (void *)&stm32h7_spi_cfg }, { .compatible = "st,stm32f4-spi", .data = (void *)&stm32f4_spi_cfg }, + { .compatible = "st,stm32f7-spi", .data = (void *)&stm32f7_spi_cfg }, {}, }; MODULE_DEVICE_TABLE(of, stm32_spi_of_match); From 09388379b6d7143ed12fc06900ec9db3bb82ca8f Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 2 Nov 2023 15:37:20 -0400 Subject: [PATCH 0035/1562] spi: add stm32f7-spi compatible The STM32F7 SPI peripheral is nearly identical to the STM32F4, with the only significant differences being support for a wider range of word sizes and the addition of 32-bit transmit and receive FIFOs. Signed-off-by: Ben Wolsieffer Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231102193722.3042245-4-ben.wolsieffer@hefring.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/st,stm32-spi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/spi/st,stm32-spi.yaml b/Documentation/devicetree/bindings/spi/st,stm32-spi.yaml index ae0f082bd377..5754d603f34f 100644 --- a/Documentation/devicetree/bindings/spi/st,stm32-spi.yaml +++ b/Documentation/devicetree/bindings/spi/st,stm32-spi.yaml @@ -23,6 +23,7 @@ properties: compatible: enum: - st,stm32f4-spi + - st,stm32f7-spi - st,stm32h7-spi reg: From dfa8121a6ca7725576f71f7b505f711e1148f151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 15:39:33 +0100 Subject: [PATCH 0036/1562] spi: cadence-xspi: Drop useless assignment to NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static structs are initialized with zeros for unspecified fields. So there is no advantage to explicitly initialize .remove with NULL and the assignment can be dropped without side effects. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105143932.3722920-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-xspi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-cadence-xspi.c b/drivers/spi/spi-cadence-xspi.c index b7e04b03be58..8648b8eb080d 100644 --- a/drivers/spi/spi-cadence-xspi.c +++ b/drivers/spi/spi-cadence-xspi.c @@ -619,7 +619,6 @@ MODULE_DEVICE_TABLE(of, cdns_xspi_of_match); static struct platform_driver cdns_xspi_platform_driver = { .probe = cdns_xspi_probe, - .remove = NULL, .driver = { .name = CDNS_XSPI_NAME, .of_match_table = cdns_xspi_of_match, From 424a8166764e462258fdccaaefbdeb07517c8b21 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Mon, 6 Nov 2023 20:23:55 +0530 Subject: [PATCH 0037/1562] spi: spi-zynqmp-gqspi: fix driver kconfig dependencies ZynqMP GQSPI driver no longer uses spi-master framework. It had been converted to use spi-mem framework. So remove driver dependency from spi-master and replace it with spi-mem. Fixes: 1c26372e5aa9 ("spi: spi-zynqmp-gqspi: Update driver to use spi-mem framework") Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1699282435-884917-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 70c9dd6b6a31..ddae0fde798e 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -1177,9 +1177,10 @@ config SPI_ZYNQ_QSPI config SPI_ZYNQMP_GQSPI tristate "Xilinx ZynqMP GQSPI controller" - depends on (SPI_MASTER && HAS_DMA) || COMPILE_TEST + depends on (SPI_MEM && HAS_DMA) || COMPILE_TEST help Enables Xilinx GQSPI controller driver for Zynq UltraScale+ MPSoC. + This controller only supports SPI memory interface. config SPI_AMD tristate "AMD SPI controller" From 2f2802d1a59d79a3d00cb429841db502c2bbc3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 18:26:50 +0100 Subject: [PATCH 0038/1562] spi: spi-ti-qspi: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Add an error message to the error path that returned an error before to replace the core's error message with more information. Apart from the different wording of the error message, this patch doesn't introduce a semantic difference. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105172649.3738556-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-ti-qspi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c index 4c81516b67db..fdc092a05284 100644 --- a/drivers/spi/spi-ti-qspi.c +++ b/drivers/spi/spi-ti-qspi.c @@ -907,21 +907,22 @@ free_master: return ret; } -static int ti_qspi_remove(struct platform_device *pdev) +static void ti_qspi_remove(struct platform_device *pdev) { struct ti_qspi *qspi = platform_get_drvdata(pdev); int rc; rc = spi_master_suspend(qspi->master); - if (rc) - return rc; + if (rc) { + dev_alert(&pdev->dev, "spi_master_suspend() failed (%pe)\n", + ERR_PTR(rc)); + return; + } pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); ti_qspi_dma_cleanup(qspi); - - return 0; } static const struct dev_pm_ops ti_qspi_pm_ops = { @@ -930,7 +931,7 @@ static const struct dev_pm_ops ti_qspi_pm_ops = { static struct platform_driver ti_qspi_driver = { .probe = ti_qspi_probe, - .remove = ti_qspi_remove, + .remove_new = ti_qspi_remove, .driver = { .name = "ti-qspi", .pm = &ti_qspi_pm_ops, From 08e23d05fa6dc4fc13da0ccf09defdd4bbc92ff4 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 24 Oct 2023 20:30:15 +0200 Subject: [PATCH 0039/1562] PM / devfreq: Fix buffer overflow in trans_stat_show Fix buffer overflow in trans_stat_show(). Convert simple snprintf to the more secure scnprintf with size of PAGE_SIZE. Add condition checking if we are exceeding PAGE_SIZE and exit early from loop. Also add at the end a warning that we exceeded PAGE_SIZE and that stats is disabled. Return -EFBIG in the case where we don't have enough space to write the full transition table. Also document in the ABI that this function can return -EFBIG error. Link: https://lore.kernel.org/all/20231024183016.14648-2-ansuelsmth@gmail.com/ Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218041 Fixes: e552bbaf5b98 ("PM / devfreq: Add sysfs node for representing frequency transition information.") Signed-off-by: Christian Marangi Signed-off-by: Chanwoo Choi --- Documentation/ABI/testing/sysfs-class-devfreq | 3 + drivers/devfreq/devfreq.c | 59 +++++++++++++------ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq index 5e6b74f30406..1e7e0bb4c14e 100644 --- a/Documentation/ABI/testing/sysfs-class-devfreq +++ b/Documentation/ABI/testing/sysfs-class-devfreq @@ -52,6 +52,9 @@ Description: echo 0 > /sys/class/devfreq/.../trans_stat + If the transition table is bigger than PAGE_SIZE, reading + this will return an -EFBIG error. + What: /sys/class/devfreq/.../available_frequencies Date: October 2012 Contact: Nishanth Menon diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index b3a68d5833bd..907f50ab70ed 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1688,7 +1688,7 @@ static ssize_t trans_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct devfreq *df = to_devfreq(dev); - ssize_t len; + ssize_t len = 0; int i, j; unsigned int max_state; @@ -1697,7 +1697,7 @@ static ssize_t trans_stat_show(struct device *dev, max_state = df->max_state; if (max_state == 0) - return sprintf(buf, "Not Supported.\n"); + return scnprintf(buf, PAGE_SIZE, "Not Supported.\n"); mutex_lock(&df->lock); if (!df->stop_polling && @@ -1707,31 +1707,52 @@ static ssize_t trans_stat_show(struct device *dev, } mutex_unlock(&df->lock); - len = sprintf(buf, " From : To\n"); - len += sprintf(buf + len, " :"); - for (i = 0; i < max_state; i++) - len += sprintf(buf + len, "%10lu", - df->freq_table[i]); + len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); + len += scnprintf(buf + len, PAGE_SIZE - len, " :"); + for (i = 0; i < max_state; i++) { + if (len >= PAGE_SIZE - 1) + break; + len += scnprintf(buf + len, PAGE_SIZE - len, "%10lu", + df->freq_table[i]); + } + if (len >= PAGE_SIZE - 1) + return PAGE_SIZE - 1; - len += sprintf(buf + len, " time(ms)\n"); + len += scnprintf(buf + len, PAGE_SIZE - len, " time(ms)\n"); for (i = 0; i < max_state; i++) { + if (len >= PAGE_SIZE - 1) + break; if (df->freq_table[i] == df->previous_freq) - len += sprintf(buf + len, "*"); + len += scnprintf(buf + len, PAGE_SIZE - len, "*"); else - len += sprintf(buf + len, " "); + len += scnprintf(buf + len, PAGE_SIZE - len, " "); + if (len >= PAGE_SIZE - 1) + break; - len += sprintf(buf + len, "%10lu:", df->freq_table[i]); - for (j = 0; j < max_state; j++) - len += sprintf(buf + len, "%10u", - df->stats.trans_table[(i * max_state) + j]); - - len += sprintf(buf + len, "%10llu\n", (u64) - jiffies64_to_msecs(df->stats.time_in_state[i])); + len += scnprintf(buf + len, PAGE_SIZE - len, "%10lu:", + df->freq_table[i]); + for (j = 0; j < max_state; j++) { + if (len >= PAGE_SIZE - 1) + break; + len += scnprintf(buf + len, PAGE_SIZE - len, "%10u", + df->stats.trans_table[(i * max_state) + j]); + } + if (len >= PAGE_SIZE - 1) + break; + len += scnprintf(buf + len, PAGE_SIZE - len, "%10llu\n", (u64) + jiffies64_to_msecs(df->stats.time_in_state[i])); + } + + if (len < PAGE_SIZE - 1) + len += scnprintf(buf + len, PAGE_SIZE - len, "Total transition : %u\n", + df->stats.total_trans); + + if (len >= PAGE_SIZE - 1) { + pr_warn_once("devfreq transition table exceeds PAGE_SIZE. Disabling\n"); + return -EFBIG; } - len += sprintf(buf + len, "Total transition : %u\n", - df->stats.total_trans); return len; } From 4920ee6dcfaf9aec9f4bd14ce6c15a6a758a92ae Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 24 Oct 2023 20:30:16 +0200 Subject: [PATCH 0040/1562] PM / devfreq: Convert to use sysfs_emit_at() API Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Link: https://lore.kernel.org/all/20231024183016.14648-3-ansuelsmth@gmail.com/ Signed-off-by: Christian Marangi Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 907f50ab70ed..017a87465776 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1697,7 +1697,7 @@ static ssize_t trans_stat_show(struct device *dev, max_state = df->max_state; if (max_state == 0) - return scnprintf(buf, PAGE_SIZE, "Not Supported.\n"); + return sysfs_emit(buf, "Not Supported.\n"); mutex_lock(&df->lock); if (!df->stop_polling && @@ -1707,47 +1707,44 @@ static ssize_t trans_stat_show(struct device *dev, } mutex_unlock(&df->lock); - len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); - len += scnprintf(buf + len, PAGE_SIZE - len, " :"); + len += sysfs_emit_at(buf, len, " From : To\n"); + len += sysfs_emit_at(buf, len, " :"); for (i = 0; i < max_state; i++) { if (len >= PAGE_SIZE - 1) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%10lu", - df->freq_table[i]); + len += sysfs_emit_at(buf, len, "%10lu", + df->freq_table[i]); } + if (len >= PAGE_SIZE - 1) return PAGE_SIZE - 1; - - len += scnprintf(buf + len, PAGE_SIZE - len, " time(ms)\n"); + len += sysfs_emit_at(buf, len, " time(ms)\n"); for (i = 0; i < max_state; i++) { if (len >= PAGE_SIZE - 1) break; - if (df->freq_table[i] == df->previous_freq) - len += scnprintf(buf + len, PAGE_SIZE - len, "*"); + if (df->freq_table[2] == df->previous_freq) + len += sysfs_emit_at(buf, len, "*"); else - len += scnprintf(buf + len, PAGE_SIZE - len, " "); + len += sysfs_emit_at(buf, len, " "); if (len >= PAGE_SIZE - 1) break; - - len += scnprintf(buf + len, PAGE_SIZE - len, "%10lu:", - df->freq_table[i]); + len += sysfs_emit_at(buf, len, "%10lu:", df->freq_table[i]); for (j = 0; j < max_state; j++) { if (len >= PAGE_SIZE - 1) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%10u", - df->stats.trans_table[(i * max_state) + j]); + len += sysfs_emit_at(buf, len, "%10u", + df->stats.trans_table[(i * max_state) + j]); } if (len >= PAGE_SIZE - 1) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%10llu\n", (u64) - jiffies64_to_msecs(df->stats.time_in_state[i])); + len += sysfs_emit_at(buf, len, "%10llu\n", (u64) + jiffies64_to_msecs(df->stats.time_in_state[i])); } if (len < PAGE_SIZE - 1) - len += scnprintf(buf + len, PAGE_SIZE - len, "Total transition : %u\n", - df->stats.total_trans); - + len += sysfs_emit_at(buf, len, "Total transition : %u\n", + df->stats.total_trans); if (len >= PAGE_SIZE - 1) { pr_warn_once("devfreq transition table exceeds PAGE_SIZE. Disabling\n"); return -EFBIG; From ecea08916418a94f99f89c543303877cb6e08a11 Mon Sep 17 00:00:00 2001 From: Alper Nebi Yasak Date: Wed, 8 Nov 2023 21:25:13 +0300 Subject: [PATCH 0041/1562] firmware: coreboot: framebuffer: Avoid invalid zero physical address On ARM64 systems coreboot defers framebuffer allocation to its payload, to be done by a libpayload function call. In this case, coreboot tables still include a framebuffer entry with display format details, but the physical address field is set to zero (as in [1], for example). Unfortunately, this field is not automatically updated when the framebuffer is initialized through libpayload, citing that doing so would invalidate checksums over the entire coreboot table [2]. This can be observed on ARM64 Chromebooks with stock firmware. On a Google Kevin (RK3399), trying to use coreboot framebuffer driver as built-in to the kernel results in a benign error. But on Google Hana (MT8173) and Google Cozmo (MT8183) it causes a hang. When the framebuffer physical address field in the coreboot table is zero, we have no idea where coreboot initialized a framebuffer, or even if it did. Instead of trying to set up a framebuffer located at zero, return ENODEV to indicate that there isn't one. [1] https://review.coreboot.org/c/coreboot/+/17109 [2] https://review.coreboot.org/c/coreboot/+/8797 Signed-off-by: Alper Nebi Yasak Reviewed-by: Julius Werner Link: https://lore.kernel.org/r/20231108182625.46563-1-alpernebiyasak@gmail.com Signed-off-by: Tzung-Bi Shih --- drivers/firmware/google/framebuffer-coreboot.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c index c323a818805c..5c84bbebfef8 100644 --- a/drivers/firmware/google/framebuffer-coreboot.c +++ b/drivers/firmware/google/framebuffer-coreboot.c @@ -36,6 +36,9 @@ static int framebuffer_probe(struct coreboot_device *dev) .format = NULL, }; + if (!fb->physical_address) + return -ENODEV; + for (i = 0; i < ARRAY_SIZE(formats); ++i) { if (fb->bits_per_pixel == formats[i].bits_per_pixel && fb->red_mask_pos == formats[i].red.offset && From 49e380795414039f7b3bd44c121104f31738dcf1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 11 Nov 2023 00:52:39 +0800 Subject: [PATCH 0042/1562] platform/chrome: sensorhub: Fix typos Replace 'preceeds' with 'precedes' in the comment. Replace 'porod' with 'period' in the comment. Replace 'noone' with 'no one' in the comment. Replace 'lantency' with 'latency' in the comment. Replace 'kifo' with 'kfifo' in the comment. Replace 'change' with 'chance' in the comment. Signed-off-by: Kuan-Wei Chiu Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20231110165239.1559109-1-visitorckw@gmail.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_sensorhub_ring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_sensorhub_ring.c b/drivers/platform/chrome/cros_ec_sensorhub_ring.c index 71948dade0e2..9e17f7483ca0 100644 --- a/drivers/platform/chrome/cros_ec_sensorhub_ring.c +++ b/drivers/platform/chrome/cros_ec_sensorhub_ring.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(cros_ec_sensorhub_unregister_push_data); * @sensorhub: Sensor Hub object * @on: true when events are requested. * - * To be called before sleeping or when noone is listening. + * To be called before sleeping or when no one is listening. * Return: 0 on success, or an error when we can not communicate with the EC. * */ @@ -175,8 +175,8 @@ static s64 cros_ec_sensor_ring_median(s64 *array, size_t length) * * While a and b are recorded at accurate times (due to the EC real time * nature); c is pretty untrustworthy, even though it's recorded the - * first thing in ec_irq_handler(). There is a very good change we'll get - * added lantency due to: + * first thing in ec_irq_handler(). There is a very good chance we'll get + * added latency due to: * other irqs * ddrfreq * cpuidle @@ -511,7 +511,7 @@ cros_ec_sensor_ring_process_event(struct cros_ec_sensorhub *sensorhub, * ringbuffer. * * This is the new spreading code, assumes every sample's timestamp - * preceeds the sample. Run if tight_timestamps == true. + * precedes the sample. Run if tight_timestamps == true. * * Sometimes the EC receives only one interrupt (hence timestamp) for * a batch of samples. Only the first sample will have the correct @@ -595,7 +595,7 @@ cros_ec_sensor_ring_spread_add(struct cros_ec_sensorhub *sensorhub, } else { /* * Push first sample in the batch to the, - * kifo, it's guaranteed to be correct, the + * kfifo, it's guaranteed to be correct, the * rest will follow later on. */ sample_idx = 1; @@ -701,7 +701,7 @@ done_with_this_batch: * last_out --> * * - * We spread time for the samples using perod p = (current - TS1)/4. + * We spread time for the samples using period p = (current - TS1)/4. * between TS1 and TS2: [TS1+p/4, TS1+2p/4, TS1+3p/4, current_timestamp]. * */ From 022732e3d846e197539712e51ecada90ded0572a Mon Sep 17 00:00:00 2001 From: Chris Riches Date: Wed, 18 Oct 2023 09:23:51 +0000 Subject: [PATCH 0043/1562] audit: Send netlink ACK before setting connection in auditd_set When auditd_set sets the auditd_conn pointer, audit messages can immediately be put on the socket by other kernel threads. If the backlog is large or the rate is high, this can immediately fill the socket buffer. If the audit daemon requested an ACK for this operation, a full socket buffer causes the ACK to get dropped, also setting ENOBUFS on the socket. To avoid this race and ensure ACKs get through, fast-track the ACK in this specific case to ensure it is sent before auditd_conn is set. Signed-off-by: Chris Riches [PM: fix some tab vs space damage] Signed-off-by: Paul Moore --- kernel/audit.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 16205dd29843..9c8e5f732c4c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu) * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer + * @skb: the netlink command from the audit daemon + * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ -static int auditd_set(struct pid *pid, u32 portid, struct net *net) +static int auditd_set(struct pid *pid, u32 portid, struct net *net, + struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; + struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; @@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net) ac_new->portid = portid; ac_new->net = get_net(net); + /* send the ack now to avoid a race with the queue backlog */ + if (*ack) { + nlh = nlmsg_hdr(skb); + netlink_ack(skb, nlh, 0, NULL); + *ack = false; + } + spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); @@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid) return auditd_send_unicast_skb(skb); } -static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + bool *ack) { u32 seq; void *data; @@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, - sock_net(NETLINK_CB(skb).sk)); + sock_net(NETLINK_CB(skb).sk), + skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, @@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ -static void audit_receive(struct sk_buff *skb) +static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; + bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned @@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { - err = audit_receive_msg(skb, nlh); - /* if err or if this message says it wants a response */ - if (err || (nlh->nlmsg_flags & NLM_F_ACK)) + ack = nlh->nlmsg_flags & NLM_F_ACK; + err = audit_receive_msg(skb, nlh, &ack); + + /* send an ack if the user asked for one and audit_receive_msg + * didn't already do it, or if there was an error. */ + if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); From f3b8788cde61b02f1e6c202f8fac4360e6adbafc Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:46 -0700 Subject: [PATCH 0044/1562] LSM: Identify modules by more than name Create a struct lsm_id to contain identifying information about Linux Security Modules (LSMs). At inception this contains the name of the module and an identifier associated with the security module. Change the security_add_hooks() interface to use this structure. Change the individual modules to maintain their own struct lsm_id and pass it to security_add_hooks(). The values are for LSM identifiers are defined in a new UAPI header file linux/lsm.h. Each existing LSM has been updated to include it's LSMID in the lsm_id. The LSM ID values are sequential, with the oldest module LSM_ID_CAPABILITY being the lowest value and the existing modules numbered in the order they were included in the main line kernel. This is an arbitrary convention for assigning the values, but none better presents itself. The value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. This may include attributes of the LSM infrastructure itself, possibly related to namespacing or network attribute management. A special range is identified for such attributes to help reduce confusion for developers unfamiliar with LSMs. LSM attribute values are defined for the attributes presented by modules that are available today. As with the LSM IDs, The value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. Cc: linux-security-module Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Kees Cook Nacked-by: Tetsuo Handa [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- Documentation/userspace-api/index.rst | 1 + MAINTAINERS | 1 + include/linux/lsm_hooks.h | 16 +++++++- include/uapi/linux/lsm.h | 54 +++++++++++++++++++++++++++ security/apparmor/lsm.c | 8 +++- security/bpf/hooks.c | 9 ++++- security/commoncap.c | 8 +++- security/landlock/cred.c | 2 +- security/landlock/fs.c | 2 +- security/landlock/net.c | 2 +- security/landlock/ptrace.c | 2 +- security/landlock/setup.c | 6 +++ security/landlock/setup.h | 1 + security/loadpin/loadpin.c | 9 ++++- security/lockdown/lockdown.c | 8 +++- security/safesetid/lsm.c | 9 ++++- security/security.c | 12 +++--- security/selinux/hooks.c | 9 ++++- security/smack/smack_lsm.c | 8 +++- security/tomoyo/tomoyo.c | 9 ++++- security/yama/yama_lsm.c | 8 +++- 21 files changed, 162 insertions(+), 22 deletions(-) create mode 100644 include/uapi/linux/lsm.h diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 031df47a7c19..8be8b1979194 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -33,6 +33,7 @@ place where this information is gathered. sysfs-platform_profile vduse futex2 + lsm .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..f1d41fd9159a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19511,6 +19511,7 @@ L: linux-security-module@vger.kernel.org (suggested Cc:) S: Supported W: http://kernsec.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git +F: include/uapi/linux/lsm.h F: security/ X: security/selinux/ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index dcb5e5b5eb13..7f0adb33caaa 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -42,6 +42,18 @@ struct security_hook_heads { #undef LSM_HOOK } __randomize_layout; +/** + * struct lsm_id - Identify a Linux Security Module. + * @lsm: name of the LSM, must be approved by the LSM maintainers + * @id: LSM ID number from uapi/linux/lsm.h + * + * Contains the information that identifies the LSM. + */ +struct lsm_id { + const char *name; + u64 id; +}; + /* * Security module hook list structure. * For use with generic list macros for common operations. @@ -50,7 +62,7 @@ struct security_hook_list { struct hlist_node list; struct hlist_head *head; union security_list_options hook; - const char *lsm; + const struct lsm_id *lsmid; } __randomize_layout; /* @@ -104,7 +116,7 @@ extern struct security_hook_heads security_hook_heads; extern char *lsm_names; extern void security_add_hooks(struct security_hook_list *hooks, int count, - const char *lsm); + const struct lsm_id *lsmid); #define LSM_FLAG_LEGACY_MAJOR BIT(0) #define LSM_FLAG_EXCLUSIVE BIT(1) diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h new file mode 100644 index 000000000000..f27c9a9cc376 --- /dev/null +++ b/include/uapi/linux/lsm.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Linux Security Modules (LSM) - User space API + * + * Copyright (C) 2022 Casey Schaufler + * Copyright (C) 2022 Intel Corporation + */ + +#ifndef _UAPI_LINUX_LSM_H +#define _UAPI_LINUX_LSM_H + +/* + * ID tokens to identify Linux Security Modules (LSMs) + * + * These token values are used to uniquely identify specific LSMs + * in the kernel as well as in the kernel's LSM userspace API. + * + * A value of zero/0 is considered undefined and should not be used + * outside the kernel. Values 1-99 are reserved for potential + * future use. + */ +#define LSM_ID_UNDEF 0 +#define LSM_ID_CAPABILITY 100 +#define LSM_ID_SELINUX 101 +#define LSM_ID_SMACK 102 +#define LSM_ID_TOMOYO 103 +#define LSM_ID_IMA 104 +#define LSM_ID_APPARMOR 105 +#define LSM_ID_YAMA 106 +#define LSM_ID_LOADPIN 107 +#define LSM_ID_SAFESETID 108 +#define LSM_ID_LOCKDOWN 109 +#define LSM_ID_BPF 110 +#define LSM_ID_LANDLOCK 111 + +/* + * LSM_ATTR_XXX definitions identify different LSM attributes + * which are used in the kernel's LSM userspace API. Support + * for these attributes vary across the different LSMs. None + * are required. + * + * A value of zero/0 is considered undefined and should not be used + * outside the kernel. Values 1-99 are reserved for potential + * future use. + */ +#define LSM_ATTR_UNDEF 0 +#define LSM_ATTR_CURRENT 100 +#define LSM_ATTR_EXEC 101 +#define LSM_ATTR_FSCREATE 102 +#define LSM_ATTR_KEYCREATE 103 +#define LSM_ATTR_PREV 104 +#define LSM_ATTR_SOCKCREATE 105 + +#endif /* _UAPI_LINUX_LSM_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 4981bdf02993..093da0a9dbd8 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "include/apparmor.h" #include "include/apparmorfs.h" @@ -1385,6 +1386,11 @@ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct aa_task_ctx), }; +const struct lsm_id apparmor_lsmid = { + .name = "apparmor", + .id = LSM_ID_APPARMOR, +}; + static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -2202,7 +2208,7 @@ static int __init apparmor_init(void) goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), - "apparmor"); + &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index cfaf1d0e6a5f..91011e0c361a 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -5,6 +5,7 @@ */ #include #include +#include static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = { #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ @@ -15,9 +16,15 @@ static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = { LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; +const struct lsm_id bpf_lsmid = { + .name = "bpf", + .id = LSM_ID_BPF, +}; + static int __init bpf_lsm_init(void) { - security_add_hooks(bpf_lsm_hooks, ARRAY_SIZE(bpf_lsm_hooks), "bpf"); + security_add_hooks(bpf_lsm_hooks, ARRAY_SIZE(bpf_lsm_hooks), + &bpf_lsmid); pr_info("LSM support for eBPF active\n"); return 0; } diff --git a/security/commoncap.c b/security/commoncap.c index 8e8c630ce204..a64c0c8592bb 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in @@ -1440,6 +1441,11 @@ int cap_mmap_file(struct file *file, unsigned long reqprot, #ifdef CONFIG_SECURITY +const struct lsm_id capability_lsmid = { + .name = "capability", + .id = LSM_ID_CAPABILITY, +}; + static struct security_hook_list capability_hooks[] __ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), @@ -1464,7 +1470,7 @@ static struct security_hook_list capability_hooks[] __ro_after_init = { static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), - "capability"); + &capability_lsmid); return 0; } diff --git a/security/landlock/cred.c b/security/landlock/cred.c index 13dff2a31545..786af18c4a1c 100644 --- a/security/landlock/cred.c +++ b/security/landlock/cred.c @@ -42,5 +42,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), - LANDLOCK_NAME); + &landlock_lsmid); } diff --git a/security/landlock/fs.c b/security/landlock/fs.c index bc7c126deea2..490655d09b43 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1223,5 +1223,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { __init void landlock_add_fs_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), - LANDLOCK_NAME); + &landlock_lsmid); } diff --git a/security/landlock/net.c b/security/landlock/net.c index aaa92c2b1f08..efa1b644a4af 100644 --- a/security/landlock/net.c +++ b/security/landlock/net.c @@ -196,5 +196,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { __init void landlock_add_net_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), - LANDLOCK_NAME); + &landlock_lsmid); } diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c index 8a06d6c492bf..2bfc533d36e4 100644 --- a/security/landlock/ptrace.c +++ b/security/landlock/ptrace.c @@ -116,5 +116,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { __init void landlock_add_ptrace_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), - LANDLOCK_NAME); + &landlock_lsmid); } diff --git a/security/landlock/setup.c b/security/landlock/setup.c index 3e11d303542f..f6dd33143b7f 100644 --- a/security/landlock/setup.c +++ b/security/landlock/setup.c @@ -8,6 +8,7 @@ #include #include +#include #include "common.h" #include "cred.h" @@ -25,6 +26,11 @@ struct lsm_blob_sizes landlock_blob_sizes __ro_after_init = { .lbs_superblock = sizeof(struct landlock_superblock_security), }; +const struct lsm_id landlock_lsmid = { + .name = LANDLOCK_NAME, + .id = LSM_ID_LANDLOCK, +}; + static int __init landlock_init(void) { landlock_add_cred_hooks(); diff --git a/security/landlock/setup.h b/security/landlock/setup.h index 1daffab1ab4b..c4252d46d49d 100644 --- a/security/landlock/setup.h +++ b/security/landlock/setup.h @@ -14,5 +14,6 @@ extern bool landlock_initialized; extern struct lsm_blob_sizes landlock_blob_sizes; +extern const struct lsm_id landlock_lsmid; #endif /* _SECURITY_LANDLOCK_SETUP_H */ diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index a9d40456a064..d682a851de58 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -20,6 +20,7 @@ #include #include #include +#include #define VERITY_DIGEST_FILE_HEADER "# LOADPIN_TRUSTED_VERITY_ROOT_DIGESTS" @@ -208,6 +209,11 @@ static int loadpin_load_data(enum kernel_load_data_id id, bool contents) return loadpin_check(NULL, (enum kernel_read_file_id) id); } +const struct lsm_id loadpin_lsmid = { + .name = "loadpin", + .id = LSM_ID_LOADPIN, +}; + static struct security_hook_list loadpin_hooks[] __ro_after_init = { LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security), LSM_HOOK_INIT(kernel_read_file, loadpin_read_file), @@ -259,7 +265,8 @@ static int __init loadpin_init(void) if (!register_sysctl("kernel/loadpin", loadpin_sysctl_table)) pr_notice("sysctl registration failed!\n"); #endif - security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin"); + security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), + &loadpin_lsmid); return 0; } diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 68d19632aeb7..cd84d8ea1dfb 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -13,6 +13,7 @@ #include #include #include +#include static enum lockdown_reason kernel_locked_down; @@ -75,6 +76,11 @@ static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; +const struct lsm_id lockdown_lsmid = { + .name = "lockdown", + .id = LSM_ID_LOCKDOWN, +}; + static int __init lockdown_lsm_init(void) { #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) @@ -83,7 +89,7 @@ static int __init lockdown_lsm_init(void) lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); #endif security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), - "lockdown"); + &lockdown_lsmid); return 0; } diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index 5be5894aa0ea..f42d5af5ffb0 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "lsm.h" /* Flag indicating whether initialization completed */ @@ -261,6 +262,11 @@ static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old return 0; } +const struct lsm_id safesetid_lsmid = { + .name = "safesetid", + .id = LSM_ID_SAFESETID, +}; + static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), @@ -271,7 +277,8 @@ static struct security_hook_list safesetid_security_hooks[] = { static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, - ARRAY_SIZE(safesetid_security_hooks), "safesetid"); + ARRAY_SIZE(safesetid_security_hooks), + &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; diff --git a/security/security.c b/security/security.c index dcb3e7014f9b..08b1bd9457a9 100644 --- a/security/security.c +++ b/security/security.c @@ -513,17 +513,17 @@ static int lsm_append(const char *new, char **result) * security_add_hooks - Add a modules hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add - * @lsm: the name of the security module + * @lsmid: the identification information for the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, - const char *lsm) + const struct lsm_id *lsmid) { int i; for (i = 0; i < count; i++) { - hooks[i].lsm = lsm; + hooks[i].lsmid = lsmid; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } @@ -532,7 +532,7 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, * and fix this up afterwards. */ if (slab_is_available()) { - if (lsm_append(lsm, &lsm_names) < 0) + if (lsm_append(lsmid->name, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } @@ -3817,7 +3817,7 @@ int security_getprocattr(struct task_struct *p, const char *lsm, struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) { - if (lsm != NULL && strcmp(lsm, hp->lsm)) + if (lsm != NULL && strcmp(lsm, hp->lsmid->name)) continue; return hp->hook.getprocattr(p, name, value); } @@ -3842,7 +3842,7 @@ int security_setprocattr(const char *lsm, const char *name, void *value, struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) { - if (lsm != NULL && strcmp(lsm, hp->lsm)) + if (lsm != NULL && strcmp(lsm, hp->lsmid->name)) continue; return hp->hook.setprocattr(name, value, size); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index feda711c6b7b..f2423dfd19cd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -92,6 +92,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -6950,6 +6951,11 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) } #endif /* CONFIG_IO_URING */ +const struct lsm_id selinux_lsmid = { + .name = "selinux", + .id = LSM_ID_SELINUX, +}; + /* * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order: * 1. any hooks that don't belong to (2.) or (3.) below, @@ -7270,7 +7276,8 @@ static __init int selinux_init(void) hashtab_cache_init(); - security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux"); + security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), + &selinux_lsmid); if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC netcache callback\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 65130a791f57..f73f9a2834eb 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "smack.h" #define TRANS_TRUE "TRUE" @@ -4933,6 +4934,11 @@ struct lsm_blob_sizes smack_blob_sizes __ro_after_init = { .lbs_xattr_count = SMACK_INODE_INIT_XATTRS, }; +const struct lsm_id smack_lsmid = { + .name = "smack", + .id = LSM_ID_SMACK, +}; + static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), @@ -5140,7 +5146,7 @@ static __init int smack_init(void) /* * Register with LSM */ - security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack"); + security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid); smack_enabled = 1; pr_info("Smack: Initializing.\n"); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 255f1b470295..722205433105 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -6,6 +6,7 @@ */ #include +#include #include "common.h" /** @@ -542,6 +543,11 @@ static void tomoyo_task_free(struct task_struct *task) } } +const struct lsm_id tomoyo_lsmid = { + .name = "tomoyo", + .id = LSM_ID_TOMOYO, +}; + /* * tomoyo_security_ops is a "struct security_operations" which is used for * registering TOMOYO. @@ -595,7 +601,8 @@ static int __init tomoyo_init(void) struct tomoyo_task *s = tomoyo_task(current); /* register ourselves with the security framework */ - security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo"); + security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), + &tomoyo_lsmid); pr_info("TOMOYO Linux initialized\n"); s->domain_info = &tomoyo_kernel_domain; atomic_inc(&tomoyo_kernel_domain.users); diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 2503cf153d4a..5cdff292fcae 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -18,6 +18,7 @@ #include #include #include +#include #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 @@ -421,6 +422,11 @@ static int yama_ptrace_traceme(struct task_struct *parent) return rc; } +const struct lsm_id yama_lsmid = { + .name = "yama", + .id = LSM_ID_YAMA, +}; + static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), @@ -471,7 +477,7 @@ static inline void yama_init_sysctl(void) { } static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); - security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama"); + security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid); yama_init_sysctl(); return 0; } From 9285c5ad9d00abfe0f4e2ce4039c8127e7a09738 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:47 -0700 Subject: [PATCH 0045/1562] LSM: Maintain a table of LSM attribute data As LSMs are registered add their lsm_id pointers to a table. This will be used later for attribute reporting. Determine the number of possible security modules based on their respective CONFIG options. This allows the number to be known at build time. This allows data structures and tables to use the constant. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/security.h | 2 ++ security/security.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881..50c178019a58 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -138,6 +138,8 @@ enum lockdown_reason { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; +extern u32 lsm_active_cnt; +extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, diff --git a/security/security.c b/security/security.c index 08b1bd9457a9..0952d6bff4da 100644 --- a/security/security.c +++ b/security/security.c @@ -34,6 +34,25 @@ /* How many LSMs were built into the kernel? */ #define LSM_COUNT (__end_lsm_info - __start_lsm_info) +/* + * How many LSMs are built into the kernel as determined at + * build time. Used to determine fixed array sizes. + * The capability module is accounted for by CONFIG_SECURITY + */ +#define LSM_CONFIG_COUNT ( \ + (IS_ENABLED(CONFIG_SECURITY) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_SAFESETID) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0)) + /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows @@ -245,6 +264,12 @@ static void __init initialize_lsm(struct lsm_info *lsm) } } +/* + * Current index to use while initializing the lsm id list. + */ +u32 lsm_active_cnt __ro_after_init; +const struct lsm_id *lsm_idlist[LSM_CONFIG_COUNT]; + /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { @@ -522,6 +547,18 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, { int i; + /* + * A security module may call security_add_hooks() more + * than once during initialization, and LSM initialization + * is serialized. Landlock is one such case. + * Look at the previous entry, if there is one, for duplication. + */ + if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { + if (lsm_active_cnt >= LSM_CONFIG_COUNT) + panic("%s Too many LSMs registered.\n", __func__); + lsm_idlist[lsm_active_cnt++] = lsmid; + } + for (i = 0; i < count; i++) { hooks[i].lsmid = lsmid; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); From 267c068e5f8b81b68cc4247c94dbba90a21a634e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:48 -0700 Subject: [PATCH 0046/1562] proc: Use lsmids instead of lsm names for attrs Use the LSM ID number instead of the LSM name to identify which security module's attibute data should be shown in /proc/self/attr. The security_[gs]etprocattr() functions have been changed to expect the LSM ID. The change from a string comparison to an integer comparison in these functions will provide a minor performance improvement. Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Paul Moore --- fs/proc/base.c | 29 +++++++++++++++-------------- fs/proc/internal.h | 2 +- include/linux/security.h | 11 +++++------ security/security.c | 15 +++++++-------- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index dd31e3b6bf77..98a031ac2648 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -146,10 +147,10 @@ struct pid_entry { NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) -#define ATTR(LSM, NAME, MODE) \ +#define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ - { .lsm = LSM }) + { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -2726,7 +2727,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, PROC_I(inode)->op.lsm, + length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); @@ -2784,7 +2785,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; - rv = security_setprocattr(PROC_I(inode)->op.lsm, + rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); @@ -2833,27 +2834,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { - ATTR("smack", "current", 0666), + ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { - ATTR("apparmor", "current", 0666), - ATTR("apparmor", "prev", 0444), - ATTR("apparmor", "exec", 0666), + ATTR(LSM_ID_APPARMOR, "current", 0666), + ATTR(LSM_ID_APPARMOR, "prev", 0444), + ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { - ATTR(NULL, "current", 0666), - ATTR(NULL, "prev", 0444), - ATTR(NULL, "exec", 0666), - ATTR(NULL, "fscreate", 0666), - ATTR(NULL, "keycreate", 0666), - ATTR(NULL, "sockcreate", 0666), + ATTR(LSM_ID_UNDEF, "current", 0666), + ATTR(LSM_ID_UNDEF, "prev", 0444), + ATTR(LSM_ID_UNDEF, "exec", 0666), + ATTR(LSM_ID_UNDEF, "fscreate", 0666), + ATTR(LSM_ID_UNDEF, "keycreate", 0666), + ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..a71ac5379584 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -92,7 +92,7 @@ union proc_op { int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); - const char *lsm; + int lsmid; }; struct proc_inode { diff --git a/include/linux/security.h b/include/linux/security.h index 50c178019a58..c81bca77f4f2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -472,10 +472,9 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); -int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, +int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); -int security_setprocattr(const char *lsm, const char *name, void *value, - size_t size); +int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); @@ -1339,14 +1338,14 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } -static inline int security_getprocattr(struct task_struct *p, const char *lsm, +static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { return -EINVAL; } -static inline int security_setprocattr(const char *lsm, char *name, - void *value, size_t size) +static inline int security_setprocattr(int lsmid, char *name, void *value, + size_t size) { return -EINVAL; } diff --git a/security/security.c b/security/security.c index 0952d6bff4da..c66f9faefa40 100644 --- a/security/security.c +++ b/security/security.c @@ -3840,7 +3840,7 @@ EXPORT_SYMBOL(security_d_instantiate); /** * security_getprocattr() - Read an attribute for a task * @p: the task - * @lsm: LSM name + * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @@ -3848,13 +3848,13 @@ EXPORT_SYMBOL(security_d_instantiate); * * Return: Returns the length of @value on success, a negative value otherwise. */ -int security_getprocattr(struct task_struct *p, const char *lsm, - const char *name, char **value) +int security_getprocattr(struct task_struct *p, int lsmid, const char *name, + char **value) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) { - if (lsm != NULL && strcmp(lsm, hp->lsmid->name)) + if (lsmid != 0 && lsmid != hp->lsmid->id) continue; return hp->hook.getprocattr(p, name, value); } @@ -3863,7 +3863,7 @@ int security_getprocattr(struct task_struct *p, const char *lsm, /** * security_setprocattr() - Set an attribute for a task - * @lsm: LSM name + * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @size: attribute value size @@ -3873,13 +3873,12 @@ int security_getprocattr(struct task_struct *p, const char *lsm, * * Return: Returns bytes written on success, a negative value otherwise. */ -int security_setprocattr(const char *lsm, const char *name, void *value, - size_t size) +int security_setprocattr(int lsmid, const char *name, void *value, size_t size) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) { - if (lsm != NULL && strcmp(lsm, hp->lsmid->name)) + if (lsmid != 0 && lsmid != hp->lsmid->id) continue; return hp->hook.setprocattr(name, value, size); } From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:49 -0700 Subject: [PATCH 0047/1562] LSM: syscalls for current process attributes Create a system call lsm_get_self_attr() to provide the security module maintained attributes of the current process. Create a system call lsm_set_self_attr() to set a security module maintained attribute of the current process. Historically these attributes have been exposed to user space via entries in procfs under /proc/self/attr. The attribute value is provided in a lsm_ctx structure. The structure identifies the size of the attribute, and the attribute value. The format of the attribute value is defined by the security module. A flags field is included for LSM specific information. It is currently unused and must be 0. The total size of the data, including the lsm_ctx structure and any padding, is maintained as well. struct lsm_ctx { __u64 id; __u64 flags; __u64 len; __u64 ctx_len; __u8 ctx[]; }; Two new LSM hooks are used to interface with the LSMs. security_getselfattr() collects the lsm_ctx values from the LSMs that support the hook, accounting for space requirements. security_setselfattr() identifies which LSM the attribute is intended for and passes it along. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Signed-off-by: Paul Moore --- Documentation/userspace-api/lsm.rst | 70 +++++++++++++ include/linux/lsm_hook_defs.h | 4 + include/linux/lsm_hooks.h | 1 + include/linux/security.h | 19 ++++ include/linux/syscalls.h | 5 + include/uapi/linux/lsm.h | 36 +++++++ kernel/sys_ni.c | 2 + security/Makefile | 1 + security/lsm_syscalls.c | 57 +++++++++++ security/security.c | 152 ++++++++++++++++++++++++++++ 10 files changed, 347 insertions(+) create mode 100644 Documentation/userspace-api/lsm.rst create mode 100644 security/lsm_syscalls.c diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst new file mode 100644 index 000000000000..f8499f3e2826 --- /dev/null +++ b/Documentation/userspace-api/lsm.rst @@ -0,0 +1,70 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2022 Casey Schaufler +.. Copyright (C) 2022 Intel Corporation + +===================================== +Linux Security Modules +===================================== + +:Author: Casey Schaufler +:Date: July 2023 + +Linux security modules (LSM) provide a mechanism to implement +additional access controls to the Linux security policies. + +The various security modules may support any of these attributes: + +``LSM_ATTR_CURRENT`` is the current, active security context of the +process. +The proc filesystem provides this value in ``/proc/self/attr/current``. +This is supported by the SELinux, Smack and AppArmor security modules. +Smack also provides this value in ``/proc/self/attr/smack/current``. +AppArmor also provides this value in ``/proc/self/attr/apparmor/current``. + +``LSM_ATTR_EXEC`` is the security context of the process at the time the +current image was executed. +The proc filesystem provides this value in ``/proc/self/attr/exec``. +This is supported by the SELinux and AppArmor security modules. +AppArmor also provides this value in ``/proc/self/attr/apparmor/exec``. + +``LSM_ATTR_FSCREATE`` is the security context of the process used when +creating file system objects. +The proc filesystem provides this value in ``/proc/self/attr/fscreate``. +This is supported by the SELinux security module. + +``LSM_ATTR_KEYCREATE`` is the security context of the process used when +creating key objects. +The proc filesystem provides this value in ``/proc/self/attr/keycreate``. +This is supported by the SELinux security module. + +``LSM_ATTR_PREV`` is the security context of the process at the time the +current security context was set. +The proc filesystem provides this value in ``/proc/self/attr/prev``. +This is supported by the SELinux and AppArmor security modules. +AppArmor also provides this value in ``/proc/self/attr/apparmor/prev``. + +``LSM_ATTR_SOCKCREATE`` is the security context of the process used when +creating socket objects. +The proc filesystem provides this value in ``/proc/self/attr/sockcreate``. +This is supported by the SELinux security module. + +Kernel interface +================ + +Set a security attribute of the current process +----------------------------------------------- + +.. kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_set_self_attr + +Get the specified security attributes of the current process +------------------------------------------------------------ + +.. kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_get_self_attr + +Additional documentation +======================== + +* Documentation/security/lsm.rst +* Documentation/security/lsm-development.rst diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..c925a0d26edf 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) +LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, + struct lsm_ctx __user *ctx, size_t *size, u32 flags) +LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, + struct lsm_ctx *ctx, size_t size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7f0adb33caaa..a2ade0ffe9e7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,6 +25,7 @@ #ifndef __LINUX_LSM_HOOKS_H #define __LINUX_LSM_HOOKS_H +#include #include #include #include diff --git a/include/linux/security.h b/include/linux/security.h index c81bca77f4f2..dd1fe487385d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -60,6 +60,7 @@ struct fs_parameter; enum fs_value_type; struct watch; struct watch_notification; +struct lsm_ctx; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags); +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t size, u32 flags); int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); int security_setprocattr(int lsmid, const char *name, void *value, size_t size); @@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } +static inline int security_getselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int security_setselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t size, u32 flags) +{ + return -EOPNOTSUPP; +} + static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e92..4e1e56a24f1e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -71,6 +71,7 @@ struct clone_args; struct open_how; struct mount_attr; struct landlock_ruleset_attr; +struct lsm_ctx; enum landlock_rule_type; struct cachestat_range; struct cachestat; @@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t *size, __u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, __u32 flags); /* * Architecture-specific system calls diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index f27c9a9cc376..eeda59a77c02 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -9,6 +9,36 @@ #ifndef _UAPI_LINUX_LSM_H #define _UAPI_LINUX_LSM_H +#include +#include + +/** + * struct lsm_ctx - LSM context information + * @id: the LSM id number, see LSM_ID_XXX + * @flags: LSM specific flags + * @len: length of the lsm_ctx struct, @ctx and any other data or padding + * @ctx_len: the size of @ctx + * @ctx: the LSM context value + * + * The @len field MUST be equal to the size of the lsm_ctx struct + * plus any additional padding and/or data placed after @ctx. + * + * In all cases @ctx_len MUST be equal to the length of @ctx. + * If @ctx is a string value it should be nul terminated with + * @ctx_len equal to `strlen(@ctx) + 1`. Binary values are + * supported. + * + * The @flags and @ctx fields SHOULD only be interpreted by the + * LSM specified by @id; they MUST be set to zero/0 when not used. + */ +struct lsm_ctx { + __u64 id; + __u64 flags; + __u64 len; + __u64 ctx_len; + __u8 ctx[]; +}; + /* * ID tokens to identify Linux Security Modules (LSMs) * @@ -51,4 +81,10 @@ #define LSM_ATTR_PREV 104 #define LSM_ATTR_SOCKCREATE 105 +/* + * LSM_FLAG_XXX definitions identify special handling instructions + * for the API. + */ +#define LSM_FLAG_SINGLE 0x0001 + #endif /* _UAPI_LINUX_LSM_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e1a6e3c675c0..1f61b8452a6e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -171,6 +171,8 @@ COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); +COND_SYSCALL(lsm_get_self_attr); +COND_SYSCALL(lsm_set_self_attr); /* CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/security/Makefile b/security/Makefile index 18121f8f85cd..59f238490665 100644 --- a/security/Makefile +++ b/security/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_KEYS) += keys/ # always enable default capabilities obj-y += commoncap.o +obj-$(CONFIG_SECURITY) += lsm_syscalls.o obj-$(CONFIG_MMU) += min_addr.o # Object file lists diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c new file mode 100644 index 000000000000..226ae80d9683 --- /dev/null +++ b/security/lsm_syscalls.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * System calls implementing the Linux Security Module API. + * + * Copyright (C) 2022 Casey Schaufler + * Copyright (C) 2022 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * sys_lsm_set_self_attr - Set current task's security module attribute + * @attr: which attribute to set + * @ctx: the LSM contexts + * @size: size of @ctx + * @flags: reserved for future use + * + * Sets the calling task's LSM context. On success this function + * returns 0. If the attribute specified cannot be set a negative + * value indicating the reason for the error is returned. + */ +SYSCALL_DEFINE4(lsm_set_self_attr, unsigned int, attr, struct lsm_ctx __user *, + ctx, size_t, size, u32, flags) +{ + return security_setselfattr(attr, ctx, size, flags); +} + +/** + * sys_lsm_get_self_attr - Return current task's security module attributes + * @attr: which attribute to return + * @ctx: the user-space destination for the information, or NULL + * @size: pointer to the size of space available to receive the data + * @flags: special handling options. LSM_FLAG_SINGLE indicates that only + * attributes associated with the LSM identified in the passed @ctx be + * reported. + * + * Returns the calling task's LSM contexts. On success this + * function returns the number of @ctx array elements. This value + * may be zero if there are no LSM contexts assigned. If @size is + * insufficient to contain the return data -E2BIG is returned and + * @size is set to the minimum required size. In all other cases + * a negative value indicating the error is returned. + */ +SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *, + ctx, size_t __user *, size, u32, flags) +{ + return security_getselfattr(attr, ctx, size, flags); +} diff --git a/security/security.c b/security/security.c index c66f9faefa40..9757d009113f 100644 --- a/security/security.c +++ b/security/security.c @@ -3837,6 +3837,158 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode) } EXPORT_SYMBOL(security_d_instantiate); +/* + * Please keep this in sync with it's counterpart in security/lsm_syscalls.c + */ + +/** + * security_getselfattr - Read an LSM attribute of the current process. + * @attr: which attribute to return + * @uctx: the user-space destination for the information, or NULL + * @size: pointer to the size of space available to receive the data + * @flags: special handling options. LSM_FLAG_SINGLE indicates that only + * attributes associated with the LSM identified in the passed @ctx be + * reported. + * + * A NULL value for @uctx can be used to get both the number of attributes + * and the size of the data. + * + * Returns the number of attributes found on success, negative value + * on error. @size is reset to the total size of the data. + * If @size is insufficient to contain the data -E2BIG is returned. + */ +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, + size_t __user *size, u32 flags) +{ + struct security_hook_list *hp; + struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; + u8 __user *base = (u8 __user *)uctx; + size_t total = 0; + size_t entrysize; + size_t left; + bool toobig = false; + bool single = false; + int count = 0; + int rc; + + if (attr == LSM_ATTR_UNDEF) + return -EINVAL; + if (size == NULL) + return -EINVAL; + if (get_user(left, size)) + return -EFAULT; + + if (flags) { + /* + * Only flag supported is LSM_FLAG_SINGLE + */ + if (flags != LSM_FLAG_SINGLE) + return -EINVAL; + if (uctx && copy_from_user(&lctx, uctx, sizeof(lctx))) + return -EFAULT; + /* + * If the LSM ID isn't specified it is an error. + */ + if (lctx.id == LSM_ID_UNDEF) + return -EINVAL; + single = true; + } + + /* + * In the usual case gather all the data from the LSMs. + * In the single case only get the data from the LSM specified. + */ + hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) { + if (single && lctx.id != hp->lsmid->id) + continue; + entrysize = left; + if (base) + uctx = (struct lsm_ctx __user *)(base + total); + rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags); + if (rc == -EOPNOTSUPP) { + rc = 0; + continue; + } + if (rc == -E2BIG) { + toobig = true; + left = 0; + } else if (rc < 0) + return rc; + else + left -= entrysize; + + total += entrysize; + count += rc; + if (single) + break; + } + if (put_user(total, size)) + return -EFAULT; + if (toobig) + return -E2BIG; + if (count == 0) + return LSM_RET_DEFAULT(getselfattr); + return count; +} + +/* + * Please keep this in sync with it's counterpart in security/lsm_syscalls.c + */ + +/** + * security_setselfattr - Set an LSM attribute on the current process. + * @attr: which attribute to set + * @uctx: the user-space source for the information + * @size: the size of the data + * @flags: reserved for future use, must be 0 + * + * Set an LSM attribute for the current process. The LSM, attribute + * and new value are included in @uctx. + * + * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT + * if the user buffer is inaccessible, E2BIG if size is too big, or an + * LSM specific failure. + */ +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, + size_t size, u32 flags) +{ + struct security_hook_list *hp; + struct lsm_ctx *lctx; + int rc = LSM_RET_DEFAULT(setselfattr); + + if (flags) + return -EINVAL; + if (size < sizeof(*lctx)) + return -EINVAL; + if (size > PAGE_SIZE) + return -E2BIG; + + lctx = kmalloc(size, GFP_KERNEL); + if (lctx == NULL) + return -ENOMEM; + + if (copy_from_user(lctx, uctx, size)) { + rc = -EFAULT; + goto free_out; + } + + if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) || + lctx->len < lctx->ctx_len + sizeof(*lctx)) { + rc = -EINVAL; + goto free_out; + } + + hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list) + if ((hp->lsmid->id) == lctx->id) { + rc = hp->hook.setselfattr(attr, lctx, size, flags); + break; + } + +free_out: + kfree(lctx); + return rc; +} + /** * security_getprocattr() - Read an attribute for a task * @p: the task From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:50 -0700 Subject: [PATCH 0048/1562] LSM: Create lsm_list_modules system call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a system call to report the list of Linux Security Modules that are active on the system. The list is provided as an array of LSM ID numbers. The calling application can use this list determine what LSM specific actions it might take. That might include choosing an output format, determining required privilege or bypassing security module specific behavior. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- Documentation/userspace-api/lsm.rst | 3 +++ include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + security/lsm_syscalls.c | 39 +++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+) diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst index f8499f3e2826..a76da373841b 100644 --- a/Documentation/userspace-api/lsm.rst +++ b/Documentation/userspace-api/lsm.rst @@ -63,6 +63,9 @@ Get the specified security attributes of the current process .. kernel-doc:: security/lsm_syscalls.c :identifiers: sys_lsm_get_self_attr +.. kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_list_modules + Additional documentation ======================== diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4e1e56a24f1e..feec5719750b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t *size, __u32 flags); asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t size, __u32 flags); +asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags); /* * Architecture-specific system calls diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1f61b8452a6e..9fa5989bf2ce 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -173,6 +173,7 @@ COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); COND_SYSCALL(lsm_get_self_attr); COND_SYSCALL(lsm_set_self_attr); +COND_SYSCALL(lsm_list_modules); /* CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c index 226ae80d9683..329aaca5efc0 100644 --- a/security/lsm_syscalls.c +++ b/security/lsm_syscalls.c @@ -55,3 +55,42 @@ SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *, { return security_getselfattr(attr, ctx, size, flags); } + +/** + * sys_lsm_list_modules - Return a list of the active security modules + * @ids: the LSM module ids + * @size: pointer to size of @ids, updated on return + * @flags: reserved for future use, must be zero + * + * Returns a list of the active LSM ids. On success this function + * returns the number of @ids array elements. This value may be zero + * if there are no LSMs active. If @size is insufficient to contain + * the return data -E2BIG is returned and @size is set to the minimum + * required size. In all other cases a negative value indicating the + * error is returned. + */ +SYSCALL_DEFINE3(lsm_list_modules, u64 __user *, ids, size_t __user *, size, + u32, flags) +{ + size_t total_size = lsm_active_cnt * sizeof(*ids); + size_t usize; + int i; + + if (flags) + return -EINVAL; + + if (get_user(usize, size)) + return -EFAULT; + + if (put_user(total_size, size) != 0) + return -EFAULT; + + if (usize < total_size) + return -E2BIG; + + for (i = 0; i < lsm_active_cnt; i++) + if (put_user(lsm_idlist[i]->id, ids++)) + return -EFAULT; + + return lsm_active_cnt; +} From 5f42375904b08890f2e8e7cd955c5bf0c2c0d05a Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:51 -0700 Subject: [PATCH 0049/1562] LSM: wireup Linux Security Module syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wireup lsm_get_self_attr, lsm_set_self_attr and lsm_list_modules system calls. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Acked-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Cc: linux-api@vger.kernel.org Reviewed-by: Mickaël Salaün [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- arch/alpha/kernel/syscalls/syscall.tbl | 3 +++ arch/arm/tools/syscall.tbl | 3 +++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 6 ++++++ arch/m68k/kernel/syscalls/syscall.tbl | 3 +++ arch/microblaze/kernel/syscalls/syscall.tbl | 3 +++ arch/mips/kernel/syscalls/syscall_n32.tbl | 3 +++ arch/mips/kernel/syscalls/syscall_n64.tbl | 3 +++ arch/mips/kernel/syscalls/syscall_o32.tbl | 3 +++ arch/parisc/kernel/syscalls/syscall.tbl | 3 +++ arch/powerpc/kernel/syscalls/syscall.tbl | 3 +++ arch/s390/kernel/syscalls/syscall.tbl | 3 +++ arch/sh/kernel/syscalls/syscall.tbl | 3 +++ arch/sparc/kernel/syscalls/syscall.tbl | 3 +++ arch/x86/entry/syscalls/syscall_32.tbl | 3 +++ arch/x86/entry/syscalls/syscall_64.tbl | 3 +++ arch/xtensa/kernel/syscalls/syscall.tbl | 3 +++ include/uapi/asm-generic/unistd.h | 9 ++++++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 3 +++ tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 3 +++ tools/perf/arch/s390/entry/syscalls/syscall.tbl | 3 +++ tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 3 +++ 22 files changed, 72 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 18c842ca6c32..b04af0c9fcbc 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -496,3 +496,6 @@ 564 common futex_wake sys_futex_wake 565 common futex_wait sys_futex_wait 566 common futex_requeue sys_futex_requeue +567 common lsm_get_self_attr sys_lsm_get_self_attr +568 common lsm_set_self_attr sys_lsm_set_self_attr +569 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 584f9528c996..43313beefae7 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -470,3 +470,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 531effca5f1f..abe10a833fcd 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 457 +#define __NR_compat_syscalls 460 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 9f7c1bf99526..ab1a7c2b6653 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -919,6 +919,12 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_lsm_get_self_attr 457 +__SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr) +#define __NR_lsm_set_self_attr 458 +__SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) +#define __NR_lsm_list_modules 459 +__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) /* * Please add new compat syscalls above this comment and update diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 7a4b780e82cb..90629ffc6732 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -456,3 +456,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 5b6a0b02b7de..c395dece73b4 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -462,3 +462,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index a842b41c8e06..4a876c4e77d6 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -395,3 +395,6 @@ 454 n32 futex_wake sys_futex_wake 455 n32 futex_wait sys_futex_wait 456 n32 futex_requeue sys_futex_requeue +457 n32 lsm_get_self_attr sys_lsm_get_self_attr +458 n32 lsm_set_self_attr sys_lsm_set_self_attr +459 n32 lsm_list_modules sys_lsm_list_modules diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 116ff501bf92..b74c8571f063 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -371,3 +371,6 @@ 454 n64 futex_wake sys_futex_wake 455 n64 futex_wait sys_futex_wait 456 n64 futex_requeue sys_futex_requeue +457 n64 lsm_get_self_attr sys_lsm_get_self_attr +458 n64 lsm_set_self_attr sys_lsm_set_self_attr +459 n64 lsm_list_modules sys_lsm_list_modules diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 525cc54bc63b..bf41906e1f68 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -444,3 +444,6 @@ 454 o32 futex_wake sys_futex_wake 455 o32 futex_wait sys_futex_wait 456 o32 futex_requeue sys_futex_requeue +457 o32 lsm_get_self_attr sys_lsm_get_self_attr +458 032 lsm_set_self_attr sys_lsm_set_self_attr +459 o32 lsm_list_modules sys_lsm_list_modules diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index a47798fed54e..ccc0a679e774 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -455,3 +455,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 7fab411378f2..a6f37e2333cb 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -543,3 +543,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 86fec9b080f6..4b818e9ee832 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -459,3 +459,6 @@ 454 common futex_wake sys_futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 363fae0fe9bf..1a3d88d1a07f 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -459,3 +459,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7bcaa3d5ea44..e0e8cec62358 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -502,3 +502,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c8fac5205803..6e45e693f339 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -461,3 +461,6 @@ 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait 456 i386 futex_requeue sys_futex_requeue +457 i386 lsm_get_self_attr sys_lsm_get_self_attr +458 i386 lsm_set_self_attr sys_lsm_set_self_attr +459 i386 lsm_list_modules sys_lsm_list_modules diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 8cb8bf68721c..d3b41d059d4d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -378,6 +378,9 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 06eefa9c1458..284784ea5a46 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -427,3 +427,6 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common lsm_get_self_attr sys_lsm_get_self_attr +458 common lsm_set_self_attr sys_lsm_set_self_attr +459 common lsm_list_modules sys_lsm_list_modules diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 756b013fb832..55cc0bcfb58d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -829,8 +829,15 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_lsm_get_self_attr 457 +__SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr) +#define __NR_lsm_set_self_attr 458 +__SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) +#define __NR_lsm_list_modules 459 +__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #undef __NR_syscalls -#define __NR_syscalls 457 +#define __NR_syscalls 460 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 80be0e98ea0c..81c772c0f5c8 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -367,3 +367,6 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 n64 cachestat sys_cachestat 452 n64 fchmodat2 sys_fchmodat2 +453 n64 lsm_get_self_attr sys_lsm_get_self_attr +454 n64 lsm_set_self_attr sys_lsm_set_self_attr +455 n64 lsm_list_modules sys_lsm_list_modules diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index e1412519b4ad..861c6ca0a8c3 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -539,3 +539,6 @@ 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +453 common lsm_get_self_attr sys_lsm_get_self_attr +454 common lsm_set_self_attr sys_lsm_set_self_attr +455 common lsm_list_modules sys_lsm_list_modules diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index cc0bc144b661..5a422443cb16 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -455,3 +455,6 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 +453 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr +454 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr +455 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 2a62eaf30d69..e692c88105a6 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,6 +375,9 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack +454 common lsm_get_self_attr sys_lsm_get_self_attr +455 common lsm_set_self_attr sys_lsm_set_self_attr +456 common lsm_list_modules sys_lsm_list_modules # # Due to a historical design error, certain syscalls are numbered differently From e1ca7129db2c3b3c4d261702905a752e6b2710b4 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:52 -0700 Subject: [PATCH 0050/1562] LSM: Helpers for attribute names and filling lsm_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add lsm_name_to_attr(), which translates a text string to a LSM_ATTR value if one is available. Add lsm_fill_user_ctx(), which fills a struct lsm_ctx, including the trailing attribute value. Both are used in module specific components of LSM system calls. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Serge Hallyn Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/security.h | 14 ++++++++++++++ security/lsm_syscalls.c | 24 +++++++++++++++++++++++ security/security.c | 41 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index dd1fe487385d..334f75aa7289 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -264,6 +265,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb); /* prototypes */ extern int security_init(void); extern int early_security_init(void); +extern u64 lsm_name_to_attr(const char *name); /* Security operations */ int security_binder_set_context_mgr(const struct cred *mgr); @@ -490,6 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); +int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, + size_t context_size, u64 id, u64 flags); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -507,6 +511,11 @@ static inline int unregister_blocking_lsm_notifier(struct notifier_block *nb) return 0; } +static inline u64 lsm_name_to_attr(const char *name) +{ + return LSM_ATTR_UNDEF; +} + static inline void security_free_mnt_opts(void **mnt_opts) { } @@ -1415,6 +1424,11 @@ static inline int security_locked_down(enum lockdown_reason what) { return 0; } +static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, + size_t context_size, u64 id, u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_SECURITY */ #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c index 329aaca5efc0..5d391b1f7e69 100644 --- a/security/lsm_syscalls.c +++ b/security/lsm_syscalls.c @@ -17,6 +17,30 @@ #include #include +/** + * lsm_name_to_attr - map an LSM attribute name to its ID + * @name: name of the attribute + * + * Returns the LSM attribute value associated with @name, or 0 if + * there is no mapping. + */ +u64 lsm_name_to_attr(const char *name) +{ + if (!strcmp(name, "current")) + return LSM_ATTR_CURRENT; + if (!strcmp(name, "exec")) + return LSM_ATTR_EXEC; + if (!strcmp(name, "fscreate")) + return LSM_ATTR_FSCREATE; + if (!strcmp(name, "keycreate")) + return LSM_ATTR_KEYCREATE; + if (!strcmp(name, "prev")) + return LSM_ATTR_PREV; + if (!strcmp(name, "sockcreate")) + return LSM_ATTR_SOCKCREATE; + return LSM_ATTR_UNDEF; +} + /** * sys_lsm_set_self_attr - Set current task's security module attribute * @attr: which attribute to set diff --git a/security/security.c b/security/security.c index 9757d009113f..988483fcf153 100644 --- a/security/security.c +++ b/security/security.c @@ -771,6 +771,47 @@ static int lsm_superblock_alloc(struct super_block *sb) return 0; } +/** + * lsm_fill_user_ctx - Fill a user space lsm_ctx structure + * @ctx: an LSM context to be filled + * @context: the new context value + * @context_size: the size of the new context value + * @id: LSM id + * @flags: LSM defined flags + * + * Fill all of the fields in a user space lsm_ctx structure. + * Caller is assumed to have verified that @ctx has enough space + * for @context. + * + * Returns 0 on success, -EFAULT on a copyout error, -ENOMEM + * if memory can't be allocated. + */ +int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, + size_t context_size, u64 id, u64 flags) +{ + struct lsm_ctx *lctx; + size_t locallen = struct_size(lctx, ctx, context_size); + int rc = 0; + + lctx = kzalloc(locallen, GFP_KERNEL); + if (lctx == NULL) + return -ENOMEM; + + lctx->id = id; + lctx->flags = flags; + lctx->ctx_len = context_size; + lctx->len = locallen; + + memcpy(lctx->ctx, context, context_size); + + if (copy_to_user(ctx, lctx, locallen)) + rc = -EFAULT; + + kfree(lctx); + + return rc; +} + /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: From 38b323e5881608b5a229526d9a567df6182255ef Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:53 -0700 Subject: [PATCH 0051/1562] Smack: implement setselfattr and getselfattr hooks Implement Smack support for security_[gs]etselfattr. Refactor the setprocattr hook to avoid code duplication. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen Signed-off-by: Paul Moore --- security/smack/smack_lsm.c | 95 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index f73f9a2834eb..12160d060cc1 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3626,6 +3626,46 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) return; } +/** + * smack_getselfattr - Smack current process attribute + * @attr: which attribute to fetch + * @ctx: buffer to receive the result + * @size: available size in, actual size out + * @flags: unused + * + * Fill the passed user space @ctx with the details of the requested + * attribute. + * + * Returns the number of attributes on success, an error code otherwise. + * There will only ever be one attribute. + */ +static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t *size, u32 flags) +{ + struct smack_known *skp = smk_of_current(); + int total; + int slen; + int rc; + + if (attr != LSM_ATTR_CURRENT) + return -EOPNOTSUPP; + + slen = strlen(skp->smk_known) + 1; + total = ALIGN(slen + sizeof(*ctx), 8); + if (total > *size) + rc = -E2BIG; + else if (ctx) + rc = lsm_fill_user_ctx(ctx, skp->smk_known, slen, LSM_ID_SMACK, + 0); + else + rc = 1; + + *size = total; + if (rc >= 0) + return 1; + return rc; +} + /** * smack_getprocattr - Smack process attribute access * @p: the object task @@ -3655,8 +3695,8 @@ static int smack_getprocattr(struct task_struct *p, const char *name, char **val } /** - * smack_setprocattr - Smack process attribute setting - * @name: the name of the attribute in /proc/.../attr + * do_setattr - Smack process attribute setting + * @attr: the ID of the attribute * @value: the value to set * @size: the size of the value * @@ -3665,7 +3705,7 @@ static int smack_getprocattr(struct task_struct *p, const char *name, char **val * * Returns the length of the smack label or an error code */ -static int smack_setprocattr(const char *name, void *value, size_t size) +static int do_setattr(u64 attr, void *value, size_t size) { struct task_smack *tsp = smack_cred(current_cred()); struct cred *new; @@ -3679,8 +3719,8 @@ static int smack_setprocattr(const char *name, void *value, size_t size) if (value == NULL || size == 0 || size >= SMK_LONGLABEL) return -EINVAL; - if (strcmp(name, "current") != 0) - return -EINVAL; + if (attr != LSM_ATTR_CURRENT) + return -EOPNOTSUPP; skp = smk_import_entry(value, size); if (IS_ERR(skp)) @@ -3719,6 +3759,49 @@ static int smack_setprocattr(const char *name, void *value, size_t size) return size; } +/** + * smack_setselfattr - Set a Smack process attribute + * @attr: which attribute to set + * @ctx: buffer containing the data + * @size: size of @ctx + * @flags: unused + * + * Fill the passed user space @ctx with the details of the requested + * attribute. + * + * Returns 0 on success, an error code otherwise. + */ +static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, u32 flags) +{ + int rc; + + rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); + if (rc > 0) + return 0; + return rc; +} + +/** + * smack_setprocattr - Smack process attribute setting + * @name: the name of the attribute in /proc/.../attr + * @value: the value to set + * @size: the size of the value + * + * Sets the Smack value of the task. Only setting self + * is permitted and only with privilege + * + * Returns the length of the smack label or an error code + */ +static int smack_setprocattr(const char *name, void *value, size_t size) +{ + int attr = lsm_name_to_attr(name); + + if (attr != LSM_ATTR_UNDEF) + return do_setattr(attr, value, size); + return -EINVAL; +} + /** * smack_unix_stream_connect - Smack access on UDS * @sock: one sock @@ -5033,6 +5116,8 @@ static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(d_instantiate, smack_d_instantiate), + LSM_HOOK_INIT(getselfattr, smack_getselfattr), + LSM_HOOK_INIT(setselfattr, smack_setselfattr), LSM_HOOK_INIT(getprocattr, smack_getprocattr), LSM_HOOK_INIT(setprocattr, smack_setprocattr), From 223981db9bafb80f558162c148f261e2ff043dbe Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:54 -0700 Subject: [PATCH 0052/1562] AppArmor: Add selfattr hooks Add hooks for setselfattr and getselfattr. These hooks are not very different from their setprocattr and getprocattr equivalents, and much of the code is shared. Signed-off-by: Casey Schaufler Acked-by: John Johansen [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- security/apparmor/include/procattr.h | 2 +- security/apparmor/lsm.c | 91 ++++++++++++++++++++++++++-- security/apparmor/procattr.c | 10 +-- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/security/apparmor/include/procattr.h b/security/apparmor/include/procattr.h index 31689437e0e1..03dbfdb2f2c0 100644 --- a/security/apparmor/include/procattr.h +++ b/security/apparmor/include/procattr.h @@ -11,7 +11,7 @@ #ifndef __AA_PROCATTR_H #define __AA_PROCATTR_H -int aa_getprocattr(struct aa_label *label, char **string); +int aa_getprocattr(struct aa_label *label, char **string, bool newline); int aa_setprocattr_changehat(char *args, size_t size, int flags); #endif /* __AA_PROCATTR_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 093da0a9dbd8..8165f80c10ff 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -776,6 +776,55 @@ static int apparmor_sb_pivotroot(const struct path *old_path, return error; } +static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, + size_t *size, u32 flags) +{ + int error = -ENOENT; + struct aa_task_ctx *ctx = task_ctx(current); + struct aa_label *label = NULL; + size_t total_len = 0; + char *value; + + switch (attr) { + case LSM_ATTR_CURRENT: + label = aa_get_newest_label(cred_label(current_cred())); + break; + case LSM_ATTR_PREV: + if (ctx->previous) + label = aa_get_newest_label(ctx->previous); + break; + case LSM_ATTR_EXEC: + if (ctx->onexec) + label = aa_get_newest_label(ctx->onexec); + break; + default: + error = -EOPNOTSUPP; + break; + } + + if (label) { + error = aa_getprocattr(label, &value, false); + if (error > 0) { + total_len = ALIGN(struct_size(lx, ctx, error), 8); + if (total_len > *size) + error = -E2BIG; + else if (lx) + error = lsm_fill_user_ctx(lx, value, error, + LSM_ID_APPARMOR, 0); + else + error = 1; + } + kfree(value); + } + + aa_put_label(label); + + *size = total_len; + if (error < 0) + return error; + return 1; +} + static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { @@ -795,7 +844,7 @@ static int apparmor_getprocattr(struct task_struct *task, const char *name, error = -EINVAL; if (label) - error = aa_getprocattr(label, value); + error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); @@ -803,8 +852,7 @@ static int apparmor_getprocattr(struct task_struct *task, const char *name, return error; } -static int apparmor_setprocattr(const char *name, void *value, - size_t size) +static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; @@ -835,7 +883,7 @@ static int apparmor_setprocattr(const char *name, void *value, goto out; arg_size = size - (args - (largs ? largs : (char *) value)); - if (strcmp(name, "current") == 0) { + if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); @@ -850,7 +898,7 @@ static int apparmor_setprocattr(const char *name, void *value, error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; - } else if (strcmp(name, "exec") == 0) { + } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) @@ -870,13 +918,42 @@ out: fail: ad.subj_label = begin_current_label_crit_section(); - ad.info = name; + if (attr == LSM_ATTR_CURRENT) + ad.info = "current"; + else if (attr == LSM_ATTR_EXEC) + ad.info = "exec"; + else + ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } +static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, u32 flags) +{ + int rc; + + if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) + return -EOPNOTSUPP; + + rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); + if (rc > 0) + return 0; + return rc; +} + +static int apparmor_setprocattr(const char *name, void *value, + size_t size) +{ + int attr = lsm_name_to_attr(name); + + if (attr) + return do_setattr(attr, value, size); + return -EINVAL; +} + /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) @@ -1424,6 +1501,8 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), + LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), + LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), diff --git a/security/apparmor/procattr.c b/security/apparmor/procattr.c index 197d41f9c32b..e3857e3d7c6c 100644 --- a/security/apparmor/procattr.c +++ b/security/apparmor/procattr.c @@ -20,6 +20,7 @@ * aa_getprocattr - Return the label information for @label * @label: the label to print label info about (NOT NULL) * @string: Returns - string containing the label info (NOT NULL) + * @newline: indicates that a newline should be added * * Requires: label != NULL && string != NULL * @@ -27,7 +28,7 @@ * * Returns: size of string placed in @string else error code on failure */ -int aa_getprocattr(struct aa_label *label, char **string) +int aa_getprocattr(struct aa_label *label, char **string, bool newline) { struct aa_ns *ns = labels_ns(label); struct aa_ns *current_ns = aa_get_current_ns(); @@ -57,11 +58,12 @@ int aa_getprocattr(struct aa_label *label, char **string) return len; } - (*string)[len] = '\n'; - (*string)[len + 1] = 0; + if (newline) + (*string)[len++] = '\n'; + (*string)[len] = 0; aa_put_ns(current_ns); - return len + 1; + return len; } /** From 762c934317e6f4b576eb4aa75e5facf4968a4a8f Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:55 -0700 Subject: [PATCH 0053/1562] SELinux: Add selfattr hooks Add hooks for setselfattr and getselfattr. These hooks are not very different from their setprocattr and getprocattr equivalents, and much of the code is shared. Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler Signed-off-by: Paul Moore --- security/selinux/hooks.c | 134 +++++++++++++++++++++++++++++++-------- 1 file changed, 107 insertions(+), 27 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f2423dfd19cd..b6c7930a3ab2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6285,8 +6285,8 @@ static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) inode_doinit_with_dentry(inode, dentry); } -static int selinux_getprocattr(struct task_struct *p, - const char *name, char **value) +static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, + char **value) { const struct task_security_struct *__tsec; u32 sid; @@ -6303,20 +6303,27 @@ static int selinux_getprocattr(struct task_struct *p, goto bad; } - if (!strcmp(name, "current")) + switch (attr) { + case LSM_ATTR_CURRENT: sid = __tsec->sid; - else if (!strcmp(name, "prev")) + break; + case LSM_ATTR_PREV: sid = __tsec->osid; - else if (!strcmp(name, "exec")) + break; + case LSM_ATTR_EXEC: sid = __tsec->exec_sid; - else if (!strcmp(name, "fscreate")) + break; + case LSM_ATTR_FSCREATE: sid = __tsec->create_sid; - else if (!strcmp(name, "keycreate")) + break; + case LSM_ATTR_KEYCREATE: sid = __tsec->keycreate_sid; - else if (!strcmp(name, "sockcreate")) + break; + case LSM_ATTR_SOCKCREATE: sid = __tsec->sockcreate_sid; - else { - error = -EINVAL; + break; + default: + error = -EOPNOTSUPP; goto bad; } rcu_read_unlock(); @@ -6334,7 +6341,7 @@ bad: return error; } -static int selinux_setprocattr(const char *name, void *value, size_t size) +static int selinux_lsm_setattr(u64 attr, void *value, size_t size) { struct task_security_struct *tsec; struct cred *new; @@ -6345,23 +6352,31 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) /* * Basic control over ability to set these attributes at all. */ - if (!strcmp(name, "exec")) + switch (attr) { + case LSM_ATTR_EXEC: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETEXEC, NULL); - else if (!strcmp(name, "fscreate")) + break; + case LSM_ATTR_FSCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETFSCREATE, NULL); - else if (!strcmp(name, "keycreate")) + break; + case LSM_ATTR_KEYCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETKEYCREATE, NULL); - else if (!strcmp(name, "sockcreate")) + break; + case LSM_ATTR_SOCKCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, NULL); - else if (!strcmp(name, "current")) + break; + case LSM_ATTR_CURRENT: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETCURRENT, NULL); - else - error = -EINVAL; + break; + default: + error = -EOPNOTSUPP; + break; + } if (error) return error; @@ -6373,13 +6388,14 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) } error = security_context_to_sid(value, size, &sid, GFP_KERNEL); - if (error == -EINVAL && !strcmp(name, "fscreate")) { + if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; - /* We strip a nul only if it is at the end, otherwise the - * context contains a nul and we should audit that */ + /* We strip a nul only if it is at the end, + * otherwise the context contains a nul and + * we should audit that */ if (str[size - 1] == '\0') audit_size = size - 1; else @@ -6390,7 +6406,8 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) if (!ab) return error; audit_log_format(ab, "op=fscreate invalid_context="); - audit_log_n_untrustedstring(ab, value, audit_size); + audit_log_n_untrustedstring(ab, value, + audit_size); audit_log_end(ab); return error; @@ -6413,11 +6430,11 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ tsec = selinux_cred(new); - if (!strcmp(name, "exec")) { + if (attr == LSM_ATTR_EXEC) { tsec->exec_sid = sid; - } else if (!strcmp(name, "fscreate")) { + } else if (attr == LSM_ATTR_FSCREATE) { tsec->create_sid = sid; - } else if (!strcmp(name, "keycreate")) { + } else if (attr == LSM_ATTR_KEYCREATE) { if (sid) { error = avc_has_perm(mysid, sid, SECCLASS_KEY, KEY__CREATE, NULL); @@ -6425,9 +6442,9 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) goto abort_change; } tsec->keycreate_sid = sid; - } else if (!strcmp(name, "sockcreate")) { + } else if (attr == LSM_ATTR_SOCKCREATE) { tsec->sockcreate_sid = sid; - } else if (!strcmp(name, "current")) { + } else if (attr == LSM_ATTR_CURRENT) { error = -EINVAL; if (sid == 0) goto abort_change; @@ -6469,6 +6486,67 @@ abort_change: return error; } +static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t *size, u32 flags) +{ + char *value; + size_t total_len; + int len; + int rc = 0; + + len = selinux_lsm_getattr(attr, current, &value); + if (len < 0) + return len; + + total_len = ALIGN(struct_size(ctx, ctx, len), 8); + + if (total_len > *size) + rc = -E2BIG; + else if (ctx) + rc = lsm_fill_user_ctx(ctx, value, len, LSM_ID_SELINUX, 0); + + kfree(value); + *size = total_len; + if (rc < 0) + return rc; + return 1; +} + +static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, u32 flags) +{ + int rc; + + rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len); + if (rc > 0) + return 0; + return rc; +} + +static int selinux_getprocattr(struct task_struct *p, + const char *name, char **value) +{ + unsigned int attr = lsm_name_to_attr(name); + int rc; + + if (attr) { + rc = selinux_lsm_getattr(attr, p, value); + if (rc != -EOPNOTSUPP) + return rc; + } + + return -EINVAL; +} + +static int selinux_setprocattr(const char *name, void *value, size_t size) +{ + int attr = lsm_name_to_attr(name); + + if (attr) + return selinux_lsm_setattr(attr, value, size); + return -EINVAL; +} + static int selinux_ismaclabel(const char *name) { return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0); @@ -7097,6 +7175,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate), + LSM_HOOK_INIT(getselfattr, selinux_getselfattr), + LSM_HOOK_INIT(setselfattr, selinux_setselfattr), LSM_HOOK_INIT(getprocattr, selinux_getprocattr), LSM_HOOK_INIT(setprocattr, selinux_setprocattr), From d3d929a8b0cd6deb7d70d1d8d805bccee3fbf11f Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:56 -0700 Subject: [PATCH 0054/1562] LSM: selftests for Linux Security Module syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add selftests for the three system calls supporting the LSM infrastructure. This set of tests is limited by the differences in access policy enforced by the existing security modules. Signed-off-by: Casey Schaufler Reviewed-by: Mickaël Salaün Tested-by: Mickaël Salaün Signed-off-by: Paul Moore --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/lsm/.gitignore | 1 + tools/testing/selftests/lsm/Makefile | 17 ++ tools/testing/selftests/lsm/common.c | 89 ++++++ tools/testing/selftests/lsm/common.h | 33 +++ tools/testing/selftests/lsm/config | 3 + .../selftests/lsm/lsm_get_self_attr_test.c | 275 ++++++++++++++++++ .../selftests/lsm/lsm_list_modules_test.c | 140 +++++++++ .../selftests/lsm/lsm_set_self_attr_test.c | 74 +++++ 10 files changed, 634 insertions(+) create mode 100644 tools/testing/selftests/lsm/.gitignore create mode 100644 tools/testing/selftests/lsm/Makefile create mode 100644 tools/testing/selftests/lsm/common.c create mode 100644 tools/testing/selftests/lsm/common.h create mode 100644 tools/testing/selftests/lsm/config create mode 100644 tools/testing/selftests/lsm/lsm_get_self_attr_test.c create mode 100644 tools/testing/selftests/lsm/lsm_list_modules_test.c create mode 100644 tools/testing/selftests/lsm/lsm_set_self_attr_test.c diff --git a/MAINTAINERS b/MAINTAINERS index f1d41fd9159a..2482b40fd786 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19513,6 +19513,7 @@ W: http://kernsec.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git F: include/uapi/linux/lsm.h F: security/ +F: tools/testing/selftests/lsm/ X: security/selinux/ SELINUX SECURITY MODULE diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 3b2061d1c1a5..1107be84ea95 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -43,6 +43,7 @@ TARGETS += landlock TARGETS += lib TARGETS += livepatch TARGETS += lkdtm +TARGETS += lsm TARGETS += membarrier TARGETS += memfd TARGETS += memory-hotplug diff --git a/tools/testing/selftests/lsm/.gitignore b/tools/testing/selftests/lsm/.gitignore new file mode 100644 index 000000000000..bd68f6c3fd07 --- /dev/null +++ b/tools/testing/selftests/lsm/.gitignore @@ -0,0 +1 @@ +/*_test diff --git a/tools/testing/selftests/lsm/Makefile b/tools/testing/selftests/lsm/Makefile new file mode 100644 index 000000000000..3f80c0bc093d --- /dev/null +++ b/tools/testing/selftests/lsm/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# First run: make -C ../../../.. headers_install + +CFLAGS += -Wall -O2 $(KHDR_INCLUDES) +LOCAL_HDRS += common.h + +TEST_GEN_PROGS := lsm_get_self_attr_test lsm_list_modules_test \ + lsm_set_self_attr_test + +include ../lib.mk + +$(OUTPUT)/lsm_get_self_attr_test: lsm_get_self_attr_test.c common.c +$(OUTPUT)/lsm_set_self_attr_test: lsm_set_self_attr_test.c common.c +$(OUTPUT)/lsm_list_modules_test: lsm_list_modules_test.c common.c + +EXTRA_CLEAN = $(OUTPUT)/common.o diff --git a/tools/testing/selftests/lsm/common.c b/tools/testing/selftests/lsm/common.c new file mode 100644 index 000000000000..9ad258912646 --- /dev/null +++ b/tools/testing/selftests/lsm/common.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Security Module infrastructure tests + * + * Copyright © 2023 Casey Schaufler + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define PROCATTR "/proc/self/attr/" + +int read_proc_attr(const char *attr, char *value, size_t size) +{ + int fd; + int len; + char *path; + + len = strlen(PROCATTR) + strlen(attr) + 1; + path = calloc(len, 1); + if (path == NULL) + return -1; + sprintf(path, "%s%s", PROCATTR, attr); + + fd = open(path, O_RDONLY); + free(path); + + if (fd < 0) + return -1; + len = read(fd, value, size); + + close(fd); + + /* Ensure value is terminated */ + if (len <= 0 || len == size) + return -1; + value[len] = '\0'; + + path = strchr(value, '\n'); + if (path) + *path = '\0'; + + return 0; +} + +int read_sysfs_lsms(char *lsms, size_t size) +{ + FILE *fp; + size_t red; + + fp = fopen("/sys/kernel/security/lsm", "r"); + if (fp == NULL) + return -1; + red = fread(lsms, 1, size, fp); + fclose(fp); + + if (red <= 0 || red == size) + return -1; + lsms[red] = '\0'; + return 0; +} + +int attr_lsm_count(void) +{ + char *names = calloc(sysconf(_SC_PAGESIZE), 1); + int count = 0; + + if (!names) + return 0; + + if (read_sysfs_lsms(names, sysconf(_SC_PAGESIZE))) + return 0; + + if (strstr(names, "selinux")) + count++; + if (strstr(names, "smack")) + count++; + if (strstr(names, "apparmor")) + count++; + + return count; +} diff --git a/tools/testing/selftests/lsm/common.h b/tools/testing/selftests/lsm/common.h new file mode 100644 index 000000000000..d404329e5eeb --- /dev/null +++ b/tools/testing/selftests/lsm/common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linux Security Module infrastructure tests + * + * Copyright © 2023 Casey Schaufler + */ + +#ifndef lsm_get_self_attr +static inline int lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t *size, __u32 flags) +{ + return syscall(__NR_lsm_get_self_attr, attr, ctx, size, flags); +} +#endif + +#ifndef lsm_set_self_attr +static inline int lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, __u32 flags) +{ + return syscall(__NR_lsm_set_self_attr, attr, ctx, size, flags); +} +#endif + +#ifndef lsm_list_modules +static inline int lsm_list_modules(__u64 *ids, size_t *size, __u32 flags) +{ + return syscall(__NR_lsm_list_modules, ids, size, flags); +} +#endif + +extern int read_proc_attr(const char *attr, char *value, size_t size); +extern int read_sysfs_lsms(char *lsms, size_t size); +int attr_lsm_count(void); diff --git a/tools/testing/selftests/lsm/config b/tools/testing/selftests/lsm/config new file mode 100644 index 000000000000..1c0c4c020f9c --- /dev/null +++ b/tools/testing/selftests/lsm/config @@ -0,0 +1,3 @@ +CONFIG_SYSFS=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y diff --git a/tools/testing/selftests/lsm/lsm_get_self_attr_test.c b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c new file mode 100644 index 000000000000..e0e313d9047a --- /dev/null +++ b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Security Module infrastructure tests + * Tests for the lsm_get_self_attr system call + * + * Copyright © 2022 Casey Schaufler + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" +#include "common.h" + +static struct lsm_ctx *next_ctx(struct lsm_ctx *ctxp) +{ + void *vp; + + vp = (void *)ctxp + sizeof(*ctxp) + ctxp->ctx_len; + return (struct lsm_ctx *)vp; +} + +TEST(size_null_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + + ASSERT_NE(NULL, ctx); + errno = 0; + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, NULL, 0)); + ASSERT_EQ(EINVAL, errno); + + free(ctx); +} + +TEST(ctx_null_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + size_t size = page_size; + int rc; + + rc = lsm_get_self_attr(LSM_ATTR_CURRENT, NULL, &size, 0); + + if (attr_lsm_count()) { + ASSERT_NE(-1, rc); + ASSERT_NE(1, size); + } else { + ASSERT_EQ(-1, rc); + } +} + +TEST(size_too_small_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + size_t size = 1; + + ASSERT_NE(NULL, ctx); + errno = 0; + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, 0)); + if (attr_lsm_count()) { + ASSERT_EQ(E2BIG, errno); + } else { + ASSERT_EQ(EOPNOTSUPP, errno); + } + ASSERT_NE(1, size); + + free(ctx); +} + +TEST(flags_zero_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + __u64 *syscall_lsms = calloc(page_size, 1); + size_t size; + int lsmcount; + int i; + + ASSERT_NE(NULL, ctx); + errno = 0; + size = page_size; + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, + LSM_FLAG_SINGLE)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(page_size, size); + + lsmcount = syscall(__NR_lsm_list_modules, syscall_lsms, &size, 0); + ASSERT_LE(1, lsmcount); + ASSERT_NE(NULL, syscall_lsms); + + for (i = 0; i < lsmcount; i++) { + errno = 0; + size = page_size; + ctx->id = syscall_lsms[i]; + + if (syscall_lsms[i] == LSM_ID_SELINUX || + syscall_lsms[i] == LSM_ID_SMACK || + syscall_lsms[i] == LSM_ID_APPARMOR) { + ASSERT_EQ(1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, + &size, LSM_FLAG_SINGLE)); + } else { + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, + &size, + LSM_FLAG_SINGLE)); + } + } + + free(ctx); +} + +TEST(flags_overset_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + size_t size; + + ASSERT_NE(NULL, ctx); + + errno = 0; + size = page_size; + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT | LSM_ATTR_PREV, ctx, + &size, 0)); + ASSERT_EQ(EOPNOTSUPP, errno); + + errno = 0; + size = page_size; + ASSERT_EQ(-1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, + LSM_FLAG_SINGLE | + (LSM_FLAG_SINGLE << 1))); + ASSERT_EQ(EINVAL, errno); + + free(ctx); +} + +TEST(basic_lsm_get_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + size_t size = page_size; + struct lsm_ctx *ctx = calloc(page_size, 1); + struct lsm_ctx *tctx = NULL; + __u64 *syscall_lsms = calloc(page_size, 1); + char *attr = calloc(page_size, 1); + int cnt_current = 0; + int cnt_exec = 0; + int cnt_fscreate = 0; + int cnt_keycreate = 0; + int cnt_prev = 0; + int cnt_sockcreate = 0; + int lsmcount; + int count; + int i; + + ASSERT_NE(NULL, ctx); + ASSERT_NE(NULL, syscall_lsms); + + lsmcount = syscall(__NR_lsm_list_modules, syscall_lsms, &size, 0); + ASSERT_LE(1, lsmcount); + + for (i = 0; i < lsmcount; i++) { + switch (syscall_lsms[i]) { + case LSM_ID_SELINUX: + cnt_current++; + cnt_exec++; + cnt_fscreate++; + cnt_keycreate++; + cnt_prev++; + cnt_sockcreate++; + break; + case LSM_ID_SMACK: + cnt_current++; + break; + case LSM_ID_APPARMOR: + cnt_current++; + cnt_exec++; + cnt_prev++; + break; + default: + break; + } + } + + if (cnt_current) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, 0); + ASSERT_EQ(cnt_current, count); + tctx = ctx; + ASSERT_EQ(0, read_proc_attr("current", attr, page_size)); + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + if (cnt_exec) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_EXEC, ctx, &size, 0); + ASSERT_GE(cnt_exec, count); + if (count > 0) { + tctx = ctx; + if (read_proc_attr("exec", attr, page_size) == 0) + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + } + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + if (cnt_fscreate) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_FSCREATE, ctx, &size, 0); + ASSERT_GE(cnt_fscreate, count); + if (count > 0) { + tctx = ctx; + if (read_proc_attr("fscreate", attr, page_size) == 0) + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + } + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + if (cnt_keycreate) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_KEYCREATE, ctx, &size, 0); + ASSERT_GE(cnt_keycreate, count); + if (count > 0) { + tctx = ctx; + if (read_proc_attr("keycreate", attr, page_size) == 0) + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + } + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + if (cnt_prev) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_PREV, ctx, &size, 0); + ASSERT_GE(cnt_prev, count); + if (count > 0) { + tctx = ctx; + ASSERT_EQ(0, read_proc_attr("prev", attr, page_size)); + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + } + if (cnt_sockcreate) { + size = page_size; + count = lsm_get_self_attr(LSM_ATTR_SOCKCREATE, ctx, &size, 0); + ASSERT_GE(cnt_sockcreate, count); + if (count > 0) { + tctx = ctx; + if (read_proc_attr("sockcreate", attr, page_size) == 0) + ASSERT_EQ(0, strcmp((char *)tctx->ctx, attr)); + } + for (i = 1; i < count; i++) { + tctx = next_ctx(tctx); + ASSERT_NE(0, strcmp((char *)tctx->ctx, attr)); + } + } + + free(ctx); + free(attr); + free(syscall_lsms); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c new file mode 100644 index 000000000000..445c02f09c74 --- /dev/null +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Security Module infrastructure tests + * Tests for the lsm_list_modules system call + * + * Copyright © 2022 Casey Schaufler + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" +#include "common.h" + +TEST(size_null_lsm_list_modules) +{ + const long page_size = sysconf(_SC_PAGESIZE); + __u64 *syscall_lsms = calloc(page_size, 1); + + ASSERT_NE(NULL, syscall_lsms); + errno = 0; + ASSERT_EQ(-1, lsm_list_modules(syscall_lsms, NULL, 0)); + ASSERT_EQ(EFAULT, errno); + + free(syscall_lsms); +} + +TEST(ids_null_lsm_list_modules) +{ + const long page_size = sysconf(_SC_PAGESIZE); + size_t size = page_size; + + errno = 0; + ASSERT_EQ(-1, lsm_list_modules(NULL, &size, 0)); + ASSERT_EQ(EFAULT, errno); + ASSERT_NE(1, size); +} + +TEST(size_too_small_lsm_list_modules) +{ + const long page_size = sysconf(_SC_PAGESIZE); + __u64 *syscall_lsms = calloc(page_size, 1); + size_t size = 1; + + ASSERT_NE(NULL, syscall_lsms); + errno = 0; + ASSERT_EQ(-1, lsm_list_modules(syscall_lsms, &size, 0)); + ASSERT_EQ(E2BIG, errno); + ASSERT_NE(1, size); + + free(syscall_lsms); +} + +TEST(flags_set_lsm_list_modules) +{ + const long page_size = sysconf(_SC_PAGESIZE); + __u64 *syscall_lsms = calloc(page_size, 1); + size_t size = page_size; + + ASSERT_NE(NULL, syscall_lsms); + errno = 0; + ASSERT_EQ(-1, lsm_list_modules(syscall_lsms, &size, 7)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(page_size, size); + + free(syscall_lsms); +} + +TEST(correct_lsm_list_modules) +{ + const long page_size = sysconf(_SC_PAGESIZE); + size_t size = page_size; + __u64 *syscall_lsms = calloc(page_size, 1); + char *sysfs_lsms = calloc(page_size, 1); + char *name; + char *cp; + int count; + int i; + + ASSERT_NE(NULL, sysfs_lsms); + ASSERT_NE(NULL, syscall_lsms); + ASSERT_EQ(0, read_sysfs_lsms(sysfs_lsms, page_size)); + + count = lsm_list_modules(syscall_lsms, &size, 0); + ASSERT_LE(1, count); + cp = sysfs_lsms; + for (i = 0; i < count; i++) { + switch (syscall_lsms[i]) { + case LSM_ID_CAPABILITY: + name = "capability"; + break; + case LSM_ID_SELINUX: + name = "selinux"; + break; + case LSM_ID_SMACK: + name = "smack"; + break; + case LSM_ID_TOMOYO: + name = "tomoyo"; + break; + case LSM_ID_IMA: + name = "ima"; + break; + case LSM_ID_APPARMOR: + name = "apparmor"; + break; + case LSM_ID_YAMA: + name = "yama"; + break; + case LSM_ID_LOADPIN: + name = "loadpin"; + break; + case LSM_ID_SAFESETID: + name = "safesetid"; + break; + case LSM_ID_LOCKDOWN: + name = "lockdown"; + break; + case LSM_ID_BPF: + name = "bpf"; + break; + case LSM_ID_LANDLOCK: + name = "landlock"; + break; + default: + name = "INVALID"; + break; + } + ASSERT_EQ(0, strncmp(cp, name, strlen(name))); + cp += strlen(name) + 1; + } + + free(sysfs_lsms); + free(syscall_lsms); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c new file mode 100644 index 000000000000..e9712c6cf596 --- /dev/null +++ b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Security Module infrastructure tests + * Tests for the lsm_set_self_attr system call + * + * Copyright © 2022 Casey Schaufler + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" +#include "common.h" + +TEST(ctx_null_lsm_set_self_attr) +{ + ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT, NULL, + sizeof(struct lsm_ctx), 0)); +} + +TEST(size_too_small_lsm_set_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + size_t size = page_size; + + ASSERT_NE(NULL, ctx); + if (attr_lsm_count()) { + ASSERT_LE(1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, + 0)); + } + ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT, ctx, 1, 0)); + + free(ctx); +} + +TEST(flags_zero_lsm_set_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + struct lsm_ctx *ctx = calloc(page_size, 1); + size_t size = page_size; + + ASSERT_NE(NULL, ctx); + if (attr_lsm_count()) { + ASSERT_LE(1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, + 0)); + } + ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT, ctx, size, 1)); + + free(ctx); +} + +TEST(flags_overset_lsm_set_self_attr) +{ + const long page_size = sysconf(_SC_PAGESIZE); + char *ctx = calloc(page_size, 1); + size_t size = page_size; + struct lsm_ctx *tctx = (struct lsm_ctx *)ctx; + + ASSERT_NE(NULL, ctx); + if (attr_lsm_count()) { + ASSERT_LE(1, lsm_get_self_attr(LSM_ATTR_CURRENT, tctx, &size, + 0)); + } + ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT | LSM_ATTR_PREV, tctx, + size, 0)); + + free(ctx); +} + +TEST_HARNESS_MAIN From edd71f8e266c7ba15eedfec338864e53ddde1c25 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 18 Oct 2023 17:41:41 -0400 Subject: [PATCH 0055/1562] lsm: drop LSM_ID_IMA When IMA becomes a proper LSM we will reintroduce an appropriate LSM ID, but drop it from the userspace API for now in an effort to put an end to debates around the naming of the LSM ID macro. Reviewed-by: Roberto Sassu Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index eeda59a77c02..f0386880a78e 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -54,14 +54,13 @@ struct lsm_ctx { #define LSM_ID_SELINUX 101 #define LSM_ID_SMACK 102 #define LSM_ID_TOMOYO 103 -#define LSM_ID_IMA 104 -#define LSM_ID_APPARMOR 105 -#define LSM_ID_YAMA 106 -#define LSM_ID_LOADPIN 107 -#define LSM_ID_SAFESETID 108 -#define LSM_ID_LOCKDOWN 109 -#define LSM_ID_BPF 110 -#define LSM_ID_LANDLOCK 111 +#define LSM_ID_APPARMOR 104 +#define LSM_ID_YAMA 105 +#define LSM_ID_LOADPIN 106 +#define LSM_ID_SAFESETID 107 +#define LSM_ID_LOCKDOWN 108 +#define LSM_ID_BPF 109 +#define LSM_ID_LANDLOCK 110 /* * LSM_ATTR_XXX definitions identify different LSM attributes From aab30be071f7048c9c23c61e6eddd55bba328398 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 26 Oct 2023 11:02:59 +0200 Subject: [PATCH 0056/1562] lsm: don't yet account for IMA in LSM_CONFIG_COUNT calculation Since IMA is not yet an LSM, don't account for it in the LSM_CONFIG_COUNT calculation, used to limit how many LSMs can invoke security_add_hooks(). Signed-off-by: Roberto Sassu [PM: subject line tweak] Signed-off-by: Paul Moore --- security/security.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/security.c b/security/security.c index 988483fcf153..7281aa90ca20 100644 --- a/security/security.c +++ b/security/security.c @@ -44,7 +44,6 @@ (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \ - (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \ From dc46db78b9747f8114030982ee5c2faf2faaeddd Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 24 Oct 2023 12:38:40 -0400 Subject: [PATCH 0057/1562] lsm: cleanup the size counters in security_getselfattr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero out all of the size counters in the -E2BIG case (buffer too small) to help make the current code a bit more robust in the face of future code changes. Acked-by: Casey Schaufler Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- security/security.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/security.c b/security/security.c index 7281aa90ca20..74ff9a48bd66 100644 --- a/security/security.c +++ b/security/security.c @@ -3950,8 +3950,9 @@ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, continue; } if (rc == -E2BIG) { - toobig = true; + rc = 0; left = 0; + toobig = true; } else if (rc < 0) return rc; else From fdcf699b60712ecd6e41d9fc09137279257a4bf8 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 24 Oct 2023 12:42:38 -0400 Subject: [PATCH 0058/1562] lsm: correct error codes in security_getselfattr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should return -EINVAL if the user specifies LSM_FLAG_SINGLE without supplying a valid lsm_ctx struct buffer. Acked-by: Casey Schaufler Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- security/security.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/security.c b/security/security.c index 74ff9a48bd66..78e7ffcc9f6c 100644 --- a/security/security.c +++ b/security/security.c @@ -3922,9 +3922,9 @@ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, /* * Only flag supported is LSM_FLAG_SINGLE */ - if (flags != LSM_FLAG_SINGLE) + if (flags != LSM_FLAG_SINGLE || !uctx) return -EINVAL; - if (uctx && copy_from_user(&lctx, uctx, sizeof(lctx))) + if (copy_from_user(&lctx, uctx, sizeof(lctx))) return -EFAULT; /* * If the LSM ID isn't specified it is an error. From d7cf3412a9f6c547e5ee443fa7644e08898aa3e2 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 24 Oct 2023 14:44:00 -0400 Subject: [PATCH 0059/1562] lsm: consolidate buffer size handling into lsm_fill_user_ctx() While we have a lsm_fill_user_ctx() helper function designed to make life easier for LSMs which return lsm_ctx structs to userspace, we didn't include all of the buffer length safety checks and buffer padding adjustments in the helper. This led to code duplication across the different LSMs and the possibility for mistakes across the different LSM subsystems. In order to reduce code duplication and decrease the chances of silly mistakes, we're consolidating all of this code into the lsm_fill_user_ctx() helper. The buffer padding is also modified from a fixed 8-byte alignment to an alignment that matches the word length of the machine (BITS_PER_LONG / 8). Signed-off-by: Paul Moore --- include/linux/security.h | 9 ++++--- security/apparmor/lsm.c | 15 +++-------- security/security.c | 55 +++++++++++++++++++++----------------- security/selinux/hooks.c | 42 +++++++++++++++-------------- security/smack/smack_lsm.c | 23 +++++----------- 5 files changed, 67 insertions(+), 77 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 334f75aa7289..750130a7b9dd 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -492,8 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); -int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, - size_t context_size, u64 id, u64 flags); +int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, + void *val, size_t val_len, u64 id, u64 flags); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -1424,8 +1424,9 @@ static inline int security_locked_down(enum lockdown_reason what) { return 0; } -static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, - size_t context_size, u64 id, u64 flags) +static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, + size_t *uctx_len, void *val, size_t val_len, + u64 id, u64 flags) { return -EOPNOTSUPP; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8165f80c10ff..332198e0a017 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -782,7 +782,6 @@ static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; - size_t total_len = 0; char *value; switch (attr) { @@ -804,22 +803,14 @@ static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, if (label) { error = aa_getprocattr(label, &value, false); - if (error > 0) { - total_len = ALIGN(struct_size(lx, ctx, error), 8); - if (total_len > *size) - error = -E2BIG; - else if (lx) - error = lsm_fill_user_ctx(lx, value, error, - LSM_ID_APPARMOR, 0); - else - error = 1; - } + if (error > 0) + error = lsm_fill_user_ctx(lx, size, value, error, + LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); - *size = total_len; if (error < 0) return error; return 1; diff --git a/security/security.c b/security/security.c index 78e7ffcc9f6c..86f7a1995991 100644 --- a/security/security.c +++ b/security/security.c @@ -772,42 +772,49 @@ static int lsm_superblock_alloc(struct super_block *sb) /** * lsm_fill_user_ctx - Fill a user space lsm_ctx structure - * @ctx: an LSM context to be filled - * @context: the new context value - * @context_size: the size of the new context value + * @uctx: a userspace LSM context to be filled + * @uctx_len: available uctx size (input), used uctx size (output) + * @val: the new LSM context value + * @val_len: the size of the new LSM context value * @id: LSM id * @flags: LSM defined flags * - * Fill all of the fields in a user space lsm_ctx structure. - * Caller is assumed to have verified that @ctx has enough space - * for @context. + * Fill all of the fields in a userspace lsm_ctx structure. * - * Returns 0 on success, -EFAULT on a copyout error, -ENOMEM - * if memory can't be allocated. + * Returns 0 on success, -E2BIG if userspace buffer is not large enough, + * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated. */ -int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, - size_t context_size, u64 id, u64 flags) +int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, + void *val, size_t val_len, + u64 id, u64 flags) { - struct lsm_ctx *lctx; - size_t locallen = struct_size(lctx, ctx, context_size); + struct lsm_ctx *nctx = NULL; + size_t nctx_len; int rc = 0; - lctx = kzalloc(locallen, GFP_KERNEL); - if (lctx == NULL) - return -ENOMEM; + nctx_len = ALIGN(struct_size(nctx, ctx, val_len), BITS_PER_LONG / 8); + if (nctx_len > *uctx_len) { + rc = -E2BIG; + goto out; + } - lctx->id = id; - lctx->flags = flags; - lctx->ctx_len = context_size; - lctx->len = locallen; + nctx = kzalloc(nctx_len, GFP_KERNEL); + if (nctx == NULL) { + rc = -ENOMEM; + goto out; + } + nctx->id = id; + nctx->flags = flags; + nctx->len = nctx_len; + nctx->ctx_len = val_len; + memcpy(nctx->ctx, val, val_len); - memcpy(lctx->ctx, context, context_size); - - if (copy_to_user(ctx, lctx, locallen)) + if (copy_to_user(uctx, nctx, nctx_len)) rc = -EFAULT; - kfree(lctx); - +out: + kfree(nctx); + *uctx_len = nctx_len; return rc; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b6c7930a3ab2..942f2b8c4ebb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6486,30 +6486,32 @@ abort_change: return error; } +/** + * selinux_getselfattr - Get SELinux current task attributes + * @attr: the requested attribute + * @ctx: buffer to receive the result + * @size: buffer size (input), buffer size used (output) + * @flags: unused + * + * Fill the passed user space @ctx with the details of the requested + * attribute. + * + * Returns the number of attributes on success, an error code otherwise. + * There will only ever be one attribute. + */ static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, size_t *size, u32 flags) { - char *value; - size_t total_len; - int len; - int rc = 0; + int rc; + char *val; + int val_len; - len = selinux_lsm_getattr(attr, current, &value); - if (len < 0) - return len; - - total_len = ALIGN(struct_size(ctx, ctx, len), 8); - - if (total_len > *size) - rc = -E2BIG; - else if (ctx) - rc = lsm_fill_user_ctx(ctx, value, len, LSM_ID_SELINUX, 0); - - kfree(value); - *size = total_len; - if (rc < 0) - return rc; - return 1; + val_len = selinux_lsm_getattr(attr, current, &val); + if (val_len < 0) + return val_len; + rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0); + kfree(val); + return (!rc ? 1 : rc); } static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 12160d060cc1..99664c8cf867 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3642,28 +3642,17 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, size_t *size, u32 flags) { - struct smack_known *skp = smk_of_current(); - int total; - int slen; int rc; + struct smack_known *skp; if (attr != LSM_ATTR_CURRENT) return -EOPNOTSUPP; - slen = strlen(skp->smk_known) + 1; - total = ALIGN(slen + sizeof(*ctx), 8); - if (total > *size) - rc = -E2BIG; - else if (ctx) - rc = lsm_fill_user_ctx(ctx, skp->smk_known, slen, LSM_ID_SMACK, - 0); - else - rc = 1; - - *size = total; - if (rc >= 0) - return 1; - return rc; + skp = smk_of_current(); + rc = lsm_fill_user_ctx(ctx, size, + skp->smk_known, strlen(skp->smk_known) + 1, + LSM_ID_SMACK, 0); + return (!rc ? 1 : rc); } /** From 41793202292fd2acf99fdc09eff8323cc27c80eb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 1 Nov 2023 17:39:44 -0400 Subject: [PATCH 0060/1562] lsm: align based on pointer length in lsm_fill_user_ctx() Using the size of a void pointer is much cleaner than BITS_PER_LONG / 8. Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- security/security.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/security.c b/security/security.c index 86f7a1995991..a808fd5eba6d 100644 --- a/security/security.c +++ b/security/security.c @@ -792,7 +792,7 @@ int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, size_t nctx_len; int rc = 0; - nctx_len = ALIGN(struct_size(nctx, ctx, val_len), BITS_PER_LONG / 8); + nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *)); if (nctx_len > *uctx_len) { rc = -E2BIG; goto out; From 9ba8802c8b66fbde2ee32ab4c44cd418f9444486 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 1 Nov 2023 18:42:12 -0400 Subject: [PATCH 0061/1562] lsm: convert security_setselfattr() to use memdup_user() As suggested by the kernel test robot, memdup_user() is a better option than the combo of kmalloc()/copy_from_user(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310270805.2ArE52i5-lkp@intel.com/ Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- security/security.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/security/security.c b/security/security.c index a808fd5eba6d..d7b15ea67c3f 100644 --- a/security/security.c +++ b/security/security.c @@ -4011,14 +4011,9 @@ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, if (size > PAGE_SIZE) return -E2BIG; - lctx = kmalloc(size, GFP_KERNEL); - if (lctx == NULL) - return -ENOMEM; - - if (copy_from_user(lctx, uctx, size)) { - rc = -EFAULT; - goto free_out; - } + lctx = memdup_user(uctx, size); + if (IS_ERR(lctx)) + return PTR_ERR(lctx); if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) || lctx->len < lctx->ctx_len + sizeof(*lctx)) { From b1a867eeb8ab5e097178728b01cc504c6806acca Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 10 Nov 2023 12:09:33 -0500 Subject: [PATCH 0062/1562] lsm: mark the lsm_id variables are marked as static As the kernel test robot helpfully reminded us, all of the lsm_id instances defined inside the various LSMs should be marked as static. The one exception is Landlock which uses its lsm_id variable across multiple source files with an extern declaration in a header file. Reported-by: kernel test robot Suggested-by: Casey Schaufler Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- security/apparmor/lsm.c | 2 +- security/bpf/hooks.c | 2 +- security/commoncap.c | 2 +- security/loadpin/loadpin.c | 2 +- security/safesetid/lsm.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- security/tomoyo/tomoyo.c | 2 +- security/yama/yama_lsm.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 332198e0a017..e490a7000408 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1454,7 +1454,7 @@ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct aa_task_ctx), }; -const struct lsm_id apparmor_lsmid = { +static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 91011e0c361a..57b9ffd53c98 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -16,7 +16,7 @@ static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = { LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; -const struct lsm_id bpf_lsmid = { +static const struct lsm_id bpf_lsmid = { .name = "bpf", .id = LSM_ID_BPF, }; diff --git a/security/commoncap.c b/security/commoncap.c index a64c0c8592bb..162d96b3a676 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1441,7 +1441,7 @@ int cap_mmap_file(struct file *file, unsigned long reqprot, #ifdef CONFIG_SECURITY -const struct lsm_id capability_lsmid = { +static const struct lsm_id capability_lsmid = { .name = "capability", .id = LSM_ID_CAPABILITY, }; diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index d682a851de58..8e93cda130f1 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -209,7 +209,7 @@ static int loadpin_load_data(enum kernel_load_data_id id, bool contents) return loadpin_check(NULL, (enum kernel_read_file_id) id); } -const struct lsm_id loadpin_lsmid = { +static const struct lsm_id loadpin_lsmid = { .name = "loadpin", .id = LSM_ID_LOADPIN, }; diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index f42d5af5ffb0..1ba564f097f5 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -262,7 +262,7 @@ static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old return 0; } -const struct lsm_id safesetid_lsmid = { +static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 942f2b8c4ebb..b340425ccfae 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7031,7 +7031,7 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) } #endif /* CONFIG_IO_URING */ -const struct lsm_id selinux_lsmid = { +static const struct lsm_id selinux_lsmid = { .name = "selinux", .id = LSM_ID_SELINUX, }; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 99664c8cf867..53336d7daa93 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5006,7 +5006,7 @@ struct lsm_blob_sizes smack_blob_sizes __ro_after_init = { .lbs_xattr_count = SMACK_INODE_INIT_XATTRS, }; -const struct lsm_id smack_lsmid = { +static const struct lsm_id smack_lsmid = { .name = "smack", .id = LSM_ID_SMACK, }; diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 722205433105..e10491f155a5 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -543,7 +543,7 @@ static void tomoyo_task_free(struct task_struct *task) } } -const struct lsm_id tomoyo_lsmid = { +static const struct lsm_id tomoyo_lsmid = { .name = "tomoyo", .id = LSM_ID_TOMOYO, }; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 5cdff292fcae..49dc52b454ef 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -422,7 +422,7 @@ static int yama_ptrace_traceme(struct task_struct *parent) return rc; } -const struct lsm_id yama_lsmid = { +static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; From 00eb7bd699ccd60d3e5c0ef2ca89467461c39178 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 10 Nov 2023 12:27:48 -0500 Subject: [PATCH 0063/1562] mailmap: update/replace my old email addresses I was recently reminded by someone who was unable to reach my old email address that I really should update the kernel's .mailmap so that people looking for me in old commits can reach my current email. Signed-off-by: Paul Moore --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 43031441b2d9..2ba581ebb2cf 100644 --- a/.mailmap +++ b/.mailmap @@ -469,6 +469,8 @@ Paul E. McKenney Paul E. McKenney Paul Mackerras Paul Mackerras +Paul Moore +Paul Moore Pavankumar Kondeti Peter A Jonsson Peter Oruba From d131f1f3b459980d38a59adc3598c96cc3a6ad5e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 11 Nov 2023 00:53:14 +0800 Subject: [PATCH 0064/1562] platform/chrome: sensorhub: Implement quickselect for median calculation The cros_ec_sensor_ring_median function currently uses an inefficient sorting algorithm (> O(n)) to find the median of an array. This patch replaces the sorting approach with the quickselect algorithm, which achieves an average time complexity of O(n). The algorithm employs the median-of-three rule to select the pivot, mitigating worst-case scenarios and reducing the expected number of necessary comparisons. This strategy enhances the algorithm's efficiency and ensures a more balanced partitioning. In the worst case, the runtime of quickselect could regress to O(n^2). To address this, alternative algorithms like median-of-medians that can guarantee O(n) even in the worst case. However, due to higher overhead and increased complexity of implementation, quickselect remains a pragmatic choice for our use case. Signed-off-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20231110165314.1559285-1-visitorckw@gmail.com Signed-off-by: Tzung-Bi Shih --- .../platform/chrome/cros_ec_sensorhub_ring.c | 62 ++++++++++++++----- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_sensorhub_ring.c b/drivers/platform/chrome/cros_ec_sensorhub_ring.c index 9e17f7483ca0..1205219515d6 100644 --- a/drivers/platform/chrome/cros_ec_sensorhub_ring.c +++ b/drivers/platform/chrome/cros_ec_sensorhub_ring.c @@ -133,33 +133,61 @@ int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub, return ret; } -static int cros_ec_sensor_ring_median_cmp(const void *pv1, const void *pv2) +static void cros_ec_sensor_ring_median_swap(s64 *a, s64 *b) { - s64 v1 = *(s64 *)pv1; - s64 v2 = *(s64 *)pv2; - - if (v1 > v2) - return 1; - else if (v1 < v2) - return -1; - else - return 0; + s64 tmp = *a; + *a = *b; + *b = tmp; } /* * cros_ec_sensor_ring_median: Gets median of an array of numbers * - * For now it's implemented using an inefficient > O(n) sort then return - * the middle element. A more optimal method would be something like - * quickselect, but given that n = 64 we can probably live with it in the - * name of clarity. + * It's implemented using the quickselect algorithm, which achieves an + * average time complexity of O(n) the middle element. In the worst case, + * the runtime of quickselect could regress to O(n^2). To mitigate this, + * algorithms like median-of-medians exist, which can guarantee O(n) even + * in the worst case. However, these algorithms come with a higher + * overhead and are more complex to implement, making quickselect a + * pragmatic choice for our use case. * - * Warning: the input array gets modified (sorted)! + * Warning: the input array gets modified! */ static s64 cros_ec_sensor_ring_median(s64 *array, size_t length) { - sort(array, length, sizeof(s64), cros_ec_sensor_ring_median_cmp, NULL); - return array[length / 2]; + int lo = 0; + int hi = length - 1; + + while (lo <= hi) { + int mid = lo + (hi - lo) / 2; + int pivot, i; + + if (array[lo] > array[mid]) + cros_ec_sensor_ring_median_swap(&array[lo], &array[mid]); + if (array[lo] > array[hi]) + cros_ec_sensor_ring_median_swap(&array[lo], &array[hi]); + if (array[mid] < array[hi]) + cros_ec_sensor_ring_median_swap(&array[mid], &array[hi]); + + pivot = array[hi]; + i = lo - 1; + + for (int j = lo; j < hi; j++) + if (array[j] < pivot) + cros_ec_sensor_ring_median_swap(&array[++i], &array[j]); + + /* The pivot's index corresponds to i+1. */ + cros_ec_sensor_ring_median_swap(&array[i + 1], &array[hi]); + if (i + 1 == length / 2) + return array[i + 1]; + if (i + 1 > length / 2) + hi = i; + else + lo = i + 2; + } + + /* Should never reach here. */ + return -1; } /* From 24c6a097b5a270e05c6e99a99da66b91be81fd7d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:22 +0000 Subject: [PATCH 0065/1562] slub: Reflow ___slab_alloc() The get_partial() interface used in ___slab_alloc() may return a single object in the "kmem_cache_debug(s)" case, in which we will just return the "freelist" object. Move this handling up to prepare for later changes. And the "pfmemalloc_match()" part is not needed for node partial slab, since we already check this in the get_partial_node(). Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 63d281dfacdb..0b0fdc8c189f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3216,8 +3216,21 @@ new_objects: pc.slab = &slab; pc.orig_size = orig_size; freelist = get_partial(s, node, &pc); - if (freelist) - goto check_new_slab; + if (freelist) { + if (kmem_cache_debug(s)) { + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + + goto retry_load_slab; + } slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, gfpflags, node); @@ -3253,20 +3266,6 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); -check_new_slab: - - if (kmem_cache_debug(s)) { - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the tracking info - * and return the object - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); - - return freelist; - } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that From 43c4c349149c77f27c8e5801755a7b8883a70ebe Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:23 +0000 Subject: [PATCH 0066/1562] slub: Change get_partial() interfaces to return slab We need all get_partial() related interfaces to return a slab, instead of returning the freelist (or object). Use the partial_context.object to return back freelist or object for now. This patch shouldn't have any functional changes. Suggested-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 63 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0b0fdc8c189f..03384cd965c5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -204,9 +204,9 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); /* Structure holding parameters for get_partial() call chain */ struct partial_context { - struct slab **slab; gfp_t flags; unsigned int orig_size; + void *object; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -2269,10 +2269,11 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct partial_context *pc) +static struct slab *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2; + struct slab *slab, *slab2, *partial = NULL; void *object = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2288,27 +2289,28 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { - void *t; - if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) + if (object) { + partial = slab; + pc->object = object; break; + } continue; } - t = acquire_slab(s, n, slab, object == NULL); - if (!t) + object = acquire_slab(s, n, slab, object == NULL); + if (!object) break; - if (!object) { - *pc->slab = slab; + if (!partial) { + partial = slab; + pc->object = object; stat(s, ALLOC_FROM_PARTIAL); - object = t; } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); @@ -2324,20 +2326,21 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } spin_unlock_irqrestore(&n->list_lock, flags); - return object; + return partial; } /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) +static struct slab *get_any_partial(struct kmem_cache *s, + struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - void *object; + struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -2372,8 +2375,8 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, pc); - if (object) { + slab = get_partial_node(s, n, pc); + if (slab) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -2381,7 +2384,7 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) * between allocation and the cpuset * update */ - return object; + return slab; } } } @@ -2393,17 +2396,18 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) +static struct slab *get_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - void *object; + struct slab *slab; int searchnode = node; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), pc); - if (object || node != NUMA_NO_NODE) - return object; + slab = get_partial_node(s, get_node(s, searchnode), pc); + if (slab || node != NUMA_NO_NODE) + return slab; return get_any_partial(s, pc); } @@ -3213,10 +3217,10 @@ new_slab: new_objects: pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - freelist = get_partial(s, node, &pc); - if (freelist) { + slab = get_partial(s, node, &pc); + if (slab) { + freelist = pc.object; if (kmem_cache_debug(s)) { /* * For debug caches here we had to go through @@ -3408,12 +3412,11 @@ static void *__slab_alloc_node(struct kmem_cache *s, void *object; pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - object = get_partial(s, node, &pc); + slab = get_partial(s, node, &pc); - if (object) - return object; + if (slab) + return pc.object; slab = new_slab(s, gfpflags, node); if (unlikely(!slab)) { From 215283a1a4833f441778580359aea768642c56af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 2 Nov 2023 23:02:48 +0100 Subject: [PATCH 0067/1562] mtd: rawnand: brcmnand: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). By changing the function brcmnand_remove() to return void several drivers that use this function as remove callback can be converted to .remove_new(). Signed-off-by: Uwe Kleine-König Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231102220246.3336154-7-u.kleine-koenig@pengutronix.de --- drivers/mtd/nand/raw/brcmnand/bcm63138_nand.c | 2 +- drivers/mtd/nand/raw/brcmnand/bcm6368_nand.c | 2 +- drivers/mtd/nand/raw/brcmnand/bcma_nand.c | 2 +- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 4 +--- drivers/mtd/nand/raw/brcmnand/brcmnand.h | 2 +- drivers/mtd/nand/raw/brcmnand/brcmstb_nand.c | 2 +- drivers/mtd/nand/raw/brcmnand/iproc_nand.c | 2 +- 7 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/mtd/nand/raw/brcmnand/bcm63138_nand.c b/drivers/mtd/nand/raw/brcmnand/bcm63138_nand.c index 9596629000f4..968c5b674b08 100644 --- a/drivers/mtd/nand/raw/brcmnand/bcm63138_nand.c +++ b/drivers/mtd/nand/raw/brcmnand/bcm63138_nand.c @@ -85,7 +85,7 @@ MODULE_DEVICE_TABLE(of, bcm63138_nand_of_match); static struct platform_driver bcm63138_nand_driver = { .probe = bcm63138_nand_probe, - .remove = brcmnand_remove, + .remove_new = brcmnand_remove, .driver = { .name = "bcm63138_nand", .pm = &brcmnand_pm_ops, diff --git a/drivers/mtd/nand/raw/brcmnand/bcm6368_nand.c b/drivers/mtd/nand/raw/brcmnand/bcm6368_nand.c index a06cd87f839a..05b7b653bdf3 100644 --- a/drivers/mtd/nand/raw/brcmnand/bcm6368_nand.c +++ b/drivers/mtd/nand/raw/brcmnand/bcm6368_nand.c @@ -117,7 +117,7 @@ MODULE_DEVICE_TABLE(of, bcm6368_nand_of_match); static struct platform_driver bcm6368_nand_driver = { .probe = bcm6368_nand_probe, - .remove = brcmnand_remove, + .remove_new = brcmnand_remove, .driver = { .name = "bcm6368_nand", .pm = &brcmnand_pm_ops, diff --git a/drivers/mtd/nand/raw/brcmnand/bcma_nand.c b/drivers/mtd/nand/raw/brcmnand/bcma_nand.c index dd27977919fb..4e7e435ba339 100644 --- a/drivers/mtd/nand/raw/brcmnand/bcma_nand.c +++ b/drivers/mtd/nand/raw/brcmnand/bcma_nand.c @@ -119,7 +119,7 @@ static int brcmnand_bcma_nand_probe(struct platform_device *pdev) static struct platform_driver brcmnand_bcma_nand_driver = { .probe = brcmnand_bcma_nand_probe, - .remove = brcmnand_remove, + .remove_new = brcmnand_remove, .driver = { .name = "bcma_brcmnand", .pm = &brcmnand_pm_ops, diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 440bef477930..30fc399f346e 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -3299,7 +3299,7 @@ err: } EXPORT_SYMBOL_GPL(brcmnand_probe); -int brcmnand_remove(struct platform_device *pdev) +void brcmnand_remove(struct platform_device *pdev) { struct brcmnand_controller *ctrl = dev_get_drvdata(&pdev->dev); struct brcmnand_host *host; @@ -3316,8 +3316,6 @@ int brcmnand_remove(struct platform_device *pdev) clk_disable_unprepare(ctrl->clk); dev_set_drvdata(&pdev->dev, NULL); - - return 0; } EXPORT_SYMBOL_GPL(brcmnand_remove); diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h index f1f93d85f50d..928114c0be5e 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h @@ -88,7 +88,7 @@ static inline void brcmnand_soc_write(struct brcmnand_soc *soc, u32 val, } int brcmnand_probe(struct platform_device *pdev, struct brcmnand_soc *soc); -int brcmnand_remove(struct platform_device *pdev); +void brcmnand_remove(struct platform_device *pdev); extern const struct dev_pm_ops brcmnand_pm_ops; diff --git a/drivers/mtd/nand/raw/brcmnand/brcmstb_nand.c b/drivers/mtd/nand/raw/brcmnand/brcmstb_nand.c index 950923d977b7..558f083b92e9 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmstb_nand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmstb_nand.c @@ -23,7 +23,7 @@ static int brcmstb_nand_probe(struct platform_device *pdev) static struct platform_driver brcmstb_nand_driver = { .probe = brcmstb_nand_probe, - .remove = brcmnand_remove, + .remove_new = brcmnand_remove, .driver = { .name = "brcmstb_nand", .pm = &brcmnand_pm_ops, diff --git a/drivers/mtd/nand/raw/brcmnand/iproc_nand.c b/drivers/mtd/nand/raw/brcmnand/iproc_nand.c index 089c70fc6edf..bf46c8b85898 100644 --- a/drivers/mtd/nand/raw/brcmnand/iproc_nand.c +++ b/drivers/mtd/nand/raw/brcmnand/iproc_nand.c @@ -134,7 +134,7 @@ MODULE_DEVICE_TABLE(of, iproc_nand_of_match); static struct platform_driver iproc_nand_driver = { .probe = iproc_nand_probe, - .remove = brcmnand_remove, + .remove_new = brcmnand_remove, .driver = { .name = "iproc_nand", .pm = &brcmnand_pm_ops, From 160c0b7f9a166d62a3aa32853883666eda896c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 2 Nov 2023 23:02:49 +0100 Subject: [PATCH 0068/1562] mtd: rawnand: txx9ndfmc: Switch to module_platform_driver() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While module_platform_driver_probe() offers the possibility to discard .probe() and .remove() in some situations, the handling is difficult and in today's systems the few hundred bytes that can be saved have little importance. So convert the driver to be a normal driver that can be bound and unbound at runtime as most other drivers, too. Signed-off-by: Uwe Kleine-König Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231102220246.3336154-8-u.kleine-koenig@pengutronix.de --- drivers/mtd/nand/raw/txx9ndfmc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c index eddcc0728a67..9d6c62f73bb7 100644 --- a/drivers/mtd/nand/raw/txx9ndfmc.c +++ b/drivers/mtd/nand/raw/txx9ndfmc.c @@ -276,7 +276,7 @@ static const struct nand_controller_ops txx9ndfmc_controller_ops = { .attach_chip = txx9ndfmc_attach_chip, }; -static int __init txx9ndfmc_probe(struct platform_device *dev) +static int txx9ndfmc_probe(struct platform_device *dev) { struct txx9ndfmc_platform_data *plat = dev_get_platdata(&dev->dev); int hold, spw; @@ -369,7 +369,7 @@ static int __init txx9ndfmc_probe(struct platform_device *dev) return 0; } -static int __exit txx9ndfmc_remove(struct platform_device *dev) +static int txx9ndfmc_remove(struct platform_device *dev) { struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); int ret, i; @@ -407,14 +407,14 @@ static int txx9ndfmc_resume(struct platform_device *dev) #endif static struct platform_driver txx9ndfmc_driver = { - .remove = __exit_p(txx9ndfmc_remove), + .probe = txx9ndfmc_probe, + .remove = txx9ndfmc_remove, .resume = txx9ndfmc_resume, .driver = { .name = "txx9ndfmc", }, }; - -module_platform_driver_probe(txx9ndfmc_driver, txx9ndfmc_probe); +module_platform_driver(txx9ndfmc_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TXx9 SoC NAND flash controller driver"); From 354dbdcbdd79ec417f6c35f55b1fe6310e7dd431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 2 Nov 2023 23:02:50 +0100 Subject: [PATCH 0069/1562] mtd: rawnand: txx9ndfmc: Drop if block with always false condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit txx9ndfmc_remove() is only called after txx9ndfmc_probe() completed successfully. In this case platform_set_drvdata() was called with a non-NULL argument and so platform_get_drvdata() won't return NULL. Simplify by removing the if block with the always false condition. Signed-off-by: Uwe Kleine-König Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231102220246.3336154-9-u.kleine-koenig@pengutronix.de --- drivers/mtd/nand/raw/txx9ndfmc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c index 9d6c62f73bb7..fdcdfbea0cbd 100644 --- a/drivers/mtd/nand/raw/txx9ndfmc.c +++ b/drivers/mtd/nand/raw/txx9ndfmc.c @@ -374,8 +374,6 @@ static int txx9ndfmc_remove(struct platform_device *dev) struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); int ret, i; - if (!drvdata) - return 0; for (i = 0; i < MAX_TXX9NDFMC_DEV; i++) { struct mtd_info *mtd = drvdata->mtds[i]; struct nand_chip *chip; From f52221d55d8dae7c4154116453d5fb6c544bae46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 2 Nov 2023 23:02:51 +0100 Subject: [PATCH 0070/1562] mtd: rawnand: txx9ndfmc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231102220246.3336154-10-u.kleine-koenig@pengutronix.de --- drivers/mtd/nand/raw/txx9ndfmc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c index fdcdfbea0cbd..37f79c019a72 100644 --- a/drivers/mtd/nand/raw/txx9ndfmc.c +++ b/drivers/mtd/nand/raw/txx9ndfmc.c @@ -369,7 +369,7 @@ static int txx9ndfmc_probe(struct platform_device *dev) return 0; } -static int txx9ndfmc_remove(struct platform_device *dev) +static void txx9ndfmc_remove(struct platform_device *dev) { struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); int ret, i; @@ -390,7 +390,6 @@ static int txx9ndfmc_remove(struct platform_device *dev) kfree(txx9_priv->mtdname); kfree(txx9_priv); } - return 0; } #ifdef CONFIG_PM @@ -406,7 +405,7 @@ static int txx9ndfmc_resume(struct platform_device *dev) static struct platform_driver txx9ndfmc_driver = { .probe = txx9ndfmc_probe, - .remove = txx9ndfmc_remove, + .remove_new = txx9ndfmc_remove, .resume = txx9ndfmc_resume, .driver = { .name = "txx9ndfmc", From e596ff4a79300423b2ae8fbd1a78a3e311f99dfe Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Mon, 13 Nov 2023 10:22:22 -0600 Subject: [PATCH 0071/1562] mailmap: add entries for Serge Hallyn's dead accounts Signed-off-by: Serge Hallyn Signed-off-by: Paul Moore --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 2ba581ebb2cf..35086f4a7961 100644 --- a/.mailmap +++ b/.mailmap @@ -535,6 +535,8 @@ Sebastian Reichel Sebastian Reichel Sedat Dilek Senthilkumar N L +Serge Hallyn +Serge Hallyn Seth Forshee Shannon Nelson Shannon Nelson From 89b212d4afef64331b08c44e661a703d2be0970b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 13 Nov 2023 21:16:51 +0100 Subject: [PATCH 0072/1562] selftests/nolibc: don't hang on config input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the kernel code has changed the build may ask for configuration input and hang. Prevent this and instead use the default settings. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index a0fc07253baf..6c7040a75d81 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -210,10 +210,10 @@ defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare kernel: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null kernel-standalone: initramfs - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel run: kernel initramfs.cpio From 075ede8d20f8f201900756af55c58994c6660659 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 1 Nov 2023 14:58:48 +0000 Subject: [PATCH 0073/1562] mtd: spi-nor: use kernel sized types instead of c99 types The kernel offers and prefers the kernel sized types instead of the c99 types when not in the uapi directory, use them. Link: https://lore.kernel.org/r/20231101145853.524045-2-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/atmel.c | 16 +++++++--------- drivers/mtd/spi-nor/core.c | 5 ++--- drivers/mtd/spi-nor/core.h | 6 +++--- drivers/mtd/spi-nor/sst.c | 6 +++--- drivers/mtd/spi-nor/swp.c | 25 ++++++++++++------------- 5 files changed, 27 insertions(+), 31 deletions(-) diff --git a/drivers/mtd/spi-nor/atmel.c b/drivers/mtd/spi-nor/atmel.c index e13b8d2dd50a..45d1153a04a0 100644 --- a/drivers/mtd/spi-nor/atmel.c +++ b/drivers/mtd/spi-nor/atmel.c @@ -16,12 +16,12 @@ * is to unlock the whole flash array on startup. Therefore, we have to support * exactly this operation. */ -static int at25fs_nor_lock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int at25fs_nor_lock(struct spi_nor *nor, loff_t ofs, u64 len) { return -EOPNOTSUPP; } -static int at25fs_nor_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int at25fs_nor_unlock(struct spi_nor *nor, loff_t ofs, u64 len) { int ret; @@ -37,7 +37,7 @@ static int at25fs_nor_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) return ret; } -static int at25fs_nor_is_locked(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int at25fs_nor_is_locked(struct spi_nor *nor, loff_t ofs, u64 len) { return -EOPNOTSUPP; } @@ -69,7 +69,7 @@ static const struct spi_nor_fixups at25fs_nor_fixups = { * Return: 0 on success, -error otherwise. */ static int atmel_nor_set_global_protection(struct spi_nor *nor, loff_t ofs, - uint64_t len, bool is_protect) + u64 len, bool is_protect) { int ret; u8 sr; @@ -118,20 +118,18 @@ static int atmel_nor_set_global_protection(struct spi_nor *nor, loff_t ofs, return spi_nor_write_sr(nor, nor->bouncebuf, 1); } -static int atmel_nor_global_protect(struct spi_nor *nor, loff_t ofs, - uint64_t len) +static int atmel_nor_global_protect(struct spi_nor *nor, loff_t ofs, u64 len) { return atmel_nor_set_global_protection(nor, ofs, len, true); } -static int atmel_nor_global_unprotect(struct spi_nor *nor, loff_t ofs, - uint64_t len) +static int atmel_nor_global_unprotect(struct spi_nor *nor, loff_t ofs, u64 len) { return atmel_nor_set_global_protection(nor, ofs, len, false); } static int atmel_nor_is_global_protected(struct spi_nor *nor, loff_t ofs, - uint64_t len) + u64 len) { int ret; diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 1c443fe568cf..25a64c65717d 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -1799,8 +1799,7 @@ destroy_erase_cmd_list: static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) { struct spi_nor *nor = mtd_to_spi_nor(mtd); - u32 addr, len; - uint32_t rem; + u32 addr, len, rem; int ret; dev_dbg(nor->dev, "at 0x%llx, len %lld\n", (long long)instr->addr, @@ -2146,7 +2145,7 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, if (is_power_of_2(page_size)) { page_offset = addr & (page_size - 1); } else { - uint64_t aux = addr; + u64 aux = addr; page_offset = do_div(aux, page_size); } diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h index 93cd2fc3606d..a456042379ee 100644 --- a/drivers/mtd/spi-nor/core.h +++ b/drivers/mtd/spi-nor/core.h @@ -293,9 +293,9 @@ struct spi_nor_erase_map { * @is_locked: check if a region of the SPI NOR is completely locked */ struct spi_nor_locking_ops { - int (*lock)(struct spi_nor *nor, loff_t ofs, uint64_t len); - int (*unlock)(struct spi_nor *nor, loff_t ofs, uint64_t len); - int (*is_locked)(struct spi_nor *nor, loff_t ofs, uint64_t len); + int (*lock)(struct spi_nor *nor, loff_t ofs, u64 len); + int (*unlock)(struct spi_nor *nor, loff_t ofs, u64 len); + int (*is_locked)(struct spi_nor *nor, loff_t ofs, u64 len); }; /** diff --git a/drivers/mtd/spi-nor/sst.c b/drivers/mtd/spi-nor/sst.c index 44d2a546bf17..180b7390690c 100644 --- a/drivers/mtd/spi-nor/sst.c +++ b/drivers/mtd/spi-nor/sst.c @@ -13,12 +13,12 @@ #define SST26VF_CR_BPNV BIT(3) -static int sst26vf_nor_lock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int sst26vf_nor_lock(struct spi_nor *nor, loff_t ofs, u64 len) { return -EOPNOTSUPP; } -static int sst26vf_nor_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int sst26vf_nor_unlock(struct spi_nor *nor, loff_t ofs, u64 len) { int ret; @@ -38,7 +38,7 @@ static int sst26vf_nor_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) return spi_nor_global_block_unlock(nor); } -static int sst26vf_nor_is_locked(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int sst26vf_nor_is_locked(struct spi_nor *nor, loff_t ofs, u64 len) { return -EOPNOTSUPP; } diff --git a/drivers/mtd/spi-nor/swp.c b/drivers/mtd/spi-nor/swp.c index 585813310ee1..e48c3cff247a 100644 --- a/drivers/mtd/spi-nor/swp.c +++ b/drivers/mtd/spi-nor/swp.c @@ -53,7 +53,7 @@ static u64 spi_nor_get_min_prot_length_sr(struct spi_nor *nor) } static void spi_nor_get_locked_range_sr(struct spi_nor *nor, u8 sr, loff_t *ofs, - uint64_t *len) + u64 *len) { struct mtd_info *mtd = &nor->mtd; u64 min_prot_len; @@ -90,10 +90,10 @@ static void spi_nor_get_locked_range_sr(struct spi_nor *nor, u8 sr, loff_t *ofs, * (if @locked is false); false otherwise. */ static bool spi_nor_check_lock_status_sr(struct spi_nor *nor, loff_t ofs, - uint64_t len, u8 sr, bool locked) + u64 len, u8 sr, bool locked) { loff_t lock_offs, lock_offs_max, offs_max; - uint64_t lock_len; + u64 lock_len; if (!len) return true; @@ -111,14 +111,13 @@ static bool spi_nor_check_lock_status_sr(struct spi_nor *nor, loff_t ofs, return (ofs >= lock_offs_max) || (offs_max <= lock_offs); } -static bool spi_nor_is_locked_sr(struct spi_nor *nor, loff_t ofs, uint64_t len, - u8 sr) +static bool spi_nor_is_locked_sr(struct spi_nor *nor, loff_t ofs, u64 len, u8 sr) { return spi_nor_check_lock_status_sr(nor, ofs, len, sr, true); } -static bool spi_nor_is_unlocked_sr(struct spi_nor *nor, loff_t ofs, - uint64_t len, u8 sr) +static bool spi_nor_is_unlocked_sr(struct spi_nor *nor, loff_t ofs, u64 len, + u8 sr) { return spi_nor_check_lock_status_sr(nor, ofs, len, sr, false); } @@ -156,7 +155,7 @@ static bool spi_nor_is_unlocked_sr(struct spi_nor *nor, loff_t ofs, * * Returns negative on errors, 0 on success. */ -static int spi_nor_sr_lock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int spi_nor_sr_lock(struct spi_nor *nor, loff_t ofs, u64 len) { struct mtd_info *mtd = &nor->mtd; u64 min_prot_len; @@ -246,7 +245,7 @@ static int spi_nor_sr_lock(struct spi_nor *nor, loff_t ofs, uint64_t len) * * Returns negative on errors, 0 on success. */ -static int spi_nor_sr_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int spi_nor_sr_unlock(struct spi_nor *nor, loff_t ofs, u64 len) { struct mtd_info *mtd = &nor->mtd; u64 min_prot_len; @@ -331,7 +330,7 @@ static int spi_nor_sr_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) * Returns 1 if entire region is locked, 0 if any portion is unlocked, and * negative on errors. */ -static int spi_nor_sr_is_locked(struct spi_nor *nor, loff_t ofs, uint64_t len) +static int spi_nor_sr_is_locked(struct spi_nor *nor, loff_t ofs, u64 len) { int ret; @@ -353,7 +352,7 @@ void spi_nor_init_default_locking_ops(struct spi_nor *nor) nor->params->locking_ops = &spi_nor_sr_locking_ops; } -static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, u64 len) { struct spi_nor *nor = mtd_to_spi_nor(mtd); int ret; @@ -368,7 +367,7 @@ static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return ret; } -static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, u64 len) { struct spi_nor *nor = mtd_to_spi_nor(mtd); int ret; @@ -383,7 +382,7 @@ static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return ret; } -static int spi_nor_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len) +static int spi_nor_is_locked(struct mtd_info *mtd, loff_t ofs, u64 len) { struct spi_nor *nor = mtd_to_spi_nor(mtd); int ret; From d6111cf45c5787282b2e20d77bdb6b28881d516a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Oct 2023 11:12:01 -0700 Subject: [PATCH 0074/1562] sched: Use WRITE_ONCE() for p->on_rq Since RCU-tasks uses READ_ONCE(p->on_rq), ensure the write-side matches with WRITE_ONCE(). Signed-off-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/e4896e0b-eacc-45a2-a7a8-de2280a51ecc@paulmck-laptop --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a708d225c28e..9d5099d02dbc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2124,12 +2124,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); - p->on_rq = TASK_ON_RQ_QUEUED; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); dequeue_task(rq, p, flags); } From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 20 Oct 2023 21:27:46 +0530 Subject: [PATCH 0075/1562] sched/numa: Fix mm numa_scan_seq based unconditional scan Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic") NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the task had previously accessed VMA. However unconditional scan of VMAs are allowed during initial phase of VMA creation until process's mm numa_scan_seq reaches 2 even though current task had not accessed VMA. Rationale: - Without initial scan subsequent PTE update may never happen. - Give fair opportunity to all the VMAs to be scanned and subsequently understand the access pattern of all the VMAs. But it has a corner case where, if a VMA is created after some time, process's mm numa_scan_seq could be already greater than 2. For e.g., values of mm numa_scan_seq when VMAs are created by running mmtest autonuma benchmark briefly looks like: start_seq=0 : 459 start_seq=2 : 138 start_seq=3 : 144 start_seq=4 : 8 start_seq=8 : 1 start_seq=9 : 1 This results in no unconditional PTE updates for those VMAs created after some time. Fix: - Note down the initial value of mm numa_scan_seq in per VMA start_seq. - Allow unconditional scan till start_seq + 2. Result: SUT: AMD EPYC Milan with 2 NUMA nodes 256 cpus. base kernel: upstream 6.6-rc6 with Mels patches [1] applied. kernbench ========== base patched %gain Amean elsp-128 165.09 ( 0.00%) 164.78 * 0.19%* Duration User 41404.28 41375.08 Duration System 9862.22 9768.48 Duration Elapsed 519.87 518.72 Ops NUMA PTE updates 1041416.00 831536.00 Ops NUMA hint faults 263296.00 220966.00 Ops NUMA pages migrated 258021.00 212769.00 Ops AutoNUMA cost 1328.67 1114.69 autonumabench NUMA01_THREADLOCAL ================== Amean elsp-NUMA01_THREADLOCAL 81.79 (0.00%) 67.74 * 17.18%* Duration User 54832.73 47379.67 Duration System 75.00 185.75 Duration Elapsed 576.72 476.09 Ops NUMA PTE updates 394429.00 11121044.00 Ops NUMA hint faults 1001.00 8906404.00 Ops NUMA pages migrated 288.00 2998694.00 Ops AutoNUMA cost 7.77 44666.84 Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com --- include/linux/mm_types.h | 3 +++ kernel/sched/fair.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..950df415d7de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -600,6 +600,9 @@ struct vma_numab_state { */ unsigned long pids_active[2]; + /* MM scan sequence ID when scan first started after VMA creation */ + int start_scan_seq; + /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..44b5262b6657 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3164,7 +3164,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) * This is also done to avoid any side effect of task scanning * amplifying the unfairness of disjoint set of VMAs' access. */ - if (READ_ONCE(current->mm->numa_scan_seq) < 2) + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; @@ -3307,6 +3307,8 @@ retry_pids: if (!vma->numab_state) continue; + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + vma->numab_state->next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 15 Nov 2023 11:36:45 +0800 Subject: [PATCH 0076/1562] sched/eevdf: Sort the rbtree by virtual deadline Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in next patch. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com --- include/linux/sched.h | 2 +- kernel/sched/debug.c | 11 ++- kernel/sched/fair.c | 170 +++++++++++++++++------------------------- kernel/sched/sched.h | 1 + 4 files changed, 78 insertions(+), 106 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..cd56d4018527 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,7 +553,7 @@ struct sched_entity { struct load_weight load; struct rb_node run_node; u64 deadline; - u64 min_deadline; + u64 min_vruntime; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4580a450700e..168eecc209b4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; - struct sched_entity *last, *first; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); + if (root) + left_vruntime = root->min_vruntime; first = __pick_first_entity(cfs_rq); if (first) - left_vruntime = first->vruntime; + left_deadline = first->deadline; last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", + SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44b5262b6657..31bca05c3612 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: using 'avg_vruntime() > se->vruntime' is inacurate due * to the loss in precision caused by the division. */ -int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; @@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) load += weight; } - return avg >= entity_key(cfs_rq, se) * load; + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); } static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) static void update_min_vruntime(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *se = __pick_root_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ @@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (deadline_gt(min_deadline, se, rse)) - se->min_deadline = rse->min_deadline; + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; } } /* - * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static inline bool min_deadline_update(struct sched_entity *se, bool exit) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - u64 old_min_deadline = se->min_deadline; + u64 old_min_vruntime = se->min_vruntime; struct rb_node *node = &se->run_node; - se->min_deadline = se->deadline; - __update_min_deadline(se, node->rb_right); - __update_min_deadline(se, node->rb_left); + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); - return se->min_deadline == old_min_deadline; + return se->min_vruntime == old_min_vruntime; } -RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, - run_node, min_deadline, min_deadline_update); +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); /* * Enqueue an entity into the rb-tree: @@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); - se->min_deadline = se->deadline; + se->min_vruntime = se->vruntime; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - __entity_less, &min_deadline_cb); + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - &min_deadline_cb); + &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); } +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); +} + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); @@ -850,23 +868,28 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The - * tree keeps the entries sorted on service, but also functions as a - * heap based on the deadline by keeping: + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: * - * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * - * Which allows an EDF like search on (sub)trees. + * Which allows tree pruning through eligibility. */ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; - struct sched_entity *best_left = NULL; + + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_running == 1) + return curr && curr->on_rq ? curr : __node_2_se(node); if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - best = curr; /* * Once selected, run a task until it either becomes non-eligible or @@ -875,95 +898,38 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Heap search for the EEVD entity */ while (node) { struct sched_entity *se = __node_2_se(node); + struct rb_node *left = node->rb_left; /* - * If this entity is not eligible, try the left subtree. + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. */ - if (!entity_eligible(cfs_rq, se)) { - node = node->rb_left; + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; continue; } /* - * Now we heap search eligible trees for the best (min_)deadline + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. */ - if (!best || deadline_gt(deadline, best, se)) + if (entity_eligible(cfs_rq, se)) { best = se; - - /* - * Every se in a left branch is eligible, keep track of the - * branch with the best min_deadline - */ - if (node->rb_left) { - struct sched_entity *left = __node_2_se(node->rb_left); - - if (!best_left || deadline_gt(min_deadline, best_left, left)) - best_left = left; - - /* - * min_deadline is in the left branch. rb_left and all - * descendants are eligible, so immediately switch to the second - * loop. - */ - if (left->min_deadline == se->min_deadline) - break; - } - - /* min_deadline is at this node, no need to look right */ - if (se->deadline == se->min_deadline) break; + } - /* else min_deadline is in the right branch. */ node = node->rb_right; } - /* - * We ran into an eligible node which is itself the best. - * (Or nr_running == 0 and both are NULL) - */ - if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0) - return best; + if (!best || (curr && entity_before(curr, best))) + best = curr; - /* - * Now best_left and all of its children are eligible, and we are just - * looking for deadline == min_deadline - */ - node = &best_left->run_node; - while (node) { - struct sched_entity *se = __node_2_se(node); - - /* min_deadline is the current node */ - if (se->deadline == se->min_deadline) - return se; - - /* min_deadline is in the left branch */ - if (node->rb_left && - __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { - node = node->rb_left; - continue; - } - - /* else min_deadline is in the right branch */ - node = node->rb_right; - } - return NULL; -} - -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_eevdf(cfs_rq); - - if (!se) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - if (left) { - pr_err("EEVDF scheduling fail, picking leftmost\n"); - return left; - } - } - - return se; + return best; } #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..539c7e763f15 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2822,6 +2822,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) +extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); From ee4373dc902c0a403dd084b254ce70a78f95466f Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 15 Nov 2023 11:36:46 +0800 Subject: [PATCH 0077/1562] sched/eevdf: O(1) fastpath for task selection Since the RB-tree is now sorted by deadline, let's first try the leftmost entity which has the earliest virtual deadline. I've done some benchmarks to see its effectiveness. All the benchmarks are done inside a normal cpu cgroup in a clean environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R) Platinum 8260 with 2 NUMA nodes each of which has 24C/48T. hackbench: process/thread + pipe/socket + 1/2/4/8 groups netperf: TCP/UDP + STREAM/RR + 24/48/72/96/192 threads tbench: loopback 24/48/72/96/192 threads schbench: 1/2/4/8 mthreads direct: cfs_rq has only one entity parity: RUN_TO_PARITY fast: O(1) fastpath slow: heap search (%) direct parity fast slow hackbench 92.95 2.02 4.91 0.12 netperf 68.08 6.60 24.18 1.14 tbench 67.55 11.22 20.61 0.62 schbench 69.91 2.65 25.73 1.71 The above results indicate that this fastpath really makes task selection more efficient. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com --- kernel/sched/fair.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31bca05c3612..d3e045d80cab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -878,6 +878,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; @@ -886,7 +887,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * in this cfs_rq, saving some cycles. */ if (cfs_rq->nr_running == 1) - return curr && curr->on_rq ? curr : __node_2_se(node); + return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -898,9 +899,14 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } + /* Heap search for the EEVD entity */ while (node) { - struct sched_entity *se = __node_2_se(node); struct rb_node *left = node->rb_left; /* @@ -913,6 +919,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) continue; } + se = __node_2_se(node); + /* * The left subtree either is empty or has no eligible * entity, so check the current node since it is the one @@ -925,7 +933,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) node = node->rb_right; } - +found: if (!best || (curr && entity_before(curr, best))) best = curr; From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:18 +0100 Subject: [PATCH 0078/1562] sched: Unify runtime accounting across classes All classes use sched_entity::exec_start to track runtime and have copies of the exact same code around to compute runtime. Collapse all that. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Reviewed-by: Steven Rostedt (Google) Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 2 +- kernel/sched/deadline.c | 15 +++-------- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++---------- kernel/sched/rt.c | 15 +++-------- kernel/sched/sched.h | 12 ++------- kernel/sched/stop_task.c | 13 +-------- 6 files changed, 53 insertions(+), 61 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cd56d4018527..44b46d9743bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,7 +523,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b28114478b82..de79719c63c0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1275,9 +1275,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; + s64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); - u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1290,21 +1289,13 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; return; } - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (dl_entity_is_special(dl_se)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3e045d80cab..11073cf00134 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1103,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -/* - * Update the current task's runtime statistics. - */ -static void update_curr(struct cfs_rq *cfs_rq) +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; - - if (unlikely(!curr)) - return; + u64 now = rq_clock_task(rq); + s64 delta_exec; delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) - return; + if (unlikely(delta_exec <= 0)) + return delta_exec; curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1129,8 +1123,43 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, stats->exec_max)); } - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); + return delta_exec; +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &curr->se); + if (unlikely(delta_exec <= 0)) + return delta_exec; + + trace_sched_stat_runtime(curr, delta_exec, 0); + + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + return delta_exec; +} + +/* + * Update the current task's runtime statistics. + */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 delta_exec; + + if (unlikely(!curr)) + return; + + delta_exec = update_curr_se(rq_of(cfs_rq), curr); + if (unlikely(delta_exec <= 0)) + return; curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6aaf0a3d6081..3261b067b67e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1002,24 +1002,15 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 delta_exec; - u64 now; + s64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) return; - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (!rt_bandwidth_enabled()) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 539c7e763f15..6703e9e81b1d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2212,6 +2212,8 @@ struct affinity_context { unsigned int flags; }; +extern s64 update_curr_common(struct rq *rq); + struct sched_class { #ifdef CONFIG_UCLAMP_TASK @@ -3262,16 +3264,6 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif -static inline void update_current_exec_runtime(struct task_struct *curr, - u64 now, u64 delta_exec) -{ - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); -} - #ifdef CONFIG_SCHED_MM_CID #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 6cf7304e6449..b1b8fe61c532 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { - struct task_struct *curr = rq->curr; - u64 now, delta_exec; - - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - update_current_exec_runtime(curr, now, delta_exec); + update_curr_common(rq); } /* From 5fe6ec8f6ab549b6422e41551abb51802bd48bc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Nov 2023 13:41:43 +0100 Subject: [PATCH 0079/1562] sched: Remove vruntime from trace_sched_stat_runtime() Tracing the runtime delta makes sense, observer can sum over time. Tracing the absolute vruntime makes less sense, inconsistent: absolute-vs-delta, but also vruntime delta can be computed from runtime delta. Removing the vruntime thing also makes the two tracepoint sites identical, allowing to unify the code in a later patch. Signed-off-by: Peter Zijlstra (Intel) --- include/trace/events/sched.h | 15 ++++++--------- kernel/sched/fair.c | 5 ++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6188ad0d9e0d..dbb01b4b7451 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -493,33 +493,30 @@ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, */ DECLARE_EVENT_CLASS(sched_stat_runtime, - TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), + TP_PROTO(struct task_struct *tsk, u64 runtime), - TP_ARGS(tsk, __perf_count(runtime), vruntime), + TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) - __field( u64, vruntime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; - __entry->vruntime = vruntime; ), - TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]", + TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __entry->comm, __entry->pid, - (unsigned long long)__entry->runtime, - (unsigned long long)__entry->vruntime) + (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, - TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), - TP_ARGS(tsk, runtime, vruntime)); + TP_PROTO(struct task_struct *tsk, u64 runtime), + TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a tasks diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11073cf00134..33db70c6b582 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1138,8 +1138,7 @@ s64 update_curr_common(struct rq *rq) if (unlikely(delta_exec <= 0)) return delta_exec; - trace_sched_stat_runtime(curr, delta_exec, 0); - + trace_sched_stat_runtime(curr, delta_exec); account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); @@ -1168,7 +1167,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + trace_sched_stat_runtime(curtask, delta_exec); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } From c708a4dc5ab547edc3d6537233ca9e79ea30ce47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Nov 2023 14:04:01 +0100 Subject: [PATCH 0080/1562] sched: Unify more update_curr*() Now that trace_sched_stat_runtime() no longer takes a vruntime argument, the task specific bits are identical between update_curr_common() and update_curr(). Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33db70c6b582..1cd92b11b289 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1126,6 +1126,13 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); +} + /* * Used by other classes to account runtime. */ @@ -1135,12 +1142,8 @@ s64 update_curr_common(struct rq *rq) s64 delta_exec; delta_exec = update_curr_se(rq, &curr->se); - if (unlikely(delta_exec <= 0)) - return delta_exec; - - trace_sched_stat_runtime(curr, delta_exec); - account_group_exec_runtime(curr, delta_exec); - cgroup_account_cputime(curr, delta_exec); + if (likely(delta_exec > 0)) + update_curr_task(curr, delta_exec); return delta_exec; } @@ -1164,13 +1167,8 @@ static void update_curr(struct cfs_rq *cfs_rq) update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } + if (entity_is_task(curr)) + update_curr_task(task_of(curr), delta_exec); account_cfs_rq_runtime(cfs_rq, delta_exec); } From 9e07d45c5210f5dd6701c00d55791983db7320fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:19 +0100 Subject: [PATCH 0081/1562] sched/deadline: Collect sched_dl_entity initialization Create a single function that initializes a sched_dl_entity. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/51acc695eecf0a1a2f78f9a044e11ffd9b316bcf.1699095159.git.bristot@kernel.org --- kernel/sched/core.c | 5 +---- kernel/sched/deadline.c | 22 +++++++++++++++------- kernel/sched/sched.h | 5 +---- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d5099d02dbc..966631f05d71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4511,10 +4511,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->stats, 0, sizeof(p->stats)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index de79719c63c0..e80bb884262d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -335,6 +335,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) __add_rq_bw(new_bw, &rq->dl); } +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. @@ -434,7 +436,7 @@ static void task_non_contending(struct task_struct *p) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); } return; @@ -1183,7 +1185,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1389,7 +1391,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } @@ -1405,7 +1407,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -2957,10 +2959,8 @@ bool __checkparam_dl(const struct sched_attr *attr) /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -2978,6 +2978,14 @@ void __dl_clear_params(struct task_struct *p) #endif } +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); +} + bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6703e9e81b1d..3c62df1511e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -273,8 +273,6 @@ struct rt_bandwidth { unsigned int rt_period_active; }; -void __dl_clear_params(struct task_struct *p); - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -2427,8 +2425,7 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_entity(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) From 2f7a0f58948d8231236e2facecc500f1930fb996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:20 +0100 Subject: [PATCH 0082/1562] sched/deadline: Move bandwidth accounting into {en,de}queue_dl_entity In preparation of introducing !task sched_dl_entity; move the bandwidth accounting into {en.de}queue_dl_entity(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/a86dccbbe44e021b8771627e1dae01a69b73466d.1699095159.git.bristot@kernel.org --- kernel/sched/deadline.c | 132 ++++++++++++++++++++++------------------ kernel/sched/sched.h | 6 ++ 2 files changed, 79 insertions(+), 59 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e80bb884262d..81810f67df7a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -391,12 +391,12 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + struct task_struct *p = dl_task_of(dl_se); s64 zerolag_time; /* @@ -428,13 +428,14 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); + sub_rq_bw(dl_se, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(dl_se); } @@ -1601,6 +1602,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); + /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); + + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); + } + + /* + * If p is throttled, we do not enqueue it. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ + if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + + return; + } + /* * If this is a wakeup or a new instance, the scheduling * parameters of the task might need updating. Otherwise, @@ -1620,9 +1656,28 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) __enqueue_dl_entity(dl_se); } -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { __dequeue_dl_entity(dl_se); + + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(dl_se); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -1667,76 +1722,35 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - /* - * Check if a constrained deadline task was activated - * after the deadline but before the next period. - * If that is the case, the task will be throttled and - * the replenishment timer will be set to the next period. - */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); - - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); - } - - /* - * If p is throttled, we do not enqueue it. In fact, if it exhausted - * its budget it needs a replenishment and, since it now is on - * its rq, the bandwidth timer callback (which clearly has not - * run yet) will take care of this. - * However, the active utilization does not depend on the fact - * that the task is on the runqueue or not (but depends on the - * task's state - in GRUB parlance, "inactive" vs "active contending"). - * In other words, even if a task is throttled its utilization must - * be counted in the active utilization; hence, we need to call - * add_running_bw(). - */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); - - return; - } - check_schedstat_required(); update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; + enqueue_dl_entity(&p->dl, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + dequeue_dl_entity(&p->dl, flags); + + if (!p->dl.dl_throttled) + dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); + + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; + __dequeue_task_dl(rq, p, flags); - - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); - } - - /* - * This check allows to start the inactive timer (or to immediately - * decrease the active utilization, if needed) in two cases: - * when the task blocks and when it is terminating - * (p->state == TASK_DEAD). We can handle the two cases in the same - * way, because from GRUB's point of view the same thing is happening - * (the task moves from "active contending" to "active non contending" - * or "inactive") - */ - if (flags & DEQUEUE_SLEEP) - task_non_contending(p); } /* @@ -2551,7 +2565,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c62df1511e7..1cda787172f0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2177,6 +2177,10 @@ extern const u32 sched_prio_to_wmult[40]; * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location * in the runqueue. * + * NOCLOCK - skip the update_rq_clock() (avoids double updates) + * + * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup @@ -2187,6 +2191,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2201,6 +2206,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_MIGRATED 0x00 #endif #define ENQUEUE_INITIAL 0x80 +#define ENQUEUE_MIGRATING 0x100 #define RETRY_TASK ((void *)-1UL) From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:21 +0100 Subject: [PATCH 0083/1562] sched/deadline: Introduce deadline servers Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks with higher priority (e.g., SCHED_FIFO) monopolize CPU(s). RT Throttling has been introduced a while ago as a (mostly debug) countermeasure one can utilize to reserve some CPU time for low priority tasks (usually background type of work, e.g. workqueues, timers, etc.). It however has its own problems (see documentation) and the undesired effect of unconditionally throttling FIFO tasks even when no lower priority activity needs to run (there are mechanisms to fix this issue as well, but, again, with their own problems). Introduce deadline servers to service low priority tasks needs under starvation conditions. Deadline servers are built extending SCHED_DEADLINE implementation to allow 2-level scheduling (a sched_deadline entity becomes a container for lower priority scheduling entities). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 22 ++- kernel/sched/core.c | 17 ++ kernel/sched/deadline.c | 332 +++++++++++++++++++++++++++------------- kernel/sched/fair.c | 2 + kernel/sched/sched.h | 27 ++++ 5 files changed, 292 insertions(+), 108 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 44b46d9743bf..8d258162deb0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,11 +63,13 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; /* @@ -607,6 +609,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; @@ -654,6 +659,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -668,7 +674,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* @@ -795,6 +814,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 966631f05d71..f5f4495d1768 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3797,6 +3797,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif + + p->dl_server = NULL; } /* @@ -6003,12 +6005,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = pick_next_task_idle(rq); } + /* + * This is the fast path; it cannot be a DL server pick; + * therefore even if @p == @prev, ->dl_server must be NULL. + */ + if (p->dl_server) + p->dl_server = NULL; + return p; } restart: put_prev_task_balance(rq, prev, rf); + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->dl_server. + */ + if (prev->dl_server) + prev->dl_server = NULL; + for_each_class(class) { p = class->pick_next_task(rq); if (p) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 81810f67df7a..a04a436af8cc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); + + return rq; +} + static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); - - return &rq->dl; + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -394,9 +407,8 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); static void task_non_contending(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); - struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -426,25 +438,33 @@ static void task_non_contending(struct sched_dl_entity *dl_se) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); + } else { + struct task_struct *p = dl_task_of(dl_se); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); - if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(dl_se, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); - raw_spin_unlock(&dl_b->lock); - __dl_clear_params(dl_se); + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -471,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } else { /* * Since "dl_non_contending" is not set, the @@ -485,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; - return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } @@ -740,8 +760,10 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, @@ -989,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) */ static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { @@ -1021,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; @@ -1059,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1084,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) { + struct rq *rq = rq_of_dl_se(dl_se); + struct rq_flags rf; + + rq_lock(rq, &rf); + if (dl_se->dl_throttled) { + sched_clock_tick(); + update_rq_clock(rq); + + if (dl_se->server_has_tasks(dl_se)) { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + resched_curr(rq); + __push_dl_task(rq, &rf); + } else { + replenish_dl_entity(dl_se); + } + + } + rq_unlock(rq, &rf); + + return HRTIMER_NORESTART; + } + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -1158,21 +1223,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1214,12 +1265,11 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1270,29 +1320,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - s64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); + s64 scaled_delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - delta_exec = update_curr_common(rq); if (unlikely(delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1310,10 +1344,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1332,11 +1365,20 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } - if (!is_leftmost(curr, &rq->dl)) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } + + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } @@ -1366,20 +1408,82 @@ throttle: } } +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + if (!dl_server(dl_se)) { + dl_se->dl_server = 1; + setup_new_dl_entity(dl_se); + } + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick = pick; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + s64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); +} + static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) { struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); + if (dl_server(dl_se)) + goto no_task; + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1396,14 +1500,21 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } @@ -1466,10 +1577,8 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); @@ -1479,9 +1588,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); @@ -1648,8 +1754,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } @@ -1730,19 +1835,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, flags); + if (dl_server(&p->dl)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl, flags); - - if (!p->dl.dl_throttled) - dequeue_pushable_dl_task(rq, p); -} - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); @@ -1750,7 +1849,9 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (p->on_rq == TASK_ON_RQ_MIGRATING) flags |= DEQUEUE_MIGRATING; - __dequeue_task_dl(rq, p, flags); + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); } /* @@ -1940,12 +2041,12 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif @@ -1965,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled_dl(rq)) - start_hrtick_dl(rq, p); - if (rq->curr->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); @@ -1990,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq) struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(dl_rq); WARN_ON_ONCE(!dl_se); - p = dl_task_of(dl_se); + + if (dl_server(dl_se)) { + p = dl_se->server_pick(dl_se); + if (!p) { + WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } + p->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } return p; } @@ -2005,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) struct task_struct *p; p = pick_task_dl(rq); - if (p) + if (!p) + return p; + + if (!p->dl_server) set_next_task_dl(rq, p, true); + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); + return p; } @@ -2045,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * be set and schedule() will start a new hrtick for the next task. */ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -2986,6 +3103,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cd92b11b289..07f555857698 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,6 +1131,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) trace_sched_stat_runtime(p, delta_exec); account_group_exec_runtime(p, delta_exec); cgroup_account_cputime(p, delta_exec); + if (p->dl_server) + dl_server_update(p->dl_server, delta_exec); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1cda787172f0..8a70d51ffa33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -313,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); +/* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: + * + * dl_se::rq -- runqueue we belong to. + * + * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the + * server when it runs out of tasks to run. + * + * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this + * returns NULL. + * + * dl_server_update() -- called from update_curr_common(), propagates runtime + * to the server. + * + * dl_server_start() + * dl_server_stop() -- start/stop the server when it has (no) tasks. + * + * dl_server_init() -- initializes the server. + */ +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); +extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_stop(struct sched_dl_entity *dl_se); +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick); + #ifdef CONFIG_CGROUP_SCHED struct cfs_rq; From dd5403869a40595eb953f12e8cd2bb57bb88bb67 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Nov 2023 14:38:39 -0500 Subject: [PATCH 0084/1562] sched/cpuidle: Comment about timers requirements VS idle handler Add missing explanation concerning IRQs re-enablement constraints in the cpuidle path against timers. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20231114193840.4041-2-frederic@kernel.org --- kernel/sched/idle.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 565f8374ddbb..31231925f1ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -258,6 +258,36 @@ static void do_idle(void) while (!need_resched()) { rmb(); + /* + * Interrupts shouldn't be re-enabled from that point on until + * the CPU sleeping instruction is reached. Otherwise an interrupt + * may fire and queue a timer that would be ignored until the CPU + * wakes from the sleeping instruction. And testing need_resched() + * doesn't tell about pending needed timer reprogram. + * + * Several cases to consider: + * + * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as + * "wfi" or "mwait" are fine because they can be entered with + * interrupt disabled. + * + * - sti;mwait() couple is fine because the interrupts are + * re-enabled only upon the execution of mwait, leaving no gap + * in-between. + * + * - ROLLBACK based idle handlers with the sleeping instruction + * called with interrupts enabled are NOT fine. In this scheme + * when the interrupt detects it has interrupted an idle handler, + * it rolls back to its beginning which performs the + * need_resched() check before re-executing the sleeping + * instruction. This can leak a pending needed timer reprogram. + * If such a scheme is really mandatory due to the lack of an + * appropriate CPU sleeping instruction, then a FAST-FORWARD + * must instead be applied: when the interrupt detects it has + * interrupted an idle handler, it must resume to the end of + * this idle handler so that the generic idle loop is iterated + * again to reprogram the tick. + */ local_irq_disable(); if (cpu_is_offline(cpu)) { From 194600008d5c43b5a4ba98c4b81633397e34ffad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Nov 2023 14:38:40 -0500 Subject: [PATCH 0085/1562] sched/timers: Explain why idle task schedules out on remote timer enqueue Trying to avoid that didn't bring much value after testing, add comment about this. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20231114193840.4041-3-frederic@kernel.org --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5f4495d1768..2de77a6d5ef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1131,6 +1131,28 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; + /* + * Set TIF_NEED_RESCHED and send an IPI if in the non-polling + * part of the idle loop. This forces an exit from the idle loop + * and a round trip to schedule(). Now this could be optimized + * because a simple new idle loop iteration is enough to + * re-evaluate the next tick. Provided some re-ordering of tick + * nohz functions that would need to follow TIF_NR_POLLING + * clearing: + * + * - On most archs, a simple fetch_or on ti::flags with a + * "0" value would be enough to know if an IPI needs to be sent. + * + * - x86 needs to perform a last need_resched() check between + * monitor and mwait which doesn't take timers into account. + * There a dedicated TIF_TIMER flag would be required to + * fetch_or here and be checked along with TIF_NEED_RESCHED + * before mwait(). + * + * However, remote timer enqueue is not such a frequent event + * and testing of the above solutions didn't appear to report + * much benefits. + */ if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else From 652ffc2104ec1f69dd4a46313888c33527145ccf Mon Sep 17 00:00:00 2001 From: Greg KH Date: Mon, 12 Jun 2023 15:09:09 +0200 Subject: [PATCH 0086/1562] perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file Signed-off-by: Greg Kroah-Hartman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/2023061204-decal-flyable-6090@gregkh --- kernel/events/core.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 08250981d9f4..4f0c45ab8d7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11408,9 +11408,32 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, + NULL, +}; + +static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pmu *pmu = dev_get_drvdata(dev); + + if (!pmu->nr_addr_filters) + return 0; + + return a->mode; + + return 0; +} + +static struct attribute_group pmu_dev_attr_group = { + .is_visible = pmu_dev_is_visible, + .attrs = pmu_dev_attrs, +}; + +static const struct attribute_group *pmu_dev_groups[] = { + &pmu_dev_attr_group, NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { @@ -11447,18 +11470,11 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; - /* For PMUs with address filters, throw in an extra attribute: */ - if (pmu->nr_addr_filters) - ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); - - if (ret) - goto del_dev; - - if (pmu->attr_update) + if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); - - if (ret) - goto del_dev; + if (ret) + goto del_dev; + } out: return ret; From e246777e2a032934047ba9e106de1fb21e7a8402 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 15 Nov 2023 10:20:40 -0500 Subject: [PATCH 0087/1562] MAINTAINERS: update the LSM entry Bring the LSM / "SECURITY SUBSYSTEM" entry up to date with the following changes: * Remove the "(suggested Cc:)" note on the mailing list. I don't really care if the LSM list is on the To: or Cc: line, I just want folks to include it when appropriate. * Remove the website link. The website isn't really maintained in any meaningful way so we're going to go ahead and remove it so we lessen the chance of conflicting or confusing information in the future. * Add our patchwork link. I'm not sure this is of much use for anyone but the maintainer, but there is a provision for including it here so we might as well include it. * Add a bug report URI. I suspect most everyone knows to send mail to the mailing list if they hit a bug, but let's make it official. * Add a link to the LSM tree process/management documentation. While the doc exists both in the canonical kernel.org location and the GitHub mirror, provide a link to the mirror as GitHub does a better job rendering the Markdown. * Update the source tree's git URI to use https. * Aside from changes to the LSM code itself, we also would like to be notified when the LSM call sites are changed so we are adding a security_XXX(...) regex to try and catch all of the callers. Signed-off-by: Paul Moore --- MAINTAINERS | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2482b40fd786..34187ece7330 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19507,14 +19507,17 @@ SECURITY SUBSYSTEM M: Paul Moore M: James Morris M: "Serge E. Hallyn" -L: linux-security-module@vger.kernel.org (suggested Cc:) +L: linux-security-module@vger.kernel.org S: Supported -W: http://kernsec.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git +Q: https://patchwork.kernel.org/project/linux-security-module/list +B: mailto:linux-security-module@vger.kernel.org +P: https://github.com/LinuxSecurityModule/kernel/blob/main/README.md +T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git F: include/uapi/linux/lsm.h F: security/ F: tools/testing/selftests/lsm/ X: security/selinux/ +K: \bsecurity_[a-z_0-9]\+\b SELINUX SECURITY MODULE M: Paul Moore From 4e8714b76613e6284b263274d6dddcfac24be262 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 15 Nov 2023 12:09:17 -0500 Subject: [PATCH 0088/1562] MAINTAINERS: update the audit entry Bring the audit subsystem entry up to date with the following changes: * Add our patchwork link. I'm not sure this is of much use for anyone but the maintainer, but there is a provision for including it here so we might as well include it. * Add a bug report URI. I suspect most everyone knows to send mail to the mailing list if they hit a bug, but let's make it official. * Add a link to the audit tree process/management documentation. While the doc exists both in the canonical kernel.org location and the GitHub mirror, provide a link to the mirror as GitHub does a better job rendering the Markdown. * Update the source tree's git URI to use https. * Aside from changes to the audit code itself, we also would like to be notified when the audit call sites are changed so we are adding an audit_XXX(...) regex to try and catch all of the callers. Signed-off-by: Paul Moore --- MAINTAINERS | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..bb967a494400 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3350,13 +3350,17 @@ M: Eric Paris L: audit@vger.kernel.org S: Supported W: https://github.com/linux-audit -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git +Q: https://patchwork.kernel.org/project/audit/list +B: mailto:audit@vger.kernel.org +P: https://github.com/linux-audit/audit-kernel/blob/main/README.md +T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git F: include/asm-generic/audit_*.h F: include/linux/audit.h F: include/linux/audit_arch.h F: include/uapi/linux/audit.h F: kernel/audit* F: lib/*audit.c +K: \baudit_[a-z_0-9]\+\b AUXILIARY BUS DRIVER M: Greg Kroah-Hartman From 3fc6350fc8470d42f5e700ecd1c3d90f9dd9fd2d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 13 Nov 2023 13:12:49 +0200 Subject: [PATCH 0089/1562] treewide, spi: Get rid of SPI_MASTER_HALF_DUPLEX The SPI_MASTER_HALF_DUPLEX is the legacy name of a definition for a half duplex flag. Since all others had been replaced with the respective SPI_CONTROLLER prefix get rid of the last one as well. There is no functional change intended. Signed-off-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Acked-by: Ulf Hansson # For MMC Acked-by: Dmitry Torokhov # for input Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20231113111249.3982461-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/input/rmi4/rmi_spi.c | 2 +- drivers/mmc/host/mmc_spi.c | 2 +- drivers/net/ethernet/micrel/ks8851_spi.c | 4 ++-- drivers/usb/gadget/udc/max3420_udc.c | 2 +- include/linux/spi/spi.h | 2 -- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/input/rmi4/rmi_spi.c b/drivers/input/rmi4/rmi_spi.c index 852aeb0b2c07..07c866f42296 100644 --- a/drivers/input/rmi4/rmi_spi.c +++ b/drivers/input/rmi4/rmi_spi.c @@ -375,7 +375,7 @@ static int rmi_spi_probe(struct spi_device *spi) struct rmi_device_platform_data *spi_pdata = spi->dev.platform_data; int error; - if (spi->master->flags & SPI_MASTER_HALF_DUPLEX) + if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX) return -EINVAL; rmi_spi = devm_kzalloc(&spi->dev, sizeof(struct rmi_spi_xport), diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index cc333ad67cac..b0cccef4cfbf 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1322,7 +1322,7 @@ static int mmc_spi_probe(struct spi_device *spi) /* We rely on full duplex transfers, mostly to reduce * per-transfer overheads (by making fewer transfers). */ - if (spi->master->flags & SPI_MASTER_HALF_DUPLEX) + if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX) return -EINVAL; /* MMC and SD specs only seem to care that sampling is on the diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 70bc7253454f..7c41623dac90 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -156,7 +156,7 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned int op, txb[0] = cpu_to_le16(op | KS_SPIOP_RD); - if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) { + if (kss->spidev->master->flags & SPI_CONTROLLER_HALF_DUPLEX) { msg = &kss->spi_msg2; xfer = kss->spi_xfer2; @@ -180,7 +180,7 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned int op, ret = spi_sync(kss->spidev, msg); if (ret < 0) netdev_err(ks->netdev, "read: spi_sync() failed\n"); - else if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) + else if (kss->spidev->master->flags & SPI_CONTROLLER_HALF_DUPLEX) memcpy(rxb, trx, rxl); else memcpy(rxb, trx + 2, rxl); diff --git a/drivers/usb/gadget/udc/max3420_udc.c b/drivers/usb/gadget/udc/max3420_udc.c index 2d57786d3db7..89e8cf2a2a7d 100644 --- a/drivers/usb/gadget/udc/max3420_udc.c +++ b/drivers/usb/gadget/udc/max3420_udc.c @@ -1201,7 +1201,7 @@ static int max3420_probe(struct spi_device *spi) int err, irq; u8 reg[8]; - if (spi->master->flags & SPI_MASTER_HALF_DUPLEX) { + if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX) { dev_err(&spi->dev, "UDC needs full duplex to work\n"); return -EINVAL; } diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5..7b4baff63c5c 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1638,8 +1638,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) /* Compatibility layer */ #define spi_master spi_controller -#define SPI_MASTER_HALF_DUPLEX SPI_CONTROLLER_HALF_DUPLEX - #define spi_master_get_devdata(_ctlr) spi_controller_get_devdata(_ctlr) #define spi_master_set_devdata(_ctlr, _data) \ spi_controller_set_devdata(_ctlr, _data) From 4a0b33f771db2b82fdfad08b9f34def786162865 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Oct 2023 23:08:35 +0100 Subject: [PATCH 0090/1562] selinux: saner handling of policy reloads On policy reload selinuxfs replaces two subdirectories (/booleans and /class) with new variants. Unfortunately, that's done with serious abuses of directory locking. 1) lock_rename() should be done to parents, not to objects being exchanged 2) there's a bunch of reasons why it should not be done for directories that do not have a common ancestor; most of those do not apply to selinuxfs, but even in the best case the proof is subtle and brittle. 3) failure halfway through the creation of /class will leak names and values arrays. 4) use of d_genocide() is also rather brittle; it's probably not much of a bug per se, but e.g. an overmount of /sys/fs/selinuxfs/classes/shm/index with any regular file will end up with leaked mount on policy reload. Sure, don't do it, but... Let's stop messing with disconnected directories; just create a temporary (/.swapover) with no permissions for anyone (on the level of ->permission() returing -EPERM, no matter who's calling it) and build the new /booleans and /class in there; then lock_rename on root and that temporary directory and d_exchange() old and new both for class and booleans. Then unlock and use simple_recursive_removal() to take the temporary out; it's much more robust. And instead of bothering with separate pathways for freeing new (on failure halfway through) and old (on success) names/values, do all freeing in one place. With temporaries swapped with the old ones when we are past all possible failures. The only user-visible difference is that /.swapover shows up (but isn't possible to open, look up into, etc.) for the duration of policy reload. Reviewed-by: Stephen Smalley Signed-off-by: Al Viro [PM: applied some fixes from Al post merge] Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 148 ++++++++++++++++------------------- 1 file changed, 68 insertions(+), 80 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 6c596ae7fef9..0619a1cbbfbe 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -336,12 +336,9 @@ static struct dentry *sel_make_dir(struct dentry *dir, const char *name, unsigned long *ino); /* declaration for sel_make_policy_nodes */ -static struct dentry *sel_make_disconnected_dir(struct super_block *sb, +static struct dentry *sel_make_swapover_dir(struct super_block *sb, unsigned long *ino); -/* declaration for sel_make_policy_nodes */ -static void sel_remove_entries(struct dentry *de); - static ssize_t sel_read_mls(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { @@ -508,13 +505,13 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, struct selinux_policy *newpolicy) { int ret = 0; - struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir, *old_dentry; - unsigned int tmp_bool_num, old_bool_num; - char **tmp_bool_names, **old_bool_names; - int *tmp_bool_values, *old_bool_values; + struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir; + unsigned int bool_num = 0; + char **bool_names = NULL; + int *bool_values = NULL; unsigned long tmp_ino = fsi->last_ino; /* Don't increment last_ino in this function */ - tmp_parent = sel_make_disconnected_dir(fsi->sb, &tmp_ino); + tmp_parent = sel_make_swapover_dir(fsi->sb, &tmp_ino); if (IS_ERR(tmp_parent)) return PTR_ERR(tmp_parent); @@ -532,8 +529,8 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, goto out; } - ret = sel_make_bools(newpolicy, tmp_bool_dir, &tmp_bool_num, - &tmp_bool_names, &tmp_bool_values); + ret = sel_make_bools(newpolicy, tmp_bool_dir, &bool_num, + &bool_names, &bool_values); if (ret) goto out; @@ -542,38 +539,30 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, if (ret) goto out; + lock_rename(tmp_parent, fsi->sb->s_root); + /* booleans */ - old_dentry = fsi->bool_dir; - lock_rename(tmp_bool_dir, old_dentry); d_exchange(tmp_bool_dir, fsi->bool_dir); - old_bool_num = fsi->bool_num; - old_bool_names = fsi->bool_pending_names; - old_bool_values = fsi->bool_pending_values; - - fsi->bool_num = tmp_bool_num; - fsi->bool_pending_names = tmp_bool_names; - fsi->bool_pending_values = tmp_bool_values; - - sel_remove_old_bool_data(old_bool_num, old_bool_names, old_bool_values); + swap(fsi->bool_num, bool_num); + swap(fsi->bool_pending_names, bool_names); + swap(fsi->bool_pending_values, bool_values); fsi->bool_dir = tmp_bool_dir; - unlock_rename(tmp_bool_dir, old_dentry); /* classes */ - old_dentry = fsi->class_dir; - lock_rename(tmp_class_dir, old_dentry); d_exchange(tmp_class_dir, fsi->class_dir); fsi->class_dir = tmp_class_dir; - unlock_rename(tmp_class_dir, old_dentry); + + unlock_rename(tmp_parent, fsi->sb->s_root); out: + sel_remove_old_bool_data(bool_num, bool_names, bool_values); /* Since the other temporary dirs are children of tmp_parent * this will handle all the cleanup in the case of a failure before * the swapover */ - sel_remove_entries(tmp_parent); - dput(tmp_parent); /* d_genocide() only handles the children */ + simple_recursive_removal(tmp_parent, NULL); return ret; } @@ -1351,54 +1340,48 @@ static const struct file_operations sel_commit_bools_ops = { .llseek = generic_file_llseek, }; -static void sel_remove_entries(struct dentry *de) -{ - d_genocide(de); - shrink_dcache_parent(de); -} - static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir, unsigned int *bool_num, char ***bool_pending_names, int **bool_pending_values) { int ret; - ssize_t len; - struct dentry *dentry = NULL; - struct inode *inode = NULL; - struct inode_security_struct *isec; - char **names = NULL, *page; + char **names, *page; u32 i, num; - int *values = NULL; - u32 sid; - ret = -ENOMEM; page = (char *)get_zeroed_page(GFP_KERNEL); if (!page) - goto out; + return -ENOMEM; - ret = security_get_bools(newpolicy, &num, &names, &values); + ret = security_get_bools(newpolicy, &num, &names, bool_pending_values); if (ret) goto out; - for (i = 0; i < num; i++) { - ret = -ENOMEM; - dentry = d_alloc_name(bool_dir, names[i]); - if (!dentry) - goto out; + *bool_num = num; + *bool_pending_names = names; + + for (i = 0; i < num; i++) { + struct dentry *dentry; + struct inode *inode; + struct inode_security_struct *isec; + ssize_t len; + u32 sid; + + len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]); + if (len >= PAGE_SIZE) { + ret = -ENAMETOOLONG; + break; + } + dentry = d_alloc_name(bool_dir, names[i]); + if (!dentry) { + ret = -ENOMEM; + break; + } - ret = -ENOMEM; inode = sel_make_inode(bool_dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR); if (!inode) { dput(dentry); - goto out; - } - - ret = -ENAMETOOLONG; - len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]); - if (len >= PAGE_SIZE) { - dput(dentry); - iput(inode); - goto out; + ret = -ENOMEM; + break; } isec = selinux_inode(inode); @@ -1416,23 +1399,8 @@ static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_ inode->i_ino = i|SEL_BOOL_INO_OFFSET; d_add(dentry, inode); } - *bool_num = num; - *bool_pending_names = names; - *bool_pending_values = values; - - free_page((unsigned long)page); - return 0; out: free_page((unsigned long)page); - - if (names) { - for (i = 0; i < num; i++) - kfree(names[i]); - kfree(names); - } - kfree(values); - sel_remove_entries(bool_dir); - return ret; } @@ -1961,20 +1929,40 @@ static struct dentry *sel_make_dir(struct dentry *dir, const char *name, return dentry; } -static struct dentry *sel_make_disconnected_dir(struct super_block *sb, +static int reject_all(struct mnt_idmap *idmap, struct inode *inode, int mask) +{ + return -EPERM; // no access for anyone, root or no root. +} + +static const struct inode_operations swapover_dir_inode_operations = { + .lookup = simple_lookup, + .permission = reject_all, +}; + +static struct dentry *sel_make_swapover_dir(struct super_block *sb, unsigned long *ino) { - struct inode *inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + struct dentry *dentry = d_alloc_name(sb->s_root, ".swapover"); + struct inode *inode; - if (!inode) + if (!dentry) return ERR_PTR(-ENOMEM); - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode = sel_make_inode(sb, S_IFDIR); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + inode->i_op = &swapover_dir_inode_operations; inode->i_ino = ++(*ino); /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - return d_obtain_alias(inode); + inode_lock(sb->s_root->d_inode); + d_add(dentry, inode); + inc_nlink(sb->s_root->d_inode); + inode_unlock(sb->s_root->d_inode); + return dentry; } #define NULL_FILE_NAME "null" From a67d2a14a77eed5dbdace1801bf2255962121bdb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 10 Nov 2023 15:46:49 -0500 Subject: [PATCH 0091/1562] selinux: update filenametr_hash() to use full_name_hash() Using full_name_hash() instead of partial_name_hash() should result in cleaner and better performing code. Suggested-by: Linus Torvalds Signed-off-by: Paul Moore --- security/selinux/ss/policydb.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 595a435ea9c8..bd1e7f26d951 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -409,16 +409,9 @@ out: static u32 filenametr_hash(const void *k) { const struct filename_trans_key *ft = k; - unsigned long hash; - unsigned int byte_num; - unsigned char focus; + unsigned long salt = ft->ttype ^ ft->tclass; - hash = ft->ttype ^ ft->tclass; - - byte_num = 0; - while ((focus = ft->name[byte_num++])) - hash = partial_name_hash(focus, hash); - return hash; + return full_name_hash((void *)salt, ft->name, strlen(ft->name)); } static int filenametr_cmp(const void *k1, const void *k2) From f5364ecfd8c3ec81cf3350caa4629d98408101e5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 15 Nov 2023 11:50:29 -0500 Subject: [PATCH 0092/1562] MAINTAINERS: update the SELinux entry Bring the SELinux entry up to date with the following changes: * Remove the selinuxproject.org link. The wiki located there is in read-only mode and exists primarily for historical reasons. * Add our patchwork link. I'm not sure this is of much use for anyone but the maintainer, but there is a provision for including it here so we might as well include it. * Add a bug report URI. I suspect most everyone knows to send mail to the mailing list if they hit a bug, but let's make it official. * Add a link to the SELinux tree process/management documentation. While the doc exists both in the canonical kernel.org location and the GitHub mirror, provide a link to the mirror as GitHub does a better job rendering the Markdown. * Update the source tree's git URI to use https. Signed-off-by: Paul Moore --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..931c52cc8393 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19520,9 +19520,11 @@ M: Stephen Smalley M: Eric Paris L: selinux@vger.kernel.org S: Supported -W: https://selinuxproject.org W: https://github.com/SELinuxProject -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git +Q: https://patchwork.kernel.org/project/selinux/list +B: mailto:selinux@vger.kernel.org +P: https://github.com/SELinuxProject/selinux-kernel/blob/main/README.md +T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git F: Documentation/ABI/removed/sysfs-selinux-checkreqprot F: Documentation/ABI/removed/sysfs-selinux-disable F: Documentation/admin-guide/LSM/SELinux.rst From e9cdebbe23f1aa9a1caea169862f479ab3fa2773 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 6 Nov 2023 15:24:38 -0600 Subject: [PATCH 0093/1562] dlm: use kernel_connect() and kernel_bind() Recent changes to kernel_connect() and kernel_bind() ensure that callers are insulated from changes to the address parameter made by BPF SOCK_ADDR hooks. This patch wraps direct calls to ops->connect() and ops->bind() with kernel_connect() and kernel_bind() to protect callers in such cases. Link: https://lore.kernel.org/netdev/9944248dba1bce861375fcce9de663934d933ba9.camel@redhat.com/ Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect") Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind") Cc: stable@vger.kernel.org Signed-off-by: Jordan Rife Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 67f8dd8a05ef..6296c62c10fa 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1817,8 +1817,8 @@ static int dlm_tcp_bind(struct socket *sock) memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, - addr_len); + result = kernel_bind(sock, (struct sockaddr *)&src_addr, + addr_len); if (result < 0) { /* This *may* not indicate a critical error */ log_print("could not bind for connect: %d", result); @@ -1830,7 +1830,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); + return kernel_connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1862,8 +1862,8 @@ static int dlm_tcp_listen_bind(struct socket *sock) /* Bind to our port */ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], - addr_len); + return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], + addr_len); } static const struct dlm_proto_ops dlm_tcp_ops = { @@ -1888,12 +1888,12 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, int ret; /* - * Make sock->ops->connect() function return in specified time, + * Make kernel_connect() function return in specified time, * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ sock_set_sndtimeo(sock->sk, 5); - ret = sock->ops->connect(sock, addr, addr_len, 0); + ret = kernel_connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); return ret; } From dbee1adeb7e6d31c9afbad8e9248c15694f1cc0c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 13 Nov 2023 16:24:09 -0500 Subject: [PATCH 0094/1562] dlm: use fl_owner from lockd This patch is changing the fl_owner value in case of an nfs lock request to not be the pid of lockd. Instead this patch changes it to be the owner value that nfs is giving us. Currently there exists proved problems with this behaviour. One nfsd server was created to export a gfs2 filesystem mount. Two nfs clients doing a nfs mount of this export. Those two clients should conflict each other operating on the same nfs file. A small test program was written: int main(int argc, const char *argv[]) { struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = 1L, .l_len = 1L, }; int fd; fd = open("filename", O_RDWR | O_CREAT, 0700); printf("try to lock...\n"); fcntl(fd, F_SETLKW, &fl); printf("locked!\n"); getc(stdin); return 0; } Running on both clients at the same time and don't interrupting by pressing any key. It will show that both clients are able to acquire the lock which shouldn't be the case. The issue is here that the fl_owner value is the same and the lock context of both clients should be separated. This patch lets lockd define how to deal with lock contexts and chose hopefully the right fl_owner value. A test after this patch was made and the locks conflicts each other which should be the case. Acked-by: Jeff Layton Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/plock.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e6b4c1a21446..ee6e0236d4f8 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -145,6 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; + op->info.owner = (__u64)(long)fl->fl_owner; /* async handling */ if (fl->fl_lmops && fl->fl_lmops->lm_grant) { op_data = kzalloc(sizeof(*op_data), GFP_NOFS); @@ -154,9 +155,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, goto out; } - /* fl_owner is lockd which doesn't distinguish - processes on the nfs client */ - op->info.owner = (__u64) fl->fl_pid; op_data->callback = fl->fl_lmops->lm_grant; locks_init_lock(&op_data->flc); locks_copy_lock(&op_data->flc, fl); @@ -168,8 +166,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); rv = FILE_LOCK_DEFERRED; goto out; - } else { - op->info.owner = (__u64)(long) fl->fl_owner; } send_op(op); @@ -326,10 +322,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = (__u64)(long)fl->fl_owner; if (fl->fl_flags & FL_CLOSE) { op->info.flags |= DLM_PLOCK_FL_CLOSE; @@ -389,7 +382,7 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, info.number = number; info.start = fl->fl_start; info.end = fl->fl_end; - info.owner = (__u64)fl->fl_pid; + info.owner = (__u64)(long)fl->fl_owner; rv = do_lock_cancel(&info); switch (rv) { @@ -450,10 +443,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = (__u64)(long)fl->fl_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); From 6bd4a2bfe568d963af721cc5efa52091bf1a3746 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 13 Nov 2023 16:24:10 -0500 Subject: [PATCH 0095/1562] dlm: use FL_SLEEP to determine blocking vs non-blocking This patch uses the FL_SLEEP flag in struct file_lock to determine if the lock request is a blocking or non-blocking request. Before dlm was using IS_SETLKW() was being used which is not usable for lock requests coming from lockd when EXPORT_OP_SAFE_ASYNC_LOCK inside the export flags is set. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/plock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index ee6e0236d4f8..d814c5121367 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -140,7 +140,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.optype = DLM_PLOCK_OP_LOCK; op->info.pid = fl->fl_pid; op->info.ex = (fl->fl_type == F_WRLCK); - op->info.wait = IS_SETLKW(cmd); + op->info.wait = !!(fl->fl_flags & FL_SLEEP); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; From 0c08699744d20ce0bac22b9f291a646a0302e51f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 13 Nov 2023 16:24:11 -0500 Subject: [PATCH 0096/1562] dlm: implement EXPORT_OP_ASYNC_LOCK This patch is activating the EXPORT_OP_ASYNC_LOCK export flag to signal lockd that both filesystems are able to handle async lock requests. The cluster filesystems gfs2 and ocfs2 will redirect their lock requests to DLMs plock implementation that can handle async lock requests. Reviewed-by: Jeff Layton Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/gfs2/export.c | 1 + fs/ocfs2/export.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index cf40895233f5..ef1013eff936 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -192,5 +192,6 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, + .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index eaa8c80ace3c..b8b6a191b5cb 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,4 +280,5 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, + .flags = EXPORT_OP_ASYNC_LOCK, }; From 54a1dc08e1737552e6764f38837b19fae9548fb0 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 15 Nov 2023 20:53:33 +0000 Subject: [PATCH 0097/1562] spi: dt-bindings: renesas,rspi: Document RZ/Five SoC The RSPI block on the RZ/Five SoC is identical to one found on the RZ/G2UL SoC. "renesas,r9a07g043-rspi" compatible string will be used on the RZ/Five SoC so to make this clear and to keep this file consistent, update the comment to include RZ/Five SoC. No driver changes are required as generic compatible string "renesas,rspi-rz" will be used as a fallback on RZ/Five SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231115205333.31076-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/renesas,rspi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/spi/renesas,rspi.yaml b/Documentation/devicetree/bindings/spi/renesas,rspi.yaml index 4d8ec69214c9..0ef3f8421986 100644 --- a/Documentation/devicetree/bindings/spi/renesas,rspi.yaml +++ b/Documentation/devicetree/bindings/spi/renesas,rspi.yaml @@ -21,7 +21,7 @@ properties: - enum: - renesas,rspi-r7s72100 # RZ/A1H - renesas,rspi-r7s9210 # RZ/A2 - - renesas,r9a07g043-rspi # RZ/G2UL + - renesas,r9a07g043-rspi # RZ/G2UL and RZ/Five - renesas,r9a07g044-rspi # RZ/G2{L,LC} - renesas,r9a07g054-rspi # RZ/V2L - const: renesas,rspi-rz From e2e13630f93d942d02f3b3f98660228a3545c60e Mon Sep 17 00:00:00 2001 From: Sam James Date: Tue, 7 Nov 2023 20:55:00 +0000 Subject: [PATCH 0098/1562] objtool: Fix calloc call for new -Walloc-size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 14 introduces a new -Walloc-size included in -Wextra which errors out like: ``` check.c: In function ‘cfi_alloc’: check.c:294:33: error: allocation of insufficient size ‘1’ for type ‘struct cfi_state’ with size ‘320’ [-Werror=alloc-size] 294 | struct cfi_state *cfi = calloc(sizeof(struct cfi_state), 1); | ^~~~~~ ``` The calloc prototype is: ``` void *calloc(size_t nmemb, size_t size); ``` So, just swap the number of members and size arguments to match the prototype, as we're initialising 1 struct of size `sizeof(struct ...)`. GCC then sees we're not doing anything wrong. Signed-off-by: Sam James Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20231107205504.1470006-1-sam@gentoo.org --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e94756e09ca9..548ec3cd7c00 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -291,7 +291,7 @@ static void init_insn_state(struct objtool_file *file, struct insn_state *state, static struct cfi_state *cfi_alloc(void) { - struct cfi_state *cfi = calloc(sizeof(struct cfi_state), 1); + struct cfi_state *cfi = calloc(1, sizeof(struct cfi_state)); if (!cfi) { WARN("calloc failed"); exit(1); From 243218ca93037631f0224fdbefea045912cb761a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:42 -0800 Subject: [PATCH 0099/1562] perf/x86/intel/cstate: Cleanup duplicate attr_groups The events of the cstate_core and cstate_pkg PMU have the same format. They both need to create a "events" group (with empty attrs). The attr_groups can be shared. Remove the dedicated attr_groups for each cstate PMU. Use the shared cstate_attr_groups to replace. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-1-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 44 +++++++++------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index cbeb6d2bf5b4..693bdcd92e8c 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -189,20 +189,20 @@ static struct attribute *attrs_empty[] = { * "events" group (with empty attrs) before updating * it with detected events. */ -static struct attribute_group core_events_attr_group = { +static struct attribute_group cstate_events_attr_group = { .name = "events", .attrs = attrs_empty, }; -DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); -static struct attribute *core_format_attrs[] = { - &format_attr_core_event.attr, +DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event, "config:0-63"); +static struct attribute *cstate_format_attrs[] = { + &format_attr_cstate_event.attr, NULL, }; -static struct attribute_group core_format_attr_group = { +static struct attribute_group cstate_format_attr_group = { .name = "format", - .attrs = core_format_attrs, + .attrs = cstate_format_attrs, }; static cpumask_t cstate_core_cpu_mask; @@ -217,9 +217,9 @@ static struct attribute_group cpumask_attr_group = { .attrs = cstate_cpumask_attrs, }; -static const struct attribute_group *core_attr_groups[] = { - &core_events_attr_group, - &core_format_attr_group, +static const struct attribute_group *cstate_attr_groups[] = { + &cstate_events_attr_group, + &cstate_format_attr_group, &cpumask_attr_group, NULL, }; @@ -268,30 +268,8 @@ static struct perf_msr pkg_msr[] = { [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, }; -static struct attribute_group pkg_events_attr_group = { - .name = "events", - .attrs = attrs_empty, -}; - -DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); -static struct attribute *pkg_format_attrs[] = { - &format_attr_pkg_event.attr, - NULL, -}; -static struct attribute_group pkg_format_attr_group = { - .name = "format", - .attrs = pkg_format_attrs, -}; - static cpumask_t cstate_pkg_cpu_mask; -static const struct attribute_group *pkg_attr_groups[] = { - &pkg_events_attr_group, - &pkg_format_attr_group, - &cpumask_attr_group, - NULL, -}; - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -478,7 +456,7 @@ static const struct attribute_group *pkg_attr_update[] = { }; static struct pmu cstate_core_pmu = { - .attr_groups = core_attr_groups, + .attr_groups = cstate_attr_groups, .attr_update = core_attr_update, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, @@ -493,7 +471,7 @@ static struct pmu cstate_core_pmu = { }; static struct pmu cstate_pkg_pmu = { - .attr_groups = pkg_attr_groups, + .attr_groups = cstate_attr_groups, .attr_update = pkg_attr_update, .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, From c3dd1995620cdcd65cf4944c4164b0dbc16e557c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:43 -0800 Subject: [PATCH 0100/1562] x86/smp: Export symbol cpu_clustergroup_mask() Intel cstate PMU driver will invoke the topology_cluster_cpumask() to retrieve the CPU mask of a cluster. A modpost error is triggered since the symbol cpu_clustergroup_mask is not exported. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-2-kan.liang@linux.intel.com --- arch/x86/kernel/smpboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2cc2aa120b4b..3f57ce68a3f1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -757,6 +757,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu) { return cpu_l2c_shared_mask(cpu); } +EXPORT_SYMBOL_GPL(cpu_clustergroup_mask); static void impress_friends(void) { From 3877d55a0db2688c2e4ab8a319614a0c81f8e2d2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:44 -0800 Subject: [PATCH 0101/1562] perf/x86/intel/cstate: Add Sierra Forest support A new module C6 Residency Counter is introduced in the Sierra Forest. The scope of the new counter is module (A cluster of cores shared L2 cache). Create a brand new cstate_module PMU to profile the new counter. The only differences between the new cstate_module PMU and the existing cstate PMU are the scope and events. Regarding the choice of the new cstate_module PMU name, the current naming rule of a cstate PMU is "cstate_" + the scope of the PMU. The scope of the PMU is the cores shared L2. On SRF, Intel calls it "module", while the internal Linux sched code calls it "cluster". The "cstate_module" is used as the new PMU name, because - The Cstate PMU driver is a Intel specific driver. It doesn't impact other ARCHs. The name makes it consistent with the documentation. - The "cluster" mainly be used by the scheduler developer, while the user of cstate PMU is more likely a researcher reading HW docs and optimizing power. - In the Intel's SDM, the "cluster" has a different meaning/scope for topology. Using it will mislead the end users. Besides the module C6, the core C1/C6 and pkg C6 residency counters are supported in the Sierra Forest as well. Suggested-by: Artem Bityutskiy Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 113 +++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 693bdcd92e8c..4a46ef315284 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL + * MTL,SRF * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -52,7 +52,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -75,7 +75,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -97,6 +97,10 @@ * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, * TNT,RKL,ADL,RPL,MTL * Scope: Package (physical package) + * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. + * perf code: 0x00 + * Available model: SRF + * Scope: A cluster of cores shared L2 cache * */ @@ -130,6 +134,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct cstate_model { unsigned long core_events; unsigned long pkg_events; + unsigned long module_events; unsigned long quirks; }; @@ -270,6 +275,28 @@ static struct perf_msr pkg_msr[] = { static cpumask_t cstate_pkg_cpu_mask; +/* cstate_module PMU */ +static struct pmu cstate_module_pmu; +static bool has_cstate_module; + +enum perf_cstate_module_events { + PERF_CSTATE_MODULE_C6_RES = 0, + + PERF_CSTATE_MODULE_EVENT_MAX, +}; + +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_module_c6, "event=0x00"); + +static unsigned long module_msr_mask; + +PMU_EVENT_GROUP(events, cstate_module_c6); + +static struct perf_msr module_msr[] = { + [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr }, +}; + +static cpumask_t cstate_module_cpu_mask; + static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -280,6 +307,8 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); else if (pmu == &cstate_pkg_pmu) return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else if (pmu == &cstate_module_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask); else return 0; } @@ -320,6 +349,15 @@ static int cstate_pmu_event_init(struct perf_event *event) event->hw.event_base = pkg_msr[cfg].msr; cpu = cpumask_any_and(&cstate_pkg_cpu_mask, topology_die_cpumask(event->cpu)); + } else if (event->pmu == &cstate_module_pmu) { + if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX) + return -EINVAL; + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX); + if (!(module_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = module_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_module_cpu_mask, + topology_cluster_cpumask(event->cpu)); } else { return -ENOENT; } @@ -407,6 +445,17 @@ static int cstate_cpu_exit(unsigned int cpu) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); } } + + if (has_cstate_module && + cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) { + + target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &cstate_module_cpu_mask); + perf_pmu_migrate_context(&cstate_module_pmu, cpu, target); + } + } return 0; } @@ -433,6 +482,15 @@ static int cstate_cpu_init(unsigned int cpu) if (has_cstate_pkg && target >= nr_cpu_ids) cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + /* + * If this is the first online thread of that cluster, set it + * in the cluster cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_module_cpu_mask, + topology_cluster_cpumask(cpu)); + if (has_cstate_module && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_module_cpu_mask); + return 0; } @@ -455,6 +513,11 @@ static const struct attribute_group *pkg_attr_update[] = { NULL, }; +static const struct attribute_group *module_attr_update[] = { + &group_cstate_module_c6, + NULL +}; + static struct pmu cstate_core_pmu = { .attr_groups = cstate_attr_groups, .attr_update = core_attr_update, @@ -485,6 +548,21 @@ static struct pmu cstate_pkg_pmu = { .module = THIS_MODULE, }; +static struct pmu cstate_module_pmu = { + .attr_groups = cstate_attr_groups, + .attr_update = module_attr_update, + .name = "cstate_module", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, +}; + static const struct cstate_model nhm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -599,6 +677,15 @@ static const struct cstate_model glm_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model srf_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), +}; + static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates), @@ -651,6 +738,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), @@ -692,10 +780,14 @@ static int __init cstate_probe(const struct cstate_model *cm) pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX, true, (void *) &cm->pkg_events); + module_msr_mask = perf_msr_probe(module_msr, PERF_CSTATE_MODULE_EVENT_MAX, + true, (void *) &cm->module_events); + has_cstate_core = !!core_msr_mask; has_cstate_pkg = !!pkg_msr_mask; + has_cstate_module = !!module_msr_mask; - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; + return (has_cstate_core || has_cstate_pkg || has_cstate_module) ? 0 : -ENODEV; } static inline void cstate_cleanup(void) @@ -708,6 +800,9 @@ static inline void cstate_cleanup(void) if (has_cstate_pkg) perf_pmu_unregister(&cstate_pkg_pmu); + + if (has_cstate_module) + perf_pmu_unregister(&cstate_module_pmu); } static int __init cstate_init(void) @@ -744,6 +839,16 @@ static int __init cstate_init(void) return err; } } + + if (has_cstate_module) { + err = perf_pmu_register(&cstate_module_pmu, cstate_module_pmu.name, -1); + if (err) { + has_cstate_module = false; + pr_info("Failed to register cstate cluster pmu\n"); + cstate_cleanup(); + return err; + } + } return 0; } From bbb968696d0f3442ab823598def3b756cf4735c6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:45 -0800 Subject: [PATCH 0102/1562] perf/x86/intel/cstate: Add Grand Ridge support The same as the Sierra Forest, the Grand Ridge supports core C1/C6 and module C6. But it doesn't support pkg C6 residency counter. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-4-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4a46ef315284..4b50a3a9818a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF + * MTL,SRF,GRR * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -52,7 +52,8 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, + * GRR * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -99,7 +100,7 @@ * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 - * Available model: SRF + * Available model: SRF,GRR * Scope: A cluster of cores shared L2 cache * */ @@ -677,6 +678,13 @@ static const struct cstate_model glm_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model grr_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), +}; + static const struct cstate_model srf_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -739,6 +747,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), From 18a813a1f94abbab14248071ca551e491bbc2abe Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Fri, 17 Nov 2023 20:10:53 +0530 Subject: [PATCH 0103/1562] spi: intel: make mem_ops comparison unique to opcode match Instead of comparing parameters for every supported mem_ops, only compare on opcode match, which is relatively more efficient. Signed-off-by: Raag Jadav Reviewed-by: Mika Westerberg Link: https://lore.kernel.org/r/20231117144053.24005-1-raag.jadav@intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-intel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-intel.c b/drivers/spi/spi-intel.c index 98ec4dc22b81..3654ae35d2db 100644 --- a/drivers/spi/spi-intel.c +++ b/drivers/spi/spi-intel.c @@ -711,8 +711,7 @@ static bool intel_spi_cmp_mem_op(const struct intel_spi_mem_op *iop, { if (iop->mem_op.cmd.nbytes != op->cmd.nbytes || iop->mem_op.cmd.buswidth != op->cmd.buswidth || - iop->mem_op.cmd.dtr != op->cmd.dtr || - iop->mem_op.cmd.opcode != op->cmd.opcode) + iop->mem_op.cmd.dtr != op->cmd.dtr) return false; if (iop->mem_op.addr.nbytes != op->addr.nbytes || @@ -737,11 +736,12 @@ intel_spi_match_mem_op(struct intel_spi *ispi, const struct spi_mem_op *op) const struct intel_spi_mem_op *iop; for (iop = ispi->mem_ops; iop->mem_op.cmd.opcode; iop++) { - if (intel_spi_cmp_mem_op(iop, op)) - break; + if (iop->mem_op.cmd.opcode == op->cmd.opcode && + intel_spi_cmp_mem_op(iop, op)) + return iop; } - return iop->mem_op.cmd.opcode ? iop : NULL; + return NULL; } static bool intel_spi_supports_mem_op(struct spi_mem *mem, From 923fb6238cb3ac529aa2bf13b3b1e53762186a8b Mon Sep 17 00:00:00 2001 From: Ronald Monthero Date: Sat, 18 Nov 2023 18:31:51 +1000 Subject: [PATCH 0104/1562] mtd: rawnand: Increment IFC_TIMEOUT_MSECS for nand controller response Under heavy load it is likely that the controller is done with its own task but the thread unlocking the wait is not scheduled in time. Increasing IFC_TIMEOUT_MSECS allows the controller to respond within allowable timeslice of 1 sec. fsl,ifc-nand 7e800000.nand: Controller is not responding [<804b2047>] (nand_get_device) from [<804b5335>] (nand_write_oob+0x1b/0x4a) [<804b5335>] (nand_write_oob) from [<804a3585>] (mtd_write+0x41/0x5c) [<804a3585>] (mtd_write) from [<804c1d47>] (ubi_io_write+0x17f/0x22c) [<804c1d47>] (ubi_io_write) from [<804c047b>] (ubi_eba_write_leb+0x5b/0x1d0) Fixes: 82771882d960 ("NAND Machine support for Integrated Flash Controller") Reviewed-by: Miquel Raynal Reviewed-by: Andy Shevchenko Signed-off-by: Ronald Monthero Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231118083156.776887-1-debug.penguin32@gmail.com --- drivers/mtd/nand/raw/fsl_ifc_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/fsl_ifc_nand.c b/drivers/mtd/nand/raw/fsl_ifc_nand.c index 20bb1e0cb5eb..f0e2318ce088 100644 --- a/drivers/mtd/nand/raw/fsl_ifc_nand.c +++ b/drivers/mtd/nand/raw/fsl_ifc_nand.c @@ -21,7 +21,7 @@ #define ERR_BYTE 0xFF /* Value returned for read bytes when read failed */ -#define IFC_TIMEOUT_MSECS 500 /* Maximum number of mSecs to wait +#define IFC_TIMEOUT_MSECS 1000 /* Maximum timeout to wait for IFC NAND Machine */ struct fsl_ifc_ctrl; From acb1fd579efbcac26ce8f9c4fc8bd82f7eaa56e9 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Mon, 20 Nov 2023 09:42:39 +0300 Subject: [PATCH 0105/1562] mtd: rawnand: meson: initialize clock register Clock register must be also initialized during controller probing. If this is not performed (for example by bootloader before) - controller will not work. Signed-off-by: Arseniy Krasnov Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231120064239.3304108-1-avkrasnov@salutedevices.com --- drivers/mtd/nand/raw/meson_nand.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index 71ec4052e52a..7e16a13fb438 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -90,6 +90,8 @@ /* eMMC clock register, misc control */ #define CLK_SELECT_NAND BIT(31) +#define CLK_ALWAYS_ON_NAND BIT(24) +#define CLK_SELECT_FIX_PLL2 BIT(6) #define NFC_CLK_CYCLE 6 @@ -1154,7 +1156,7 @@ static int meson_nfc_clk_init(struct meson_nfc *nfc) return PTR_ERR(nfc->nand_clk); /* init SD_EMMC_CLOCK to sane defaults w/min clock rate */ - writel(CLK_SELECT_NAND | readl(nfc->reg_clk), + writel(CLK_ALWAYS_ON_NAND | CLK_SELECT_NAND | CLK_SELECT_FIX_PLL2, nfc->reg_clk); ret = clk_prepare_enable(nfc->core_clk); From f25d34646bd01505a0989ca67bc9a37390cae755 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:23 +0100 Subject: [PATCH 0106/1562] platform/x86: wmi: Add wmidev_block_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, WMI drivers have to use the deprecated GUID-based interface when setting data blocks. This prevents those drivers from fully moving away from this interface. Provide wmidev_block_set() so drivers using wmi_set_block() can fully migrate to the modern bus-based interface. Tested with a custom SSDT from the Intel Slim Bootloader project. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-1-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 64 ++++++++++++++++++++------------------ include/linux/wmi.h | 2 ++ 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 5c27b4aa9690..9d9a050e7086 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -536,41 +536,50 @@ EXPORT_SYMBOL_GPL(wmidev_block_query); * * Return: acpi_status signaling success or error. */ -acpi_status wmi_set_block(const char *guid_string, u8 instance, - const struct acpi_buffer *in) +acpi_status wmi_set_block(const char *guid_string, u8 instance, const struct acpi_buffer *in) { - struct wmi_block *wblock; - struct guid_block *block; struct wmi_device *wdev; - acpi_handle handle; - struct acpi_object_list input; - union acpi_object params[2]; - char method[WMI_ACPI_METHOD_NAME_SIZE]; acpi_status status; - if (!in) - return AE_BAD_DATA; - wdev = wmi_find_device_by_guid(guid_string); if (IS_ERR(wdev)) return AE_ERROR; - wblock = container_of(wdev, struct wmi_block, dev); - block = &wblock->gblock; - handle = wblock->acpi_device->handle; + status = wmidev_block_set(wdev, instance, in); + wmi_device_put(wdev); - if (block->instance_count <= instance) { - status = AE_BAD_PARAMETER; + return status; +} +EXPORT_SYMBOL_GPL(wmi_set_block); - goto err_wdev_put; - } +/** + * wmidev_block_set - Write to a WMI block + * @wdev: A wmi bus device from a driver + * @instance: Instance index + * @in: Buffer containing new values for the data block + * + * Write contents of the input buffer to an ACPI-WMI data block. + * + * Return: acpi_status signaling success or error. + */ +acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in) +{ + struct wmi_block *wblock = container_of(wdev, struct wmi_block, dev); + acpi_handle handle = wblock->acpi_device->handle; + struct guid_block *block = &wblock->gblock; + char method[WMI_ACPI_METHOD_NAME_SIZE]; + struct acpi_object_list input; + union acpi_object params[2]; + + if (!in) + return AE_BAD_DATA; + + if (block->instance_count <= instance) + return AE_BAD_PARAMETER; /* Check GUID is a data block */ - if (block->flags & (ACPI_WMI_EVENT | ACPI_WMI_METHOD)) { - status = AE_ERROR; - - goto err_wdev_put; - } + if (block->flags & (ACPI_WMI_EVENT | ACPI_WMI_METHOD)) + return AE_ERROR; input.count = 2; input.pointer = params; @@ -582,14 +591,9 @@ acpi_status wmi_set_block(const char *guid_string, u8 instance, get_acpi_method_name(wblock, 'S', method); - status = acpi_evaluate_object(handle, method, &input, NULL); - -err_wdev_put: - wmi_device_put(wdev); - - return status; + return acpi_evaluate_object(handle, method, &input, NULL); } -EXPORT_SYMBOL_GPL(wmi_set_block); +EXPORT_SYMBOL_GPL(wmidev_block_set); static void wmi_dump_wdg(const struct guid_block *g) { diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 763bd382cf2d..207544968268 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -35,6 +35,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); +acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); + u8 wmidev_instance_count(struct wmi_device *wdev); extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); From 7275bf3e09578e1761157e7683f2e898c5c235a6 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:24 +0100 Subject: [PATCH 0107/1562] platform/x86: wmi: Add to_wmi_device() helper macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper macro for WMI drivers to cast a device to the corresponding WMI device. This should replace some boilerplate code. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/wmi.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 207544968268..8a643c39fcce 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -27,6 +27,14 @@ struct wmi_device { bool setable; }; +/** + * to_wmi_device() - Helper macro to cast a device to a wmi_device + * @device: device struct + * + * Cast a struct device to a struct wmi_device. + */ +#define to_wmi_device(device) container_of(device, struct wmi_device, dev) + extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, const struct acpi_buffer *in, From 75c487fcb69c981f9bd21f91e6e3b8b2080d7ab0 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:25 +0100 Subject: [PATCH 0108/1562] platform/x86: intel-wmi-sbl-fw-update: Use bus-based WMI interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the driver was still using the deprecated GUID-based interface to query/set data blocks. Use the modern bus-based interface for this. Tested with a custom SSDT from the Intel Slim Bootloader project. Reviewed-by: Jithu Joseph Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-3-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/wmi/sbl-fw-update.c | 13 ++++--------- drivers/platform/x86/wmi.c | 1 + 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/intel/wmi/sbl-fw-update.c b/drivers/platform/x86/intel/wmi/sbl-fw-update.c index 3c86e0108a24..9cf5ed0f8dc2 100644 --- a/drivers/platform/x86/intel/wmi/sbl-fw-update.c +++ b/drivers/platform/x86/intel/wmi/sbl-fw-update.c @@ -25,18 +25,13 @@ static int get_fwu_request(struct device *dev, u32 *out) { - struct acpi_buffer result = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *obj; - acpi_status status; - status = wmi_query_block(INTEL_WMI_SBL_GUID, 0, &result); - if (ACPI_FAILURE(status)) { - dev_err(dev, "wmi_query_block failed\n"); + obj = wmidev_block_query(to_wmi_device(dev), 0); + if (!obj) return -ENODEV; - } - obj = (union acpi_object *)result.pointer; - if (!obj || obj->type != ACPI_TYPE_INTEGER) { + if (obj->type != ACPI_TYPE_INTEGER) { dev_warn(dev, "wmi_query_block returned invalid value\n"); kfree(obj); return -EINVAL; @@ -58,7 +53,7 @@ static int set_fwu_request(struct device *dev, u32 in) input.length = sizeof(u32); input.pointer = &value; - status = wmi_set_block(INTEL_WMI_SBL_GUID, 0, &input); + status = wmidev_block_set(to_wmi_device(dev), 0, &input); if (ACPI_FAILURE(status)) { dev_err(dev, "wmi_set_block failed\n"); return -ENODEV; diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 9d9a050e7086..4c4effc883ae 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -106,6 +106,7 @@ MODULE_DEVICE_TABLE(acpi, wmi_device_ids); static const char * const allow_duplicates[] = { "05901221-D566-11D1-B2F0-00A0C9062910", /* wmi-bmof */ "8A42EA14-4F2A-FD45-6422-0087F7A7E608", /* dell-wmi-ddv */ + "44FADEB1-B204-40F2-8581-394BBDC1B651", /* intel-wmi-sbl-fw-update */ NULL }; From 2340f12023efa7dc256f496d85d2411ca47cb9a2 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:26 +0100 Subject: [PATCH 0109/1562] platform/x86/intel/wmi: thunderbolt: Use bus-based WMI interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the driver still uses the legacy GUID-based interface to invoke WMI methods. Use the modern bus-based interface instead. Tested on a Lenovo E51-80. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-4-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/wmi/thunderbolt.c | 3 +-- drivers/platform/x86/wmi.c | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/wmi/thunderbolt.c b/drivers/platform/x86/intel/wmi/thunderbolt.c index fc333ff82d1e..e2ad3f46f356 100644 --- a/drivers/platform/x86/intel/wmi/thunderbolt.c +++ b/drivers/platform/x86/intel/wmi/thunderbolt.c @@ -32,8 +32,7 @@ static ssize_t force_power_store(struct device *dev, mode = hex_to_bin(buf[0]); dev_dbg(dev, "force_power: storing %#x\n", mode); if (mode == 0 || mode == 1) { - status = wmi_evaluate_method(INTEL_WMI_THUNDERBOLT_GUID, 0, 1, - &input, NULL); + status = wmidev_evaluate_method(to_wmi_device(dev), 0, 1, &input, NULL); if (ACPI_FAILURE(status)) { dev_dbg(dev, "force_power: failed to evaluate ACPI method\n"); return -ENODEV; diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 4c4effc883ae..cb7e74f2b009 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -107,6 +107,7 @@ static const char * const allow_duplicates[] = { "05901221-D566-11D1-B2F0-00A0C9062910", /* wmi-bmof */ "8A42EA14-4F2A-FD45-6422-0087F7A7E608", /* dell-wmi-ddv */ "44FADEB1-B204-40F2-8581-394BBDC1B651", /* intel-wmi-sbl-fw-update */ + "86CCFD48-205E-4A77-9C48-2021CBEDE341", /* intel-wmi-thunderbolt */ NULL }; From 57eb82ff34e3e3dfa95a80c40ef5a4764c833ec6 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Fri, 3 Nov 2023 23:54:08 +0000 Subject: [PATCH 0110/1562] platform/mellanox: mlxbf-tmfifo: Remove unnecessary bool conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes coccinelle warning in macro function IS_VRING_DROP() which complains conversion to bool not needed here. Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/ZUWIIKbz4vukl8qb@octinomon Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxbf-tmfifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c index 5c683b4eaf10..ed16ec422a7b 100644 --- a/drivers/platform/mellanox/mlxbf-tmfifo.c +++ b/drivers/platform/mellanox/mlxbf-tmfifo.c @@ -91,7 +91,7 @@ struct mlxbf_tmfifo_vring { /* Check whether vring is in drop mode. */ #define IS_VRING_DROP(_r) ({ \ typeof(_r) (r) = (_r); \ - (r->desc_head == &r->drop_desc ? true : false); }) + r->desc_head == &r->drop_desc; }) /* A stub length to drop maximum length packet. */ #define VRING_DROP_DESC_MAX_LEN GENMASK(15, 0) From 8d437a0b68c175ed591322e53b7e1f91094abfd5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 4 Nov 2023 21:58:25 +0100 Subject: [PATCH 0111/1562] ACPI: scan: Add LNXVIDEO HID to ignore_serial_bus_ids[] The I2C-core already has filtering to skip i2c_client instantiation for LNXVIDEO acpi_device-s with I2cSerialBus resources, since LNXVIDEO devices are not i2c_client-s and are handled by the acpi_video driver. This filtering was added to i2c-core-acpi.c in commit 3a4991a9864c ("i2c: acpi: Do not create i2c-clients for LNXVIDEO ACPI devices"). Now a similar problem has shown up where the SPI-core is instantiating an unwanted SPI-device for a SpiSerialBus resource under a LNXVIDEO acpi_device. On a Lenovo Yoga Tab 3 YT3-X90F this unwanted SPI-device instanstantiation causes the SPI-device instanstantiation for the WM5102 audio codec to fail with: [ 21.988441] pxa2xx-spi 8086228E:00: chipselect 0 already in use Instead of duplicating the I2C-core filtering in the SPI-core code, push the filtering of SerialBus resources under LNXVIDEO acpi_device-s up into the ACPI-core by adding the LNXVIDEO HID to ignore_serial_bus_ids[]. Note the filtering in the I2C-core i2c_acpi_do_lookup() function is still necessary because this not only impacts i2c_client instantiation but it also makes the I2C-core ignore the I2cSerialBus resource when checking what the maximum speed is the I2C bus supports, which is still necessary. Acked-by: Rafael J. Wysocki Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231104205828.63139-1-hdegoede@redhat.com --- drivers/acpi/scan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index fa5dd71a80fa..46a9238c72c6 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1727,6 +1727,7 @@ static bool acpi_device_enumeration_by_parent(struct acpi_device *device) * Some ACPI devs contain SerialBus resources even though they are not * attached to a serial bus at all. */ + {ACPI_VIDEO_HID, }, {"MSHW0028", }, /* * HIDs of device with an UartSerialBusV2 resource for which userspace From 70505ea6de24093136103cedcf2deeb85891ed6c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 4 Nov 2023 21:58:26 +0100 Subject: [PATCH 0112/1562] platform/x86: x86-android-tablets: Add support for SPI device instantiation Some x86 Android tablets have SPI devices which are not properly described in their DSDT. Add support for instantiating SPI devices. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231104205828.63139-2-hdegoede@redhat.com --- .../platform/x86/x86-android-tablets/core.c | 62 +++++++++++++++++++ .../x86-android-tablets/x86-android-tablets.h | 9 +++ 2 files changed, 71 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets/core.c b/drivers/platform/x86/x86-android-tablets/core.c index b55957bde034..6a5975ac3286 100644 --- a/drivers/platform/x86/x86-android-tablets/core.c +++ b/drivers/platform/x86/x86-android-tablets/core.c @@ -141,9 +141,11 @@ int x86_acpi_irq_helper_get(const struct x86_acpi_irq_data *data) } static int i2c_client_count; +static int spi_dev_count; static int pdev_count; static int serdev_count; static struct i2c_client **i2c_clients; +static struct spi_device **spi_devs; static struct platform_device **pdevs; static struct serdev_device **serdevs; static struct gpio_keys_button *buttons; @@ -185,6 +187,46 @@ static __init int x86_instantiate_i2c_client(const struct x86_dev_info *dev_info return 0; } +static __init int x86_instantiate_spi_dev(const struct x86_dev_info *dev_info, int idx) +{ + const struct x86_spi_dev_info *spi_dev_info = &dev_info->spi_dev_info[idx]; + struct spi_board_info board_info = spi_dev_info->board_info; + struct spi_controller *controller; + struct acpi_device *adev; + acpi_handle handle; + acpi_status status; + + board_info.irq = x86_acpi_irq_helper_get(&spi_dev_info->irq_data); + if (board_info.irq < 0) + return board_info.irq; + + status = acpi_get_handle(NULL, spi_dev_info->ctrl_path, &handle); + if (ACPI_FAILURE(status)) { + pr_err("Error could not get %s handle\n", spi_dev_info->ctrl_path); + return -ENODEV; + } + + adev = acpi_fetch_acpi_dev(handle); + if (!adev) { + pr_err("Error could not get adev for %s\n", spi_dev_info->ctrl_path); + return -ENODEV; + } + + controller = acpi_spi_find_controller_by_adev(adev); + if (!controller) { + pr_err("Error could not get SPI controller for %s\n", spi_dev_info->ctrl_path); + return -ENODEV; + } + + spi_devs[idx] = spi_new_device(controller, &board_info); + put_device(&controller->dev); + if (IS_ERR(spi_devs[idx])) + return dev_err_probe(&controller->dev, PTR_ERR(spi_devs[idx]), + "creating SPI-device %d\n", idx); + + return 0; +} + static __init int x86_instantiate_serdev(const struct x86_serdev_info *info, int idx) { struct acpi_device *ctrl_adev, *serdev_adev; @@ -263,6 +305,11 @@ static void x86_android_tablet_remove(struct platform_device *pdev) kfree(pdevs); kfree(buttons); + for (i = 0; i < spi_dev_count; i++) + spi_unregister_device(spi_devs[i]); + + kfree(spi_devs); + for (i = 0; i < i2c_client_count; i++) i2c_unregister_device(i2c_clients[i]); @@ -333,6 +380,21 @@ static __init int x86_android_tablet_probe(struct platform_device *pdev) } } + spi_devs = kcalloc(dev_info->spi_dev_count, sizeof(*spi_devs), GFP_KERNEL); + if (!spi_devs) { + x86_android_tablet_remove(pdev); + return -ENOMEM; + } + + spi_dev_count = dev_info->spi_dev_count; + for (i = 0; i < spi_dev_count; i++) { + ret = x86_instantiate_spi_dev(dev_info, i); + if (ret < 0) { + x86_android_tablet_remove(pdev); + return ret; + } + } + /* + 1 to make space for (optional) gpio_keys_button pdev */ pdevs = kcalloc(dev_info->pdev_count + 1, sizeof(*pdevs), GFP_KERNEL); if (!pdevs) { diff --git a/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h b/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h index 9d2fb7fded6d..49fed9410adb 100644 --- a/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h +++ b/drivers/platform/x86/x86-android-tablets/x86-android-tablets.h @@ -14,6 +14,7 @@ #include #include #include +#include struct gpio_desc; struct gpiod_lookup_table; @@ -48,6 +49,12 @@ struct x86_i2c_client_info { struct x86_acpi_irq_data irq_data; }; +struct x86_spi_dev_info { + struct spi_board_info board_info; + char *ctrl_path; + struct x86_acpi_irq_data irq_data; +}; + struct x86_serdev_info { const char *ctrl_hid; const char *ctrl_uid; @@ -72,10 +79,12 @@ struct x86_dev_info { const struct software_node *bat_swnode; struct gpiod_lookup_table * const *gpiod_lookup_tables; const struct x86_i2c_client_info *i2c_client_info; + const struct x86_spi_dev_info *spi_dev_info; const struct platform_device_info *pdev_info; const struct x86_serdev_info *serdev_info; const struct x86_gpio_button *gpio_button; int i2c_client_count; + int spi_dev_count; int pdev_count; int serdev_count; int gpio_button_count; From 115779bf6abef3161c72311614a16d06d7216213 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 4 Nov 2023 21:58:27 +0100 Subject: [PATCH 0113/1562] platform/x86: x86-android-tablets: Add audio codec info for Lenovo Yoga Tab 3 Pro YT3-X90F The SPI attached WM5102 codec on the Lenovo Yoga Tab 3 Pro YT3-X90F is not described in the ACPI tables. Add info to instantiate the SPI device for the codec manually. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231104205828.63139-3-hdegoede@redhat.com --- .../platform/x86/x86-android-tablets/lenovo.c | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets/lenovo.c b/drivers/platform/x86/x86-android-tablets/lenovo.c index c1e68211283f..0bc6a74b8beb 100644 --- a/drivers/platform/x86/x86-android-tablets/lenovo.c +++ b/drivers/platform/x86/x86-android-tablets/lenovo.c @@ -12,6 +12,8 @@ #include #include +#include +#include #include #include #include @@ -659,6 +661,88 @@ static const struct x86_i2c_client_info lenovo_yt3_i2c_clients[] __initconst = { } }; +/* + * The AOSP 3.5 mm Headset: Accessory Specification gives the following values: + * Function A Play/Pause: 0 ohm + * Function D Voice assistant: 135 ohm + * Function B Volume Up 240 ohm + * Function C Volume Down 470 ohm + * Minimum Mic DC resistance 1000 ohm + * Minimum Ear speaker impedance 16 ohm + * Note the first max value below must be less then the min. speaker impedance, + * to allow CTIA/OMTP detection to work. The other max values are the closest + * value from extcon-arizona.c:arizona_micd_levels halfway 2 button resistances. + */ +static const struct arizona_micd_range arizona_micd_aosp_ranges[] = { + { .max = 11, .key = KEY_PLAYPAUSE }, + { .max = 186, .key = KEY_VOICECOMMAND }, + { .max = 348, .key = KEY_VOLUMEUP }, + { .max = 752, .key = KEY_VOLUMEDOWN }, +}; + +/* YT3 WM5102 arizona_micd_config comes from Android kernel sources */ +static struct arizona_micd_config lenovo_yt3_wm5102_micd_config[] = { + { 0, 1, 0 }, + { ARIZONA_ACCDET_SRC, 2, 1 }, +}; + +static struct arizona_pdata lenovo_yt3_wm5102_pdata = { + .irq_flags = IRQF_TRIGGER_LOW, + .micd_detect_debounce = 200, + .micd_ranges = arizona_micd_aosp_ranges, + .num_micd_ranges = ARRAY_SIZE(arizona_micd_aosp_ranges), + .hpdet_channel = ARIZONA_ACCDET_MODE_HPL, + + /* Below settings come from Android kernel sources */ + .micd_bias_start_time = 1, + .micd_rate = 6, + .micd_configs = lenovo_yt3_wm5102_micd_config, + .num_micd_configs = ARRAY_SIZE(lenovo_yt3_wm5102_micd_config), + .micbias = { + [0] = { /* MICBIAS1 */ + .mV = 2800, + .ext_cap = 1, + .discharge = 1, + .soft_start = 0, + .bypass = 0, + }, + [1] = { /* MICBIAS2 */ + .mV = 2800, + .ext_cap = 1, + .discharge = 1, + .soft_start = 0, + .bypass = 0, + }, + [2] = { /* MICBIAS2 */ + .mV = 2800, + .ext_cap = 1, + .discharge = 1, + .soft_start = 0, + .bypass = 0, + }, + }, +}; + +static const struct x86_spi_dev_info lenovo_yt3_spi_devs[] __initconst = { + { + /* WM5102 codec */ + .board_info = { + .modalias = "wm5102", + .platform_data = &lenovo_yt3_wm5102_pdata, + .max_speed_hz = 5000000, + }, + .ctrl_path = "\\_SB_.PCI0.SPI1", + .irq_data = { + .type = X86_ACPI_IRQ_TYPE_GPIOINT, + .chip = "INT33FF:00", + .index = 91, + .trigger = ACPI_LEVEL_SENSITIVE, + .polarity = ACPI_ACTIVE_LOW, + .con_id = "wm5102_irq", + }, + } +}; + static int __init lenovo_yt3_init(void) { int ret; @@ -702,14 +786,28 @@ static struct gpiod_lookup_table lenovo_yt3_hideep_gpios = { }, }; +static struct gpiod_lookup_table lenovo_yt3_wm5102_gpios = { + .dev_id = "spi1.0", + .table = { + GPIO_LOOKUP("INT33FF:00", 75, "wlf,spkvdd-ena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("INT33FF:00", 81, "wlf,ldoena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("INT33FF:00", 82, "reset", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("arizona", 2, "wlf,micd-pol", GPIO_ACTIVE_HIGH), + { } + }, +}; + static struct gpiod_lookup_table * const lenovo_yt3_gpios[] = { &lenovo_yt3_hideep_gpios, + &lenovo_yt3_wm5102_gpios, NULL }; const struct x86_dev_info lenovo_yt3_info __initconst = { .i2c_client_info = lenovo_yt3_i2c_clients, .i2c_client_count = ARRAY_SIZE(lenovo_yt3_i2c_clients), + .spi_dev_info = lenovo_yt3_spi_devs, + .spi_dev_count = ARRAY_SIZE(lenovo_yt3_spi_devs), .gpiod_lookup_tables = lenovo_yt3_gpios, .init = lenovo_yt3_init, }; From 93ec6f222c680cef282a2e94fc42a130d34179b2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 4 Nov 2023 21:58:28 +0100 Subject: [PATCH 0114/1562] platform/x86: x86-android-tablets: Fix backlight ctrl for Lenovo Yoga Tab 3 Pro YT3-X90F Fix the maximum brightness being much too low on the Yoga Tab 3 Pro. The LP8557 backlight controller can either be configured to multiply its PWM input and the I2C register set level (requiring both to be at 100% for 100% output); or to only take the I2C register set level into account. Multiplying the 2 levels is useful because this will turn off the backlight when the panel goes off and turns off its PWM output. But on the YT3-X90F the panel's PWM output defaults to a duty-cycle of much less then 100%, severely limiting max brightness. In this case the LP8557 should be configured to only take the I2C register into account and the i915 driver must turn off the backlight separately using a VBT MIPI sequence to turn off the backlight. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231104205828.63139-4-hdegoede@redhat.com --- .../platform/x86/x86-android-tablets/lenovo.c | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets/lenovo.c b/drivers/platform/x86/x86-android-tablets/lenovo.c index 0bc6a74b8beb..f1c66a61bfc5 100644 --- a/drivers/platform/x86/x86-android-tablets/lenovo.c +++ b/drivers/platform/x86/x86-android-tablets/lenovo.c @@ -34,12 +34,30 @@ * * To avoid having to have a similar hack in the mainline kernel program the * LP8557 to directly set the level and use the lp855x_bl driver for control. + * + * The LP8557 can either be configured to multiply its PWM input and + * the I2C register set level (requiring both to be at 100% for 100% output); + * or to only take the I2C register set level into account. + * + * Multiplying the 2 levels is useful because this will turn off the backlight + * when the panel goes off and turns off its PWM output. + * + * But on some models the panel's PWM output defaults to a duty-cycle of + * much less then 100%, severely limiting max brightness. In this case + * the LP8557 should be configured to only take the I2C register into + * account and the i915 driver must turn off the panel and the backlight + * separately using e.g. VBT MIPI sequences to turn off the backlight. */ -static struct lp855x_platform_data lenovo_lp8557_pdata = { +static struct lp855x_platform_data lenovo_lp8557_pwm_and_reg_pdata = { .device_control = 0x86, .initial_brightness = 128, }; +static struct lp855x_platform_data lenovo_lp8557_reg_only_pdata = { + .device_control = 0x85, + .initial_brightness = 128, +}; + /* Lenovo Yoga Book X90F / X90L's Android factory img has everything hardcoded */ static const struct property_entry lenovo_yb1_x90_wacom_props[] = { @@ -122,7 +140,7 @@ static const struct x86_i2c_client_info lenovo_yb1_x90_i2c_clients[] __initconst .type = "lp8557", .addr = 0x2c, .dev_name = "lp8557", - .platform_data = &lenovo_lp8557_pdata, + .platform_data = &lenovo_lp8557_pwm_and_reg_pdata, }, .adapter_path = "\\_SB_.PCI0.I2C4", }, { @@ -358,7 +376,7 @@ static struct x86_i2c_client_info lenovo_yoga_tab2_830_1050_i2c_clients[] __init .type = "lp8557", .addr = 0x2c, .dev_name = "lp8557", - .platform_data = &lenovo_lp8557_pdata, + .platform_data = &lenovo_lp8557_pwm_and_reg_pdata, }, .adapter_path = "\\_SB_.I2C3", }, @@ -655,7 +673,7 @@ static const struct x86_i2c_client_info lenovo_yt3_i2c_clients[] __initconst = { .type = "lp8557", .addr = 0x2c, .dev_name = "lp8557", - .platform_data = &lenovo_lp8557_pdata, + .platform_data = &lenovo_lp8557_reg_only_pdata, }, .adapter_path = "\\_SB_.PCI0.I2C1", } From 3ecb4d85461a34323a849769030841533fcd0395 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 12 Nov 2023 08:44:15 +0100 Subject: [PATCH 0115/1562] platform/x86/dell: alienware-wmi: Use kasprintf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use kasprintf() instead of hand writing it. This saves the need of an intermediate buffer. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/f2b2c9e5d80550e480a627c1b2139d5cc9472ffa.1699775015.git.christophe.jaillet@wanadoo.fr Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/alienware-wmi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index a9477e5432e4..f5ee62ce1753 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -429,7 +429,6 @@ static DEVICE_ATTR(lighting_control_state, 0644, show_control_state, static int alienware_zone_init(struct platform_device *dev) { u8 zone; - char buffer[10]; char *name; if (interface == WMAX) { @@ -466,8 +465,7 @@ static int alienware_zone_init(struct platform_device *dev) return -ENOMEM; for (zone = 0; zone < quirks->num_zones; zone++) { - sprintf(buffer, "zone%02hhX", zone); - name = kstrdup(buffer, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "zone%02hhX", zone); if (name == NULL) return 1; sysfs_attr_init(&zone_dev_attrs[zone].attr); From d3bb2cb0f1769cb3424f3102ebcde51d18065424 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 19 Nov 2023 11:59:50 +0100 Subject: [PATCH 0116/1562] spi: ingenic: convert not to use dma_request_slave_channel() dma_request_slave_channel() is deprecated. dma_request_chan() should be used directly instead. Switch to the preferred function and update the error handling accordingly. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/1c88236b5d6bff0af902492ea9e066c8cb0dfef5.1700391566.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- drivers/spi/spi-ingenic.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-ingenic.c b/drivers/spi/spi-ingenic.c index cc366936d72b..003a6d21c4c3 100644 --- a/drivers/spi/spi-ingenic.c +++ b/drivers/spi/spi-ingenic.c @@ -346,14 +346,17 @@ static bool spi_ingenic_can_dma(struct spi_controller *ctlr, static int spi_ingenic_request_dma(struct spi_controller *ctlr, struct device *dev) { - ctlr->dma_tx = dma_request_slave_channel(dev, "tx"); - if (!ctlr->dma_tx) - return -ENODEV; + struct dma_chan *chan; - ctlr->dma_rx = dma_request_slave_channel(dev, "rx"); + chan = dma_request_chan(dev, "tx"); + if (IS_ERR(chan)) + return PTR_ERR(chan); + ctlr->dma_tx = chan; - if (!ctlr->dma_rx) - return -ENODEV; + chan = dma_request_chan(dev, "rx"); + if (IS_ERR(chan)) + return PTR_ERR(chan); + ctlr->dma_rx = chan; ctlr->can_dma = spi_ingenic_can_dma; From 252eafe11ffc032579a56c7a29faa8431785a91e Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:52 -0600 Subject: [PATCH 0117/1562] dt-bindings: spi: axi-spi-engine: convert to yaml This converts the axi-spi-engine binding to yaml. There are a few minor fixes in the conversion: * Added maintainers. * Added descriptions for the clocks. * Fixed the double "@" in the example. * Added a comma between the clocks in the example. Signed-off-by: David Lechner Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-1-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- .../bindings/spi/adi,axi-spi-engine.txt | 31 --------- .../bindings/spi/adi,axi-spi-engine.yaml | 66 +++++++++++++++++++ 2 files changed, 66 insertions(+), 31 deletions(-) delete mode 100644 Documentation/devicetree/bindings/spi/adi,axi-spi-engine.txt create mode 100644 Documentation/devicetree/bindings/spi/adi,axi-spi-engine.yaml diff --git a/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.txt b/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.txt deleted file mode 100644 index 8a18d71e6879..000000000000 --- a/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.txt +++ /dev/null @@ -1,31 +0,0 @@ -Analog Devices AXI SPI Engine controller Device Tree Bindings - -Required properties: -- compatible : Must be "adi,axi-spi-engine-1.00.a"" -- reg : Physical base address and size of the register map. -- interrupts : Property with a value describing the interrupt - number. -- clock-names : List of input clock names - "s_axi_aclk", "spi_clk" -- clocks : Clock phandles and specifiers (See clock bindings for - details on clock-names and clocks). -- #address-cells : Must be <1> -- #size-cells : Must be <0> - -Optional subnodes: - Subnodes are use to represent the SPI slave devices connected to the SPI - master. They follow the generic SPI bindings as outlined in spi-bus.txt. - -Example: - - spi@@44a00000 { - compatible = "adi,axi-spi-engine-1.00.a"; - reg = <0x44a00000 0x1000>; - interrupts = <0 56 4>; - clocks = <&clkc 15 &clkc 15>; - clock-names = "s_axi_aclk", "spi_clk"; - - #address-cells = <1>; - #size-cells = <0>; - - /* SPI devices */ - }; diff --git a/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.yaml b/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.yaml new file mode 100644 index 000000000000..d48faa42d025 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/adi,axi-spi-engine.yaml @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/adi,axi-spi-engine.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices AXI SPI Engine Controller + +description: | + The AXI SPI Engine controller is part of the SPI Engine framework[1] and + allows memory mapped access to the SPI Engine control bus. This allows it + to be used as a general purpose software driven SPI controller as well as + some optional advanced acceleration and offloading capabilities. + + [1] https://wiki.analog.com/resources/fpga/peripherals/spi_engine + +maintainers: + - Michael Hennerich + - Nuno Sá + +allOf: + - $ref: /schemas/spi/spi-controller.yaml# + +properties: + compatible: + const: adi,axi-spi-engine-1.00.a + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: The AXI interconnect clock. + - description: The SPI controller clock. + + clock-names: + items: + - const: s_axi_aclk + - const: spi_clk + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + spi@44a00000 { + compatible = "adi,axi-spi-engine-1.00.a"; + reg = <0x44a00000 0x1000>; + interrupts = <0 56 4>; + clocks = <&clkc 15>, <&clkc 15>; + clock-names = "s_axi_aclk", "spi_clk"; + + #address-cells = <1>; + #size-cells = <0>; + + /* SPI devices */ + }; From 68539d1803476b4ecd403c126aa74b9f25b45f2b Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:53 -0600 Subject: [PATCH 0118/1562] MAINTAINERS: add entry for AXI SPI Engine The AXI SPI Engine driver has been in the kernel for many years but has lacked a proper maintainers entry. This adds a new entry for the driver and the devicetree bindings. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-2-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ea790149af79..39eac7178202 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3415,6 +3415,16 @@ W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml F: drivers/hwmon/axi-fan-control.c +AXI SPI ENGINE +M: Michael Hennerich +M: Nuno Sá +R: David Lechner +L: linux-spi@vger.kernel.org +S: Supported +W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/spi/adi,axi-spi-engine.yaml +F: drivers/spi/spi-axi-spi-engine.c + AXXIA I2C CONTROLLER M: Krzysztof Adamski L: linux-i2c@vger.kernel.org From 9e4ce5220eedea2cc440f3961dec1b5122e815b2 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:54 -0600 Subject: [PATCH 0119/1562] spi: axi-spi-engine: simplify driver data allocation This simplifies the private data allocation in the AXI SPI Engine driver by making use of the feature built into the spi_alloc_host() function instead of doing it manually. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-3-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index b96e55f59d1a..bdf0aa4ceb1d 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -473,15 +473,11 @@ static int spi_engine_probe(struct platform_device *pdev) if (irq < 0) return irq; - spi_engine = devm_kzalloc(&pdev->dev, sizeof(*spi_engine), GFP_KERNEL); - if (!spi_engine) - return -ENOMEM; - - host = spi_alloc_host(&pdev->dev, 0); + host = spi_alloc_host(&pdev->dev, sizeof(*spi_engine)); if (!host) return -ENOMEM; - spi_controller_set_devdata(host, spi_engine); + spi_engine = spi_controller_get_devdata(host); spin_lock_init(&spi_engine->lock); From e12cd96e8e93044646fdf4b2c9a1de62cfa01e7c Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:55 -0600 Subject: [PATCH 0120/1562] spi: axi-spi-engine: use devm_spi_alloc_host() This modifies the AXI SPI Engine driver to use devm_spi_alloc_host() instead of spi_alloc_host() to simplify the code a bit. In addition to simplifying the error paths in the probe function, we can also remove spi_controller_get/put() calls in the remove function since devm_spi_alloc_host() sets a flag to no longer decrement the controller reference count in the spi_unregister_controller() function. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-4-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index bdf0aa4ceb1d..77c1c115448d 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -473,7 +473,7 @@ static int spi_engine_probe(struct platform_device *pdev) if (irq < 0) return irq; - host = spi_alloc_host(&pdev->dev, sizeof(*spi_engine)); + host = devm_spi_alloc_host(&pdev->dev, sizeof(*spi_engine)); if (!host) return -ENOMEM; @@ -482,22 +482,16 @@ static int spi_engine_probe(struct platform_device *pdev) spin_lock_init(&spi_engine->lock); spi_engine->clk = devm_clk_get_enabled(&pdev->dev, "s_axi_aclk"); - if (IS_ERR(spi_engine->clk)) { - ret = PTR_ERR(spi_engine->clk); - goto err_put_host; - } + if (IS_ERR(spi_engine->clk)) + return PTR_ERR(spi_engine->clk); spi_engine->ref_clk = devm_clk_get_enabled(&pdev->dev, "spi_clk"); - if (IS_ERR(spi_engine->ref_clk)) { - ret = PTR_ERR(spi_engine->ref_clk); - goto err_put_host; - } + if (IS_ERR(spi_engine->ref_clk)) + return PTR_ERR(spi_engine->ref_clk); spi_engine->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(spi_engine->base)) { - ret = PTR_ERR(spi_engine->base); - goto err_put_host; - } + if (IS_ERR(spi_engine->base)) + return PTR_ERR(spi_engine->base); version = readl(spi_engine->base + SPI_ENGINE_REG_VERSION); if (SPI_ENGINE_VERSION_MAJOR(version) != 1) { @@ -505,8 +499,7 @@ static int spi_engine_probe(struct platform_device *pdev) SPI_ENGINE_VERSION_MAJOR(version), SPI_ENGINE_VERSION_MINOR(version), SPI_ENGINE_VERSION_PATCH(version)); - ret = -ENODEV; - goto err_put_host; + return -ENODEV; } writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_RESET); @@ -515,7 +508,7 @@ static int spi_engine_probe(struct platform_device *pdev) ret = request_irq(irq, spi_engine_irq, 0, pdev->name, host); if (ret) - goto err_put_host; + return ret; host->dev.of_node = pdev->dev.of_node; host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_3WIRE; @@ -533,14 +526,12 @@ static int spi_engine_probe(struct platform_device *pdev) return 0; err_free_irq: free_irq(irq, host); -err_put_host: - spi_controller_put(host); return ret; } static void spi_engine_remove(struct platform_device *pdev) { - struct spi_controller *host = spi_controller_get(platform_get_drvdata(pdev)); + struct spi_controller *host = platform_get_drvdata(pdev); struct spi_engine *spi_engine = spi_controller_get_devdata(host); int irq = platform_get_irq(pdev, 0); @@ -548,8 +539,6 @@ static void spi_engine_remove(struct platform_device *pdev) free_irq(irq, host); - spi_controller_put(host); - writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_INT_ENABLE); writel_relaxed(0x01, spi_engine->base + SPI_ENGINE_REG_RESET); From e094de13ae78035c5642d5dfc65b07301765eebc Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:56 -0600 Subject: [PATCH 0121/1562] spi: axi-spi-engine: use devm action to reset hw on remove This moves the reset of the hardware to a devm action in the AXI SPI Engine driver. This will allow us to use devm on later calls in the probe function while preserving the order during cleanup. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-5-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 77c1c115448d..c18a4b34777e 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -461,6 +461,15 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, return 0; } +static void spi_engine_release_hw(void *p) +{ + struct spi_engine *spi_engine = p; + + writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); + writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_INT_ENABLE); + writel_relaxed(0x01, spi_engine->base + SPI_ENGINE_REG_RESET); +} + static int spi_engine_probe(struct platform_device *pdev) { struct spi_engine *spi_engine; @@ -506,6 +515,11 @@ static int spi_engine_probe(struct platform_device *pdev) writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_INT_ENABLE); + ret = devm_add_action_or_reset(&pdev->dev, spi_engine_release_hw, + spi_engine); + if (ret) + return ret; + ret = request_irq(irq, spi_engine_irq, 0, pdev->name, host); if (ret) return ret; @@ -532,16 +546,11 @@ err_free_irq: static void spi_engine_remove(struct platform_device *pdev) { struct spi_controller *host = platform_get_drvdata(pdev); - struct spi_engine *spi_engine = spi_controller_get_devdata(host); int irq = platform_get_irq(pdev, 0); spi_unregister_controller(host); free_irq(irq, host); - - writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); - writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_INT_ENABLE); - writel_relaxed(0x01, spi_engine->base + SPI_ENGINE_REG_RESET); } static const struct of_device_id spi_engine_match_table[] = { From 076f32d5db73f16c95b38149f9168210cf267b33 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:57 -0600 Subject: [PATCH 0122/1562] spi: axi-spi-engine: use devm_request_irq() This replaces request_irq() with devm_request_irq() in the AXI SPI Engine driver. This simplifies the error path and removes the need to call free_irq() in the remove function. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-6-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index c18a4b34777e..81d7352d2b8b 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -520,7 +520,8 @@ static int spi_engine_probe(struct platform_device *pdev) if (ret) return ret; - ret = request_irq(irq, spi_engine_irq, 0, pdev->name, host); + ret = devm_request_irq(&pdev->dev, irq, spi_engine_irq, 0, pdev->name, + host); if (ret) return ret; @@ -533,24 +534,18 @@ static int spi_engine_probe(struct platform_device *pdev) ret = spi_register_controller(host); if (ret) - goto err_free_irq; + return ret; platform_set_drvdata(pdev, host); return 0; -err_free_irq: - free_irq(irq, host); - return ret; } static void spi_engine_remove(struct platform_device *pdev) { struct spi_controller *host = platform_get_drvdata(pdev); - int irq = platform_get_irq(pdev, 0); spi_unregister_controller(host); - - free_irq(irq, host); } static const struct of_device_id spi_engine_match_table[] = { From e16e71e3f3c4b73b20f8c79f7ce8465542a337e9 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:58 -0600 Subject: [PATCH 0123/1562] spi: axi-spi-engine: use devm_spi_register_controller() This replaces spi_register_controller() with devm_spi_register_controller() in the AXI SPI Engine driver. This saves us from having to call spi_unregister_controller() in the remove function. The remove function is also removed since it is no longer needed. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-7-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 81d7352d2b8b..819744246952 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -532,7 +532,7 @@ static int spi_engine_probe(struct platform_device *pdev) host->transfer_one_message = spi_engine_transfer_one_message; host->num_chipselect = 8; - ret = spi_register_controller(host); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) return ret; @@ -541,13 +541,6 @@ static int spi_engine_probe(struct platform_device *pdev) return 0; } -static void spi_engine_remove(struct platform_device *pdev) -{ - struct spi_controller *host = platform_get_drvdata(pdev); - - spi_unregister_controller(host); -} - static const struct of_device_id spi_engine_match_table[] = { { .compatible = "adi,axi-spi-engine-1.00.a" }, { }, @@ -556,7 +549,6 @@ MODULE_DEVICE_TABLE(of, spi_engine_match_table); static struct platform_driver spi_engine_driver = { .probe = spi_engine_probe, - .remove_new = spi_engine_remove, .driver = { .name = "spi-engine", .of_match_table = spi_engine_match_table, From e6d5eb85e84aeace5e231b951ece86b20df9f63a Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:12:59 -0600 Subject: [PATCH 0124/1562] spi: axi-spi-engine: check for valid clock rate This adds a check for a valid SCLK rate in the axi-spi-engine driver during probe. A valid rate is required to get accurate timing for delays and by not allowing 0 we can avoid divide by zero errors later without additional checks. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-8-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 819744246952..8a6fbb3bb3f1 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -532,6 +532,9 @@ static int spi_engine_probe(struct platform_device *pdev) host->transfer_one_message = spi_engine_transfer_one_message; host->num_chipselect = 8; + if (host->max_speed_hz == 0) + return dev_err_probe(&pdev->dev, -EINVAL, "spi_clk rate is 0"); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) return ret; From 7f970ecb77b6759d37ee743fc36fc0daba960e75 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:00 -0600 Subject: [PATCH 0125/1562] spi: axi-spi-engine: move msg state to new struct This moves the message state in the AXI SPI Engine driver to a new struct spi_engine_msg_state. Previously, the driver state contained various pointers that pointed to memory owned by a struct spi_message. However, it did not set any of these pointers to NULL when a message was completed. This could lead to use after free bugs. Example of how this could happen: 1. SPI core calls into spi_engine_transfer_one_message() with msg1. 2. Assume something was misconfigured and spi_engine_tx_next() is not called enough times in interrupt callbacks for msg1 such that spi_engine->tx_xfer is never set to NULL before the msg1 completes. 3. SYNC interrupt is received and spi_finalize_current_message() is called for msg1. spi_engine->msg is set to NULL but no other message-specific state is reset. 4. Caller that sent msg1 is notified of the completion and frees msg1 and the associated xfers and tx/rx buffers. 4. SPI core calls into spi_engine_transfer_one_message() with msg2. 5. When spi_engine_tx_next() is called for msg2, spi_engine->tx_xfer is still be pointing to an xfer from msg1, which was already freed. spi_engine_xfer_next() tries to access xfer->transfer_list of one of the freed xfers and we get a segfault or undefined behavior. To avoid issues like this, instead of putting per-message state in the driver state struct, we can make use of the struct spi_message::state field to store a pointer to a new struct spi_engine_msg_state. This way, all of the state that belongs to specific message stays with that message and we don't have to remember to manually reset all aspects of the message state when a message is completed. Rather, a new state is allocated for each message. Most of the changes are just renames where the state is accessed. One place where this wasn't straightforward was the sync_id member. This has been changed to use ida_alloc_range() since we needed to separate the per-message sync_id from the per-controller next available sync_id. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-9-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 154 ++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 56 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 8a6fbb3bb3f1..745000a9b2c7 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -78,6 +79,32 @@ struct spi_engine_program { uint16_t instructions[]; }; +/** + * struct spi_engine_message_state - SPI engine per-message state + */ +struct spi_engine_message_state { + /** Instructions for executing this message. */ + struct spi_engine_program *p; + /** Number of elements in cmd_buf array. */ + unsigned cmd_length; + /** Array of commands not yet written to CMD FIFO. */ + const uint16_t *cmd_buf; + /** Next xfer with tx_buf not yet fully written to TX FIFO. */ + struct spi_transfer *tx_xfer; + /** Size of tx_buf in bytes. */ + unsigned int tx_length; + /** Bytes not yet written to TX FIFO. */ + const uint8_t *tx_buf; + /** Next xfer with rx_buf not yet fully written to RX FIFO. */ + struct spi_transfer *rx_xfer; + /** Size of tx_buf in bytes. */ + unsigned int rx_length; + /** Bytes not yet written to the RX FIFO. */ + uint8_t *rx_buf; + /** ID to correlate SYNC interrupts with this message. */ + u8 sync_id; +}; + struct spi_engine { struct clk *clk; struct clk *ref_clk; @@ -87,19 +114,7 @@ struct spi_engine { void __iomem *base; struct spi_message *msg; - struct spi_engine_program *p; - unsigned cmd_length; - const uint16_t *cmd_buf; - - struct spi_transfer *tx_xfer; - unsigned int tx_length; - const uint8_t *tx_buf; - - struct spi_transfer *rx_xfer; - unsigned int rx_length; - uint8_t *rx_buf; - - unsigned int sync_id; + struct ida sync_ida; unsigned int completed_id; unsigned int int_enable; @@ -258,100 +273,105 @@ static void spi_engine_xfer_next(struct spi_engine *spi_engine, static void spi_engine_tx_next(struct spi_engine *spi_engine) { - struct spi_transfer *xfer = spi_engine->tx_xfer; + struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_transfer *xfer = st->tx_xfer; do { spi_engine_xfer_next(spi_engine, &xfer); } while (xfer && !xfer->tx_buf); - spi_engine->tx_xfer = xfer; + st->tx_xfer = xfer; if (xfer) { - spi_engine->tx_length = xfer->len; - spi_engine->tx_buf = xfer->tx_buf; + st->tx_length = xfer->len; + st->tx_buf = xfer->tx_buf; } else { - spi_engine->tx_buf = NULL; + st->tx_buf = NULL; } } static void spi_engine_rx_next(struct spi_engine *spi_engine) { - struct spi_transfer *xfer = spi_engine->rx_xfer; + struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_transfer *xfer = st->rx_xfer; do { spi_engine_xfer_next(spi_engine, &xfer); } while (xfer && !xfer->rx_buf); - spi_engine->rx_xfer = xfer; + st->rx_xfer = xfer; if (xfer) { - spi_engine->rx_length = xfer->len; - spi_engine->rx_buf = xfer->rx_buf; + st->rx_length = xfer->len; + st->rx_buf = xfer->rx_buf; } else { - spi_engine->rx_buf = NULL; + st->rx_buf = NULL; } } static bool spi_engine_write_cmd_fifo(struct spi_engine *spi_engine) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_CMD_FIFO; + struct spi_engine_message_state *st = spi_engine->msg->state; unsigned int n, m, i; const uint16_t *buf; n = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_CMD_FIFO_ROOM); - while (n && spi_engine->cmd_length) { - m = min(n, spi_engine->cmd_length); - buf = spi_engine->cmd_buf; + while (n && st->cmd_length) { + m = min(n, st->cmd_length); + buf = st->cmd_buf; for (i = 0; i < m; i++) writel_relaxed(buf[i], addr); - spi_engine->cmd_buf += m; - spi_engine->cmd_length -= m; + st->cmd_buf += m; + st->cmd_length -= m; n -= m; } - return spi_engine->cmd_length != 0; + return st->cmd_length != 0; } static bool spi_engine_write_tx_fifo(struct spi_engine *spi_engine) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDO_DATA_FIFO; + struct spi_engine_message_state *st = spi_engine->msg->state; unsigned int n, m, i; const uint8_t *buf; n = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_SDO_FIFO_ROOM); - while (n && spi_engine->tx_length) { - m = min(n, spi_engine->tx_length); - buf = spi_engine->tx_buf; + while (n && st->tx_length) { + m = min(n, st->tx_length); + buf = st->tx_buf; for (i = 0; i < m; i++) writel_relaxed(buf[i], addr); - spi_engine->tx_buf += m; - spi_engine->tx_length -= m; + st->tx_buf += m; + st->tx_length -= m; n -= m; - if (spi_engine->tx_length == 0) + if (st->tx_length == 0) spi_engine_tx_next(spi_engine); } - return spi_engine->tx_length != 0; + return st->tx_length != 0; } static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDI_DATA_FIFO; + struct spi_engine_message_state *st = spi_engine->msg->state; unsigned int n, m, i; uint8_t *buf; n = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_SDI_FIFO_LEVEL); - while (n && spi_engine->rx_length) { - m = min(n, spi_engine->rx_length); - buf = spi_engine->rx_buf; + while (n && st->rx_length) { + m = min(n, st->rx_length); + buf = st->rx_buf; for (i = 0; i < m; i++) buf[i] = readl_relaxed(addr); - spi_engine->rx_buf += m; - spi_engine->rx_length -= m; + st->rx_buf += m; + st->rx_length -= m; n -= m; - if (spi_engine->rx_length == 0) + if (st->rx_length == 0) spi_engine_rx_next(spi_engine); } - return spi_engine->rx_length != 0; + return st->rx_length != 0; } static irqreturn_t spi_engine_irq(int irq, void *devid) @@ -387,12 +407,16 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) disable_int |= SPI_ENGINE_INT_SDI_ALMOST_FULL; } - if (pending & SPI_ENGINE_INT_SYNC) { - if (spi_engine->msg && - spi_engine->completed_id == spi_engine->sync_id) { - struct spi_message *msg = spi_engine->msg; + if (pending & SPI_ENGINE_INT_SYNC && spi_engine->msg) { + struct spi_engine_message_state *st = spi_engine->msg->state; - kfree(spi_engine->p); + if (spi_engine->completed_id == st->sync_id) { + struct spi_message *msg = spi_engine->msg; + struct spi_engine_message_state *st = msg->state; + + ida_free(&spi_engine->sync_ida, st->sync_id); + kfree(st->p); + kfree(st); msg->status = 0; msg->actual_length = msg->frame_length; spi_engine->msg = NULL; @@ -417,29 +441,46 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, { struct spi_engine_program p_dry, *p; struct spi_engine *spi_engine = spi_controller_get_devdata(host); + struct spi_engine_message_state *st; unsigned int int_enable = 0; unsigned long flags; size_t size; + int ret; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + return -ENOMEM; p_dry.length = 0; spi_engine_compile_message(spi_engine, msg, true, &p_dry); size = sizeof(*p->instructions) * (p_dry.length + 1); p = kzalloc(sizeof(*p) + size, GFP_KERNEL); - if (!p) + if (!p) { + kfree(st); return -ENOMEM; + } + + ret = ida_alloc_range(&spi_engine->sync_ida, 0, U8_MAX, GFP_KERNEL); + if (ret < 0) { + kfree(p); + kfree(st); + return ret; + } + + st->sync_id = ret; + spi_engine_compile_message(spi_engine, msg, false, p); spin_lock_irqsave(&spi_engine->lock, flags); - spi_engine->sync_id = (spi_engine->sync_id + 1) & 0xff; - spi_engine_program_add_cmd(p, false, - SPI_ENGINE_CMD_SYNC(spi_engine->sync_id)); + spi_engine_program_add_cmd(p, false, SPI_ENGINE_CMD_SYNC(st->sync_id)); + msg->state = st; spi_engine->msg = msg; - spi_engine->p = p; + st->p = p; - spi_engine->cmd_buf = p->instructions; - spi_engine->cmd_length = p->length; + st->cmd_buf = p->instructions; + st->cmd_length = p->length; if (spi_engine_write_cmd_fifo(spi_engine)) int_enable |= SPI_ENGINE_INT_CMD_ALMOST_EMPTY; @@ -448,7 +489,7 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, int_enable |= SPI_ENGINE_INT_SDO_ALMOST_EMPTY; spi_engine_rx_next(spi_engine); - if (spi_engine->rx_length != 0) + if (st->rx_length != 0) int_enable |= SPI_ENGINE_INT_SDI_ALMOST_FULL; int_enable |= SPI_ENGINE_INT_SYNC; @@ -489,6 +530,7 @@ static int spi_engine_probe(struct platform_device *pdev) spi_engine = spi_controller_get_devdata(host); spin_lock_init(&spi_engine->lock); + ida_init(&spi_engine->sync_ida); spi_engine->clk = devm_clk_get_enabled(&pdev->dev, "s_axi_aclk"); if (IS_ERR(spi_engine->clk)) From 0c74de5c6853b0e83413ad237867a37ba30ef3f9 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:01 -0600 Subject: [PATCH 0126/1562] spi: axi-spi-engine: use message_prepare/unprepare This modifies the AXI SPI Engine driver to make use of the message_prepare and message_unprepare callbacks. This separates the concerns of allocating and freeing the message state from the transfer_one_message callback. The main benfit of this is so that future callers of spi_finalize_current_message() will not have to do manual cleanup of the state. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-10-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 46 +++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 745000a9b2c7..210bea23f433 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -412,11 +412,7 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) if (spi_engine->completed_id == st->sync_id) { struct spi_message *msg = spi_engine->msg; - struct spi_engine_message_state *st = msg->state; - ida_free(&spi_engine->sync_ida, st->sync_id); - kfree(st->p); - kfree(st); msg->status = 0; msg->actual_length = msg->frame_length; spi_engine->msg = NULL; @@ -436,14 +432,12 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) return IRQ_HANDLED; } -static int spi_engine_transfer_one_message(struct spi_controller *host, - struct spi_message *msg) +static int spi_engine_prepare_message(struct spi_controller *host, + struct spi_message *msg) { struct spi_engine_program p_dry, *p; struct spi_engine *spi_engine = spi_controller_get_devdata(host); struct spi_engine_message_state *st; - unsigned int int_enable = 0; - unsigned long flags; size_t size; int ret; @@ -472,15 +466,41 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, spi_engine_compile_message(spi_engine, msg, false, p); - spin_lock_irqsave(&spi_engine->lock, flags); spi_engine_program_add_cmd(p, false, SPI_ENGINE_CMD_SYNC(st->sync_id)); - msg->state = st; - spi_engine->msg = msg; st->p = p; - st->cmd_buf = p->instructions; st->cmd_length = p->length; + msg->state = st; + + return 0; +} + +static int spi_engine_unprepare_message(struct spi_controller *host, + struct spi_message *msg) +{ + struct spi_engine *spi_engine = spi_controller_get_devdata(host); + struct spi_engine_message_state *st = msg->state; + + ida_free(&spi_engine->sync_ida, st->sync_id); + kfree(st->p); + kfree(st); + + return 0; +} + +static int spi_engine_transfer_one_message(struct spi_controller *host, + struct spi_message *msg) +{ + struct spi_engine *spi_engine = spi_controller_get_devdata(host); + struct spi_engine_message_state *st = msg->state; + unsigned int int_enable = 0; + unsigned long flags; + + spin_lock_irqsave(&spi_engine->lock, flags); + + spi_engine->msg = msg; + if (spi_engine_write_cmd_fifo(spi_engine)) int_enable |= SPI_ENGINE_INT_CMD_ALMOST_EMPTY; @@ -572,6 +592,8 @@ static int spi_engine_probe(struct platform_device *pdev) host->bits_per_word_mask = SPI_BPW_MASK(8); host->max_speed_hz = clk_get_rate(spi_engine->ref_clk) / 2; host->transfer_one_message = spi_engine_transfer_one_message; + host->prepare_message = spi_engine_prepare_message; + host->unprepare_message = spi_engine_unprepare_message; host->num_chipselect = 8; if (host->max_speed_hz == 0) From 4a074ddeb90f5e81738b401643651b2dea257f57 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:02 -0600 Subject: [PATCH 0127/1562] spi: axi-spi-engine: remove completed_id from driver state In the AXI SPI Engine driver, the completed_id field in the driver state is only used in one function and the value does not need to persist between function calls. Therefore, it can be removed from the driver state and made a local variable in the function where it is used. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-11-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 210bea23f433..120001dbc4dc 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -115,7 +115,6 @@ struct spi_engine { struct spi_message *msg; struct ida sync_ida; - unsigned int completed_id; unsigned int int_enable; }; @@ -380,13 +379,14 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) struct spi_engine *spi_engine = spi_controller_get_devdata(host); unsigned int disable_int = 0; unsigned int pending; + int completed_id = -1; pending = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_INT_PENDING); if (pending & SPI_ENGINE_INT_SYNC) { writel_relaxed(SPI_ENGINE_INT_SYNC, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); - spi_engine->completed_id = readl_relaxed( + completed_id = readl_relaxed( spi_engine->base + SPI_ENGINE_REG_SYNC_ID); } @@ -410,7 +410,7 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) if (pending & SPI_ENGINE_INT_SYNC && spi_engine->msg) { struct spi_engine_message_state *st = spi_engine->msg->state; - if (spi_engine->completed_id == st->sync_id) { + if (completed_id == st->sync_id) { struct spi_message *msg = spi_engine->msg; msg->status = 0; From 4e991445478c6404a6846928093837249c52694a Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:03 -0600 Subject: [PATCH 0128/1562] spi: axi-spi-engine: remove struct spi_engine::msg In the AXI SPI Engine driver, the struct spi_engine::msg member was used to keep track of the current message being processed. The SPI core is already keeping track of this, so we don't need to duplicate the effort. In most cases, we already have a pointer to the current message, so we can pass it directly to the functions that need it. In the one case where we don't have a pointer to the current message, we can get it from struct spi_controller::cur_msg. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-12-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 60 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 120001dbc4dc..c39f478f34a7 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -112,8 +112,6 @@ struct spi_engine { spinlock_t lock; void __iomem *base; - - struct spi_message *msg; struct ida sync_ida; unsigned int int_enable; @@ -252,10 +250,9 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, return 0; } -static void spi_engine_xfer_next(struct spi_engine *spi_engine, +static void spi_engine_xfer_next(struct spi_message *msg, struct spi_transfer **_xfer) { - struct spi_message *msg = spi_engine->msg; struct spi_transfer *xfer = *_xfer; if (!xfer) { @@ -270,13 +267,13 @@ static void spi_engine_xfer_next(struct spi_engine *spi_engine, *_xfer = xfer; } -static void spi_engine_tx_next(struct spi_engine *spi_engine) +static void spi_engine_tx_next(struct spi_message *msg) { - struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_engine_message_state *st = msg->state; struct spi_transfer *xfer = st->tx_xfer; do { - spi_engine_xfer_next(spi_engine, &xfer); + spi_engine_xfer_next(msg, &xfer); } while (xfer && !xfer->tx_buf); st->tx_xfer = xfer; @@ -288,13 +285,13 @@ static void spi_engine_tx_next(struct spi_engine *spi_engine) } } -static void spi_engine_rx_next(struct spi_engine *spi_engine) +static void spi_engine_rx_next(struct spi_message *msg) { - struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_engine_message_state *st = msg->state; struct spi_transfer *xfer = st->rx_xfer; do { - spi_engine_xfer_next(spi_engine, &xfer); + spi_engine_xfer_next(msg, &xfer); } while (xfer && !xfer->rx_buf); st->rx_xfer = xfer; @@ -306,10 +303,11 @@ static void spi_engine_rx_next(struct spi_engine *spi_engine) } } -static bool spi_engine_write_cmd_fifo(struct spi_engine *spi_engine) +static bool spi_engine_write_cmd_fifo(struct spi_engine *spi_engine, + struct spi_message *msg) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_CMD_FIFO; - struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_engine_message_state *st = msg->state; unsigned int n, m, i; const uint16_t *buf; @@ -327,10 +325,11 @@ static bool spi_engine_write_cmd_fifo(struct spi_engine *spi_engine) return st->cmd_length != 0; } -static bool spi_engine_write_tx_fifo(struct spi_engine *spi_engine) +static bool spi_engine_write_tx_fifo(struct spi_engine *spi_engine, + struct spi_message *msg) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDO_DATA_FIFO; - struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_engine_message_state *st = msg->state; unsigned int n, m, i; const uint8_t *buf; @@ -344,16 +343,17 @@ static bool spi_engine_write_tx_fifo(struct spi_engine *spi_engine) st->tx_length -= m; n -= m; if (st->tx_length == 0) - spi_engine_tx_next(spi_engine); + spi_engine_tx_next(msg); } return st->tx_length != 0; } -static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine) +static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine, + struct spi_message *msg) { void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDI_DATA_FIFO; - struct spi_engine_message_state *st = spi_engine->msg->state; + struct spi_engine_message_state *st = msg->state; unsigned int n, m, i; uint8_t *buf; @@ -367,7 +367,7 @@ static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine) st->rx_length -= m; n -= m; if (st->rx_length == 0) - spi_engine_rx_next(spi_engine); + spi_engine_rx_next(msg); } return st->rx_length != 0; @@ -376,6 +376,7 @@ static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine) static irqreturn_t spi_engine_irq(int irq, void *devid) { struct spi_controller *host = devid; + struct spi_message *msg = host->cur_msg; struct spi_engine *spi_engine = spi_controller_get_devdata(host); unsigned int disable_int = 0; unsigned int pending; @@ -393,29 +394,26 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) spin_lock(&spi_engine->lock); if (pending & SPI_ENGINE_INT_CMD_ALMOST_EMPTY) { - if (!spi_engine_write_cmd_fifo(spi_engine)) + if (!spi_engine_write_cmd_fifo(spi_engine, msg)) disable_int |= SPI_ENGINE_INT_CMD_ALMOST_EMPTY; } if (pending & SPI_ENGINE_INT_SDO_ALMOST_EMPTY) { - if (!spi_engine_write_tx_fifo(spi_engine)) + if (!spi_engine_write_tx_fifo(spi_engine, msg)) disable_int |= SPI_ENGINE_INT_SDO_ALMOST_EMPTY; } if (pending & (SPI_ENGINE_INT_SDI_ALMOST_FULL | SPI_ENGINE_INT_SYNC)) { - if (!spi_engine_read_rx_fifo(spi_engine)) + if (!spi_engine_read_rx_fifo(spi_engine, msg)) disable_int |= SPI_ENGINE_INT_SDI_ALMOST_FULL; } - if (pending & SPI_ENGINE_INT_SYNC && spi_engine->msg) { - struct spi_engine_message_state *st = spi_engine->msg->state; + if (pending & SPI_ENGINE_INT_SYNC && msg) { + struct spi_engine_message_state *st = msg->state; if (completed_id == st->sync_id) { - struct spi_message *msg = spi_engine->msg; - msg->status = 0; msg->actual_length = msg->frame_length; - spi_engine->msg = NULL; spi_finalize_current_message(host); disable_int |= SPI_ENGINE_INT_SYNC; } @@ -499,16 +497,14 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, spin_lock_irqsave(&spi_engine->lock, flags); - spi_engine->msg = msg; - - if (spi_engine_write_cmd_fifo(spi_engine)) + if (spi_engine_write_cmd_fifo(spi_engine, msg)) int_enable |= SPI_ENGINE_INT_CMD_ALMOST_EMPTY; - spi_engine_tx_next(spi_engine); - if (spi_engine_write_tx_fifo(spi_engine)) + spi_engine_tx_next(msg); + if (spi_engine_write_tx_fifo(spi_engine, msg)) int_enable |= SPI_ENGINE_INT_SDO_ALMOST_EMPTY; - spi_engine_rx_next(spi_engine); + spi_engine_rx_next(msg); if (st->rx_length != 0) int_enable |= SPI_ENGINE_INT_SDI_ALMOST_FULL; From 145bb2aedb9f78f290c2b5503b553894a6ec53fe Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:04 -0600 Subject: [PATCH 0129/1562] spi: axi-spi-engine: add support for cs_off This adds support for the spi_transfer::cs_off flag to the AXI SPI Engine driver. The logic is copied from the generic spi_transfer_one_message() in spi.c. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-13-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index c39f478f34a7..1c60e6486ee2 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -216,7 +216,7 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, struct spi_device *spi = msg->spi; struct spi_transfer *xfer; int clk_div, new_clk_div; - bool cs_change = true; + bool keep_cs = false; clk_div = -1; @@ -224,6 +224,9 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, SPI_ENGINE_CMD_WRITE(SPI_ENGINE_CMD_REG_CONFIG, spi_engine_get_config(spi))); + xfer = list_first_entry(&msg->transfers, struct spi_transfer, transfer_list); + spi_engine_gen_cs(p, dry, spi, !xfer->cs_off); + list_for_each_entry(xfer, &msg->transfers, transfer_list) { new_clk_div = spi_engine_get_clk_div(spi_engine, spi, xfer); if (new_clk_div != clk_div) { @@ -233,20 +236,28 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, clk_div)); } - if (cs_change) - spi_engine_gen_cs(p, dry, spi, true); - spi_engine_gen_xfer(p, dry, xfer); spi_engine_gen_sleep(p, dry, spi_engine, clk_div, xfer); - cs_change = xfer->cs_change; - if (list_is_last(&xfer->transfer_list, &msg->transfers)) - cs_change = !cs_change; + if (xfer->cs_change) { + if (list_is_last(&xfer->transfer_list, &msg->transfers)) { + keep_cs = true; + } else { + if (!xfer->cs_off) + spi_engine_gen_cs(p, dry, spi, false); - if (cs_change) - spi_engine_gen_cs(p, dry, spi, false); + if (!list_next_entry(xfer, transfer_list)->cs_off) + spi_engine_gen_cs(p, dry, spi, true); + } + } else if (!list_is_last(&xfer->transfer_list, &msg->transfers) && + xfer->cs_off != list_next_entry(xfer, transfer_list)->cs_off) { + spi_engine_gen_cs(p, dry, spi, xfer->cs_off); + } } + if (!keep_cs) + spi_engine_gen_cs(p, dry, spi, false); + return 0; } From d861b417e1893a46c63cef2cb46d3587da1e5b15 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 17 Nov 2023 14:13:05 -0600 Subject: [PATCH 0130/1562] spi: axi-spi-engine: add support for any word size The AXI SPI Engine IP supports any word size from 1 to 32 bits. This adds support for this by setting the bits_per_word_mask and emitting the appropriate instruction to the SPI Engine each time a transfer requires a new word size. The functions that transfer tx/rx buffers from/to the SPI Engine registers (spi_engine_write_{tx,rx}_fifo()) as well as the function that creates the transfer instruction (spi_engine_gen_xfer()) also have to be modified to take into account the word size since xfer->len is the size of the buffers in bytes rather than words. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20231117-axi-spi-engine-series-1-v1-14-cc59db999b87@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 84 ++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 1c60e6486ee2..cbca783830ea 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -53,6 +53,7 @@ #define SPI_ENGINE_CMD_REG_CLK_DIV 0x0 #define SPI_ENGINE_CMD_REG_CONFIG 0x1 +#define SPI_ENGINE_CMD_REG_XFER_BITS 0x2 #define SPI_ENGINE_MISC_SYNC 0x0 #define SPI_ENGINE_MISC_SLEEP 0x1 @@ -157,7 +158,14 @@ static unsigned int spi_engine_get_clk_div(struct spi_engine *spi_engine, static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry, struct spi_transfer *xfer) { - unsigned int len = xfer->len; + unsigned int len; + + if (xfer->bits_per_word <= 8) + len = xfer->len; + else if (xfer->bits_per_word <= 16) + len = xfer->len / 2; + else + len = xfer->len / 4; while (len) { unsigned int n = min(len, 256U); @@ -217,6 +225,7 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, struct spi_transfer *xfer; int clk_div, new_clk_div; bool keep_cs = false; + u8 bits_per_word = 0; clk_div = -1; @@ -236,6 +245,13 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, clk_div)); } + if (bits_per_word != xfer->bits_per_word) { + bits_per_word = xfer->bits_per_word; + spi_engine_program_add_cmd(p, dry, + SPI_ENGINE_CMD_WRITE(SPI_ENGINE_CMD_REG_XFER_BITS, + bits_per_word)); + } + spi_engine_gen_xfer(p, dry, xfer); spi_engine_gen_sleep(p, dry, spi_engine, clk_div, xfer); @@ -342,16 +358,34 @@ static bool spi_engine_write_tx_fifo(struct spi_engine *spi_engine, void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDO_DATA_FIFO; struct spi_engine_message_state *st = msg->state; unsigned int n, m, i; - const uint8_t *buf; n = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_SDO_FIFO_ROOM); while (n && st->tx_length) { - m = min(n, st->tx_length); - buf = st->tx_buf; - for (i = 0; i < m; i++) - writel_relaxed(buf[i], addr); - st->tx_buf += m; - st->tx_length -= m; + if (st->tx_xfer->bits_per_word <= 8) { + const u8 *buf = st->tx_buf; + + m = min(n, st->tx_length); + for (i = 0; i < m; i++) + writel_relaxed(buf[i], addr); + st->tx_buf += m; + st->tx_length -= m; + } else if (st->tx_xfer->bits_per_word <= 16) { + const u16 *buf = (const u16 *)st->tx_buf; + + m = min(n, st->tx_length / 2); + for (i = 0; i < m; i++) + writel_relaxed(buf[i], addr); + st->tx_buf += m * 2; + st->tx_length -= m * 2; + } else { + const u32 *buf = (const u32 *)st->tx_buf; + + m = min(n, st->tx_length / 4); + for (i = 0; i < m; i++) + writel_relaxed(buf[i], addr); + st->tx_buf += m * 4; + st->tx_length -= m * 4; + } n -= m; if (st->tx_length == 0) spi_engine_tx_next(msg); @@ -366,16 +400,34 @@ static bool spi_engine_read_rx_fifo(struct spi_engine *spi_engine, void __iomem *addr = spi_engine->base + SPI_ENGINE_REG_SDI_DATA_FIFO; struct spi_engine_message_state *st = msg->state; unsigned int n, m, i; - uint8_t *buf; n = readl_relaxed(spi_engine->base + SPI_ENGINE_REG_SDI_FIFO_LEVEL); while (n && st->rx_length) { - m = min(n, st->rx_length); - buf = st->rx_buf; - for (i = 0; i < m; i++) - buf[i] = readl_relaxed(addr); - st->rx_buf += m; - st->rx_length -= m; + if (st->rx_xfer->bits_per_word <= 8) { + u8 *buf = st->rx_buf; + + m = min(n, st->rx_length); + for (i = 0; i < m; i++) + buf[i] = readl_relaxed(addr); + st->rx_buf += m; + st->rx_length -= m; + } else if (st->rx_xfer->bits_per_word <= 16) { + u16 *buf = (u16 *)st->rx_buf; + + m = min(n, st->rx_length / 2); + for (i = 0; i < m; i++) + buf[i] = readl_relaxed(addr); + st->rx_buf += m * 2; + st->rx_length -= m * 2; + } else { + u32 *buf = (u32 *)st->rx_buf; + + m = min(n, st->rx_length / 4); + for (i = 0; i < m; i++) + buf[i] = readl_relaxed(addr); + st->rx_buf += m * 4; + st->rx_length -= m * 4; + } n -= m; if (st->rx_length == 0) spi_engine_rx_next(msg); @@ -596,7 +648,7 @@ static int spi_engine_probe(struct platform_device *pdev) host->dev.of_node = pdev->dev.of_node; host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_3WIRE; - host->bits_per_word_mask = SPI_BPW_MASK(8); + host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32); host->max_speed_hz = clk_get_rate(spi_engine->ref_clk) / 2; host->transfer_one_message = spi_engine_transfer_one_message; host->prepare_message = spi_engine_prepare_message; From 9880702d123f202369fb674ae62bae25be27475c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Nov 2023 17:06:27 +0100 Subject: [PATCH 0131/1562] ACPI: property: Support using strings in reference properties In order to allow referencing data nodes directly, which is not possible currently, add support for representing references in device properties as strings (relative or absolute name paths). For example, after this change, the "mipi-img-flash-leds" property in the ASL snippet below will be treated as a proper reference to the LED0 object under LEDD. Package () { "mipi-img-flash-leds", "\\_SB.PCI0.I2C2.LEDD.LED0", } Device (LEDD) { Name (_DSD, Package () // _DSD: Device-Specific Data { ToUUID ("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), /* Hierarchical Data Extension */, Package () { Package () { "mipi-img-flash-led-0", "LED0", } }, }) Name (LED0, Package () // _DSD: Device-Specific Data { ToUUID ("daffd814-6eba-4d8c-8a91-bc9bbf4aa301") /* Device Properties */, Package () { Package () { "mipi-img-max-current", 1000000, } } }) } Also remove the mechanism allowing data nodes to be referenced indirectly, with the help of an object reference pointing to the "ancestor" device and a path relative to it (this mechanism is not expected to be in use in any production platform firmware in the field). Note that this change allows also using strings for referencing device objects, in addition to object references that have been supported already. While at it, add pr_fmt() macro to prefix printouts and update copyright. Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/property.c | 102 ++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 20 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 6979a3f9f90a..07d76fb740b6 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -2,14 +2,17 @@ /* * ACPI device specific properties support. * - * Copyright (C) 2014, Intel Corporation + * Copyright (C) 2014 - 2023, Intel Corporation * All rights reserved. * * Authors: Mika Westerberg * Darren Hart * Rafael J. Wysocki + * Sakari Ailus */ +#define pr_fmt(fmt) "ACPI: " fmt + #include #include #include @@ -800,28 +803,16 @@ static int acpi_get_ref_args(struct fwnode_reference_args *args, { u32 nargs = 0, i; - /* - * Find the referred data extension node under the - * referred device node. - */ - for (; *element < end && (*element)->type == ACPI_TYPE_STRING; - (*element)++) { - const char *child_name = (*element)->string.pointer; - - ref_fwnode = acpi_fwnode_get_named_child_node(ref_fwnode, child_name); - if (!ref_fwnode) - return -EINVAL; - } - /* * Assume the following integer elements are all args. Stop counting on - * the first reference or end of the package arguments. In case of - * neither reference, nor integer, return an error, we can't parse it. + * the first reference (possibly represented as a string) or end of the + * package arguments. In case of neither reference, nor integer, return + * an error, we can't parse it. */ for (i = 0; (*element) + i < end && i < num_args; i++) { acpi_object_type type = (*element)[i].type; - if (type == ACPI_TYPE_LOCAL_REFERENCE) + if (type == ACPI_TYPE_LOCAL_REFERENCE || type == ACPI_TYPE_STRING) break; if (type == ACPI_TYPE_INTEGER) @@ -845,6 +836,44 @@ static int acpi_get_ref_args(struct fwnode_reference_args *args, return 0; } +static struct fwnode_handle *acpi_parse_string_ref(const struct fwnode_handle *fwnode, + const char *refstring) +{ + acpi_handle scope, handle; + struct acpi_data_node *dn; + struct acpi_device *device; + acpi_status status; + + if (is_acpi_device_node(fwnode)) { + scope = to_acpi_device_node(fwnode)->handle; + } else if (is_acpi_data_node(fwnode)) { + scope = to_acpi_data_node(fwnode)->handle; + } else { + pr_debug("Bad node type for node %pfw\n", fwnode); + return NULL; + } + + status = acpi_get_handle(scope, refstring, &handle); + if (ACPI_FAILURE(status)) { + acpi_handle_debug(scope, "Unable to get an ACPI handle for %s\n", + refstring); + return NULL; + } + + device = acpi_fetch_acpi_dev(handle); + if (device) + return acpi_fwnode_handle(device); + + status = acpi_get_data_full(handle, acpi_nondev_subnode_tag, + (void **)&dn, NULL); + if (ACPI_FAILURE(status) || !dn) { + acpi_handle_debug(handle, "Subnode not found\n"); + return NULL; + } + + return &dn->fwnode; +} + /** * __acpi_node_get_property_reference - returns handle to the referenced object * @fwnode: Firmware node to get the property from @@ -887,6 +916,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const union acpi_object *element, *end; const union acpi_object *obj; const struct acpi_device_data *data; + struct fwnode_handle *ref_fwnode; struct acpi_device *device; int ret, idx = 0; @@ -910,16 +940,30 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, args->fwnode = acpi_fwnode_handle(device); args->nargs = 0; + + return 0; + case ACPI_TYPE_STRING: + if (index) + return -ENOENT; + + ref_fwnode = acpi_parse_string_ref(fwnode, obj->string.pointer); + if (!ref_fwnode) + return -EINVAL; + + args->fwnode = ref_fwnode; + args->nargs = 0; + return 0; case ACPI_TYPE_PACKAGE: /* * If it is not a single reference, then it is a package of - * references followed by number of ints as follows: + * references, followed by number of ints as follows: * * Package () { REF, INT, REF, INT, INT } * - * The index argument is then used to determine which reference - * the caller wants (along with the arguments). + * Here, REF may be either a local reference or a string. The + * index argument is then used to determine which reference the + * caller wants (along with the arguments). */ break; default: @@ -950,6 +994,24 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, if (idx == index) return 0; + break; + case ACPI_TYPE_STRING: + ref_fwnode = acpi_parse_string_ref(fwnode, + element->string.pointer); + if (!ref_fwnode) + return -EINVAL; + + element++; + + ret = acpi_get_ref_args(idx == index ? args : NULL, + ref_fwnode, &element, end, + num_args); + if (ret < 0) + return ret; + + if (idx == index) + return 0; + break; case ACPI_TYPE_INTEGER: if (idx == index) From bd721b934323e4dcde892013a97e0e5674f4c884 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Nov 2023 17:09:01 +0100 Subject: [PATCH 0132/1562] ACPI: scan: Extract CSI-2 connection graph from _CRS Find ACPI CSI-2 resource descriptors defined since ACPI 6.4 (for CSI-2 and camera configuration) in _CRS for all device objects in the given scope of the ACPI namespace that have them, identify the corresponding "remote endpoint" device objects for them and allocate memory for software nodes needed to create a DT-like data structure representing the CSI-2 connection graph for drivers. The code needed to populate these software nodes will be added by subsequent change sets. Link: https://uefi.org/specs/ACPI/6.5/06_Device_Configuration.html#camera-serial-interface-csi-2-connection-resource-descriptor Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/Makefile | 2 +- drivers/acpi/internal.h | 8 + drivers/acpi/mipi-disco-img.c | 292 ++++++++++++++++++++++++++++++++++ drivers/acpi/scan.c | 48 ++++-- include/acpi/acpi_bus.h | 18 +++ 5 files changed, 358 insertions(+), 10 deletions(-) create mode 100644 drivers/acpi/mipi-disco-img.c diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index eaa09bf52f17..d367e649714f 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -37,7 +37,7 @@ acpi-$(CONFIG_ACPI_SLEEP) += proc.o # ACPI Bus and Device Drivers # acpi-y += bus.o glue.o -acpi-y += scan.o +acpi-y += scan.o mipi-disco-img.o acpi-y += resource.o acpi-y += acpi_processor.o acpi-y += processor_core.o diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 866c7c4ed233..959a2bc61916 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -276,4 +276,12 @@ void acpi_init_lpit(void); static inline void acpi_init_lpit(void) { } #endif +/*-------------------------------------------------------------------------- + ACPI _CRS CSI-2 and MIPI DisCo for Imaging + -------------------------------------------------------------------------- */ + +void acpi_mipi_check_crs_csi2(acpi_handle handle); +void acpi_mipi_scan_crs_csi2(void); +void acpi_mipi_crs_csi2_cleanup(void); + #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c new file mode 100644 index 000000000000..91281c8cb4f2 --- /dev/null +++ b/drivers/acpi/mipi-disco-img.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MIPI DisCo for Imaging support. + * + * Copyright (C) 2023 Intel Corporation + * + * Support MIPI DisCo for Imaging by parsing ACPI _CRS CSI-2 records defined in + * Section 6.4.3.8.2.4 "Camera Serial Interface (CSI-2) Connection Resource + * Descriptor" of ACPI 6.5. + * + * The implementation looks for the information in the ACPI namespace (CSI-2 + * resource descriptors in _CRS) and constructs software nodes compatible with + * Documentation/firmware-guide/acpi/dsd/graph.rst to represent the CSI-2 + * connection graph. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static LIST_HEAD(acpi_mipi_crs_csi2_list); + +static void acpi_mipi_data_tag(acpi_handle handle, void *context) +{ +} + +/* Connection data extracted from one _CRS CSI-2 resource descriptor. */ +struct crs_csi2_connection { + struct list_head entry; + struct acpi_resource_csi2_serialbus csi2_data; + acpi_handle remote_handle; + char remote_name[]; +}; + +/* Data extracted from _CRS CSI-2 resource descriptors for one device. */ +struct crs_csi2 { + struct list_head entry; + acpi_handle handle; + struct acpi_device_software_nodes *swnodes; + struct list_head connections; + u32 port_count; +}; + +struct csi2_resources_walk_data { + acpi_handle handle; + struct list_head connections; +}; + +static acpi_status parse_csi2_resource(struct acpi_resource *res, void *context) +{ + struct csi2_resources_walk_data *crwd = context; + struct acpi_resource_csi2_serialbus *csi2_res; + struct acpi_resource_source *csi2_res_src; + u16 csi2_res_src_length; + struct crs_csi2_connection *conn; + acpi_handle remote_handle; + + if (res->type != ACPI_RESOURCE_TYPE_SERIAL_BUS) + return AE_OK; + + csi2_res = &res->data.csi2_serial_bus; + + if (csi2_res->type != ACPI_RESOURCE_SERIAL_TYPE_CSI2) + return AE_OK; + + csi2_res_src = &csi2_res->resource_source; + if (ACPI_FAILURE(acpi_get_handle(NULL, csi2_res_src->string_ptr, + &remote_handle))) { + acpi_handle_debug(crwd->handle, + "unable to find resource source\n"); + return AE_OK; + } + csi2_res_src_length = csi2_res_src->string_length; + if (!csi2_res_src_length) { + acpi_handle_debug(crwd->handle, + "invalid resource source string length\n"); + return AE_OK; + } + + conn = kmalloc(struct_size(conn, remote_name, csi2_res_src_length + 1), + GFP_KERNEL); + if (!conn) + return AE_OK; + + conn->csi2_data = *csi2_res; + strscpy(conn->remote_name, csi2_res_src->string_ptr, csi2_res_src_length); + conn->csi2_data.resource_source.string_ptr = conn->remote_name; + conn->remote_handle = remote_handle; + + list_add(&conn->entry, &crwd->connections); + + return AE_OK; +} + +static struct crs_csi2 *acpi_mipi_add_crs_csi2(acpi_handle handle, + struct list_head *list) +{ + struct crs_csi2 *csi2; + + csi2 = kzalloc(sizeof(*csi2), GFP_KERNEL); + if (!csi2) + return NULL; + + csi2->handle = handle; + INIT_LIST_HEAD(&csi2->connections); + csi2->port_count = 1; + + if (ACPI_FAILURE(acpi_attach_data(handle, acpi_mipi_data_tag, csi2))) { + kfree(csi2); + return NULL; + } + + list_add(&csi2->entry, list); + + return csi2; +} + +static struct crs_csi2 *acpi_mipi_get_crs_csi2(acpi_handle handle) +{ + struct crs_csi2 *csi2; + + if (ACPI_FAILURE(acpi_get_data_full(handle, acpi_mipi_data_tag, + (void **)&csi2, NULL))) + return NULL; + + return csi2; +} + +static void csi_csr2_release_connections(struct list_head *list) +{ + struct crs_csi2_connection *conn, *conn_tmp; + + list_for_each_entry_safe(conn, conn_tmp, list, entry) { + list_del(&conn->entry); + kfree(conn); + } +} + +static void acpi_mipi_del_crs_csi2(struct crs_csi2 *csi2) +{ + list_del(&csi2->entry); + acpi_detach_data(csi2->handle, acpi_mipi_data_tag); + kfree(csi2->swnodes); + csi_csr2_release_connections(&csi2->connections); + kfree(csi2); +} + +/** + * acpi_mipi_check_crs_csi2 - Look for CSI-2 resources in _CRS + * @handle: Device object handle to evaluate _CRS for. + * + * Find all CSI-2 resource descriptors in the given device's _CRS + * and collect them into a list. + */ +void acpi_mipi_check_crs_csi2(acpi_handle handle) +{ + struct csi2_resources_walk_data crwd = { + .handle = handle, + .connections = LIST_HEAD_INIT(crwd.connections), + }; + struct crs_csi2 *csi2; + + /* + * Avoid allocating _CRS CSI-2 objects for devices without any CSI-2 + * resource descriptions in _CRS to reduce overhead. + */ + acpi_walk_resources(handle, METHOD_NAME__CRS, parse_csi2_resource, &crwd); + if (list_empty(&crwd.connections)) + return; + + /* + * Create a _CRS CSI-2 entry to store the extracted connection + * information and add it to the global list. + */ + csi2 = acpi_mipi_add_crs_csi2(handle, &acpi_mipi_crs_csi2_list); + if (!csi2) { + csi_csr2_release_connections(&crwd.connections); + return; /* Nothing really can be done about this. */ + } + + list_replace(&crwd.connections, &csi2->connections); +} + +#define NO_CSI2_PORT (UINT_MAX - 1) + +static void alloc_crs_csi2_swnodes(struct crs_csi2 *csi2) +{ + size_t port_count = csi2->port_count; + struct acpi_device_software_nodes *swnodes; + size_t alloc_size; + unsigned int i; + + /* + * Allocate memory for ports, node pointers (number of nodes + + * 1 (guardian), nodes (root + number of ports * 2 (because for + * every port there is an endpoint)). + */ + if (check_mul_overflow(sizeof(*swnodes->ports) + + sizeof(*swnodes->nodes) * 2 + + sizeof(*swnodes->nodeptrs) * 2, + port_count, &alloc_size) || + check_add_overflow(sizeof(*swnodes) + + sizeof(*swnodes->nodes) + + sizeof(*swnodes->nodeptrs) * 2, + alloc_size, &alloc_size)) { + acpi_handle_info(csi2->handle, + "too many _CRS CSI-2 resource handles (%zu)", + port_count); + return; + } + + swnodes = kmalloc(alloc_size, GFP_KERNEL); + if (!swnodes) + return; + + swnodes->ports = (struct acpi_device_software_node_port *)(swnodes + 1); + swnodes->nodes = (struct software_node *)(swnodes->ports + port_count); + swnodes->nodeptrs = (const struct software_node **)(swnodes->nodes + 1 + + 2 * port_count); + swnodes->num_ports = port_count; + + for (i = 0; i < 2 * port_count + 1; i++) + swnodes->nodeptrs[i] = &swnodes->nodes[i]; + + swnodes->nodeptrs[i] = NULL; + + for (i = 0; i < port_count; i++) + swnodes->ports[i].port_nr = NO_CSI2_PORT; + + csi2->swnodes = swnodes; +} + +/** + * acpi_mipi_scan_crs_csi2 - Create ACPI _CRS CSI-2 software nodes + * + * Note that this function must be called before any struct acpi_device objects + * are bound to any ACPI drivers or scan handlers, so it cannot assume the + * existence of struct acpi_device objects for every device present in the ACPI + * namespace. + * + * acpi_scan_lock in scan.c must be held when calling this function. + */ +void acpi_mipi_scan_crs_csi2(void) +{ + struct crs_csi2 *csi2; + LIST_HEAD(aux_list); + + /* Count references to each ACPI handle in the CSI-2 connection graph. */ + list_for_each_entry(csi2, &acpi_mipi_crs_csi2_list, entry) { + struct crs_csi2_connection *conn; + + list_for_each_entry(conn, &csi2->connections, entry) { + struct crs_csi2 *remote_csi2; + + csi2->port_count++; + + remote_csi2 = acpi_mipi_get_crs_csi2(conn->remote_handle); + if (remote_csi2) { + remote_csi2->port_count++; + continue; + } + /* + * The remote endpoint has no _CRS CSI-2 list entry yet, + * so create one for it and add it to the list. + */ + acpi_mipi_add_crs_csi2(conn->remote_handle, &aux_list); + } + } + list_splice(&aux_list, &acpi_mipi_crs_csi2_list); + + /* Allocate software nodes for representing the CSI-2 information. */ + list_for_each_entry(csi2, &acpi_mipi_crs_csi2_list, entry) + alloc_crs_csi2_swnodes(csi2); +} + +/** + * acpi_mipi_crs_csi2_cleanup - Free _CRS CSI-2 temporary data + */ +void acpi_mipi_crs_csi2_cleanup(void) +{ + struct crs_csi2 *csi2, *csi2_tmp; + + list_for_each_entry_safe(csi2, csi2_tmp, &acpi_mipi_crs_csi2_list, entry) + acpi_mipi_del_crs_csi2(csi2); +} diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index fa5dd71a80fa..8e12c558b8db 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1976,7 +1976,7 @@ static void acpi_scan_init_hotplug(struct acpi_device *adev) } } -static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) +static u32 acpi_scan_check_dep(acpi_handle handle) { struct acpi_handle_list dep_devices; acpi_status status; @@ -1989,8 +1989,7 @@ static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) * 2. ACPI nodes describing USB ports. * Still, checking for _HID catches more then just these cases ... */ - if (!check_dep || !acpi_has_method(handle, "_DEP") || - !acpi_has_method(handle, "_HID")) + if (!acpi_has_method(handle, "_DEP") || !acpi_has_method(handle, "_HID")) return 0; status = acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices); @@ -2036,7 +2035,13 @@ static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) return count; } -static acpi_status acpi_bus_check_add(acpi_handle handle, bool check_dep, +static acpi_status acpi_scan_check_crs_csi2_cb(acpi_handle handle, u32 a, void *b, void **c) +{ + acpi_mipi_check_crs_csi2(handle); + return AE_OK; +} + +static acpi_status acpi_bus_check_add(acpi_handle handle, bool first_pass, struct acpi_device **adev_p) { struct acpi_device *device = acpi_fetch_acpi_dev(handle); @@ -2054,9 +2059,25 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, bool check_dep, if (acpi_device_should_be_hidden(handle)) return AE_OK; - /* Bail out if there are dependencies. */ - if (acpi_scan_check_dep(handle, check_dep) > 0) - return AE_CTRL_DEPTH; + if (first_pass) { + acpi_mipi_check_crs_csi2(handle); + + /* Bail out if there are dependencies. */ + if (acpi_scan_check_dep(handle) > 0) { + /* + * The entire CSI-2 connection graph needs to be + * extracted before any drivers or scan handlers + * are bound to struct device objects, so scan + * _CRS CSI-2 resource descriptors for all + * devices below the current handle. + */ + acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, + ACPI_UINT32_MAX, + acpi_scan_check_crs_csi2_cb, + NULL, NULL, NULL); + return AE_CTRL_DEPTH; + } + } fallthrough; case ACPI_TYPE_ANY: /* for ACPI_ROOT_OBJECT */ @@ -2079,10 +2100,10 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, bool check_dep, } /* - * If check_dep is true at this point, the device has no dependencies, + * If first_pass is true at this point, the device has no dependencies, * or the creation of the device object would have been postponed above. */ - acpi_add_single_object(&device, handle, type, !check_dep); + acpi_add_single_object(&device, handle, type, !first_pass); if (!device) return AE_CTRL_DEPTH; @@ -2494,12 +2515,21 @@ int acpi_bus_scan(acpi_handle handle) if (!device) return -ENODEV; + /* + * Allocate ACPI _CRS CSI-2 software nodes using information extracted + * from the _CRS CSI-2 resource descriptors during the ACPI namespace + * walk above. + */ + acpi_mipi_scan_crs_csi2(); + acpi_bus_attach(device, (void *)true); /* Pass 2: Enumerate all of the remaining devices. */ acpi_scan_postponed(); + acpi_mipi_crs_csi2_cleanup(); + return 0; } EXPORT_SYMBOL(acpi_bus_scan); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index afeed6e72049..f122fa1c10a8 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -366,6 +366,24 @@ struct acpi_device_data { struct acpi_gpio_mapping; +struct acpi_device_software_node_port { + unsigned int port_nr; +}; + +/** + * struct acpi_device_software_nodes - Software nodes for an ACPI device + * @nodes: Software nodes for root as well as ports and endpoints. + * @nodeprts: Array of software node pointers, for (un)registering them. + * @ports: Information related to each port and endpoint within a port. + * @num_ports: The number of ports. + */ +struct acpi_device_software_nodes { + struct software_node *nodes; + const struct software_node **nodeptrs; + struct acpi_device_software_node_port *ports; + unsigned int num_ports; +}; + /* Device */ struct acpi_device { u32 pld_crc; From 693c667b32ee1dd312000d4656b3383fffb3af2d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Nov 2023 17:16:26 +0100 Subject: [PATCH 0133/1562] ACPI: scan: Extract _CRS CSI-2 connection information into swnodes Use the connection information extracted from the _CRS CSI-2 resource descriptors for all devices that have them to populate port names and the "reg", "bus-type" and "remote-endpoint" properties in the software nodes representing the CSI-2 connection graph. Link: https://uefi.org/specs/ACPI/6.5/06_Device_Configuration.html#camera-serial-interface-csi-2-connection-resource-descriptor Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/mipi-disco-img.c | 153 +++++++++++++++++++++++++++++++++- include/acpi/acpi_bus.h | 53 ++++++++++++ 2 files changed, 205 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index 91281c8cb4f2..5ff72d83fad2 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -23,6 +23,8 @@ #include #include +#include + #include "internal.h" static LIST_HEAD(acpi_mipi_crs_csi2_list); @@ -237,6 +239,142 @@ static void alloc_crs_csi2_swnodes(struct crs_csi2 *csi2) csi2->swnodes = swnodes; } +#define ACPI_CRS_CSI2_PHY_TYPE_C 0 +#define ACPI_CRS_CSI2_PHY_TYPE_D 1 + +static unsigned int next_csi2_port_index(struct acpi_device_software_nodes *swnodes, + unsigned int port_nr) +{ + unsigned int i; + + for (i = 0; i < swnodes->num_ports; i++) { + struct acpi_device_software_node_port *port = &swnodes->ports[i]; + + if (port->port_nr == port_nr) + return i; + + if (port->port_nr == NO_CSI2_PORT) { + port->port_nr = port_nr; + return i; + } + } + + return NO_CSI2_PORT; +} + +/* Print graph port name into a buffer, return non-zero on failure. */ +#define GRAPH_PORT_NAME(var, num) \ + (snprintf((var), sizeof(var), SWNODE_GRAPH_PORT_NAME_FMT, (num)) >= \ + sizeof(var)) + +static void extract_crs_csi2_conn_info(acpi_handle local_handle, + struct acpi_device_software_nodes *local_swnodes, + struct crs_csi2_connection *conn) +{ + struct crs_csi2 *remote_csi2 = acpi_mipi_get_crs_csi2(conn->remote_handle); + struct acpi_device_software_nodes *remote_swnodes; + struct acpi_device_software_node_port *local_port, *remote_port; + struct software_node *local_node, *remote_node; + unsigned int local_index, remote_index; + unsigned int bus_type; + + /* + * If the previous steps have failed to make room for a _CRS CSI-2 + * representation for the remote end of the given connection, skip it. + */ + if (!remote_csi2) + return; + + remote_swnodes = remote_csi2->swnodes; + if (!remote_swnodes) + return; + + switch (conn->csi2_data.phy_type) { + case ACPI_CRS_CSI2_PHY_TYPE_C: + bus_type = V4L2_FWNODE_BUS_TYPE_CSI2_CPHY; + break; + + case ACPI_CRS_CSI2_PHY_TYPE_D: + bus_type = V4L2_FWNODE_BUS_TYPE_CSI2_DPHY; + break; + + default: + acpi_handle_info(local_handle, "unknown CSI-2 PHY type %u\n", + conn->csi2_data.phy_type); + return; + } + + local_index = next_csi2_port_index(local_swnodes, + conn->csi2_data.local_port_instance); + if (WARN_ON_ONCE(local_index >= local_swnodes->num_ports)) + return; + + remote_index = next_csi2_port_index(remote_swnodes, + conn->csi2_data.resource_source.index); + if (WARN_ON_ONCE(remote_index >= remote_swnodes->num_ports)) + return; + + local_port = &local_swnodes->ports[local_index]; + local_node = &local_swnodes->nodes[ACPI_DEVICE_SWNODE_EP(local_index)]; + local_port->crs_csi2_local = true; + + remote_port = &remote_swnodes->ports[remote_index]; + remote_node = &remote_swnodes->nodes[ACPI_DEVICE_SWNODE_EP(remote_index)]; + + local_port->remote_ep[0] = SOFTWARE_NODE_REFERENCE(remote_node); + remote_port->remote_ep[0] = SOFTWARE_NODE_REFERENCE(local_node); + + local_port->ep_props[ACPI_DEVICE_SWNODE_EP_REMOTE_EP] = + PROPERTY_ENTRY_REF_ARRAY("remote-endpoint", + local_port->remote_ep); + + local_port->ep_props[ACPI_DEVICE_SWNODE_EP_BUS_TYPE] = + PROPERTY_ENTRY_U32("bus-type", bus_type); + + local_port->ep_props[ACPI_DEVICE_SWNODE_EP_REG] = + PROPERTY_ENTRY_U32("reg", 0); + + local_port->port_props[ACPI_DEVICE_SWNODE_PORT_REG] = + PROPERTY_ENTRY_U32("reg", conn->csi2_data.local_port_instance); + + if (GRAPH_PORT_NAME(local_port->port_name, + conn->csi2_data.local_port_instance)) + acpi_handle_info(local_handle, "local port %u name too long", + conn->csi2_data.local_port_instance); + + remote_port->ep_props[ACPI_DEVICE_SWNODE_EP_REMOTE_EP] = + PROPERTY_ENTRY_REF_ARRAY("remote-endpoint", + remote_port->remote_ep); + + remote_port->ep_props[ACPI_DEVICE_SWNODE_EP_BUS_TYPE] = + PROPERTY_ENTRY_U32("bus-type", bus_type); + + remote_port->ep_props[ACPI_DEVICE_SWNODE_EP_REG] = + PROPERTY_ENTRY_U32("reg", 0); + + remote_port->port_props[ACPI_DEVICE_SWNODE_PORT_REG] = + PROPERTY_ENTRY_U32("reg", conn->csi2_data.resource_source.index); + + if (GRAPH_PORT_NAME(remote_port->port_name, + conn->csi2_data.resource_source.index)) + acpi_handle_info(local_handle, "remote port %u name too long", + conn->csi2_data.resource_source.index); +} + +static void prepare_crs_csi2_swnodes(struct crs_csi2 *csi2) +{ + struct acpi_device_software_nodes *local_swnodes = csi2->swnodes; + acpi_handle local_handle = csi2->handle; + struct crs_csi2_connection *conn; + + /* Bail out if the allocation of swnodes has failed. */ + if (!local_swnodes) + return; + + list_for_each_entry(conn, &csi2->connections, entry) + extract_crs_csi2_conn_info(local_handle, local_swnodes, conn); +} + /** * acpi_mipi_scan_crs_csi2 - Create ACPI _CRS CSI-2 software nodes * @@ -275,9 +413,22 @@ void acpi_mipi_scan_crs_csi2(void) } list_splice(&aux_list, &acpi_mipi_crs_csi2_list); - /* Allocate software nodes for representing the CSI-2 information. */ + /* + * Allocate software nodes for representing the CSI-2 information. + * + * This needs to be done for all of the list entries in one go, because + * they may point to each other without restrictions and the next step + * relies on the availability of swnodes memory for each list entry. + */ list_for_each_entry(csi2, &acpi_mipi_crs_csi2_list, entry) alloc_crs_csi2_swnodes(csi2); + + /* + * Set up software node properties using data from _CRS CSI-2 resource + * descriptors. + */ + list_for_each_entry(csi2, &acpi_mipi_crs_csi2_list, entry) + prepare_crs_csi2_swnodes(csi2); } /** diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index f122fa1c10a8..a7fa24f1af46 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -366,8 +366,61 @@ struct acpi_device_data { struct acpi_gpio_mapping; +#define ACPI_DEVICE_CSI2_DATA_LANES 8 + +#define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 + +enum acpi_device_swnode_port_props { + ACPI_DEVICE_SWNODE_PORT_REG, + ACPI_DEVICE_SWNODE_PORT_NUM_OF, + ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES +}; + +enum acpi_device_swnode_ep_props { + ACPI_DEVICE_SWNODE_EP_REMOTE_EP, + ACPI_DEVICE_SWNODE_EP_BUS_TYPE, + ACPI_DEVICE_SWNODE_EP_REG, + ACPI_DEVICE_SWNODE_EP_CLOCK_LANES, + ACPI_DEVICE_SWNODE_EP_DATA_LANES, + ACPI_DEVICE_SWNODE_EP_LANE_POLARITIES, + /* TX only */ + ACPI_DEVICE_SWNODE_EP_LINK_FREQUENCIES, + ACPI_DEVICE_SWNODE_EP_NUM_OF, + ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES +}; + +/* + * Each device has a root software node plus two times as many nodes as the + * number of CSI-2 ports. + */ +#define ACPI_DEVICE_SWNODE_PORT(port) (2 * (port) + 1) +#define ACPI_DEVICE_SWNODE_EP(endpoint) \ + (ACPI_DEVICE_SWNODE_PORT(endpoint) + 1) + +/** + * struct acpi_device_software_node_port - MIPI DisCo for Imaging CSI-2 port + * @port_name: Port name. + * @data_lanes: "data-lanes" property values. + * @lane_polarities: "lane-polarities" property values. + * @link_frequencies: "link_frequencies" property values. + * @port_nr: Port number. + * @crs_crs2_local: _CRS CSI2 record present (i.e. this is a transmitter one). + * @port_props: Port properties. + * @ep_props: Endpoint properties. + * @remote_ep: Reference to the remote endpoint. + */ struct acpi_device_software_node_port { + char port_name[ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH + 1]; + u32 data_lanes[ACPI_DEVICE_CSI2_DATA_LANES]; + u32 lane_polarities[ACPI_DEVICE_CSI2_DATA_LANES + 1 /* clock lane */]; + u64 link_frequencies[ACPI_DEVICE_CSI2_DATA_LANES]; unsigned int port_nr; + bool crs_csi2_local; + + struct property_entry port_props[ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES]; + struct property_entry ep_props[ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES]; + + struct software_node_ref_args remote_ep[1]; }; /** From 48c9996f1dfe92bd7318472651c9ad538d6d53b5 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 6 Nov 2023 17:16:37 +0100 Subject: [PATCH 0134/1562] device property: Add SOFTWARE_NODE() macro for defining software nodes Add SOFTWARE_NODE() macro in order to make defining software nodes look nicer. This is analogous to different PROPERTY_ENTRY_*() macros for defining properties. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Tested-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a8..97f901c0914e 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -489,6 +489,13 @@ struct software_node { const struct property_entry *properties; }; +#define SOFTWARE_NODE(_name_, _properties_, _parent_) \ + (struct software_node) { \ + .name = _name_, \ + .properties = _properties_, \ + .parent = _parent_, \ + } + bool is_software_node(const struct fwnode_handle *fwnode); const struct software_node * to_software_node(const struct fwnode_handle *fwnode); From a6cb0a611273767683d50fb908173b6f88052ce5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Nov 2023 20:19:42 +0100 Subject: [PATCH 0135/1562] ACPI: scan: Extract MIPI DisCo for Imaging data into swnodes Add information extracted from the MIPI DisCo for Imaging device properties to software nodes created during the CSI-2 connection graph discovery. Link: https://www.mipi.org/specifications/mipi-disco-imaging Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/internal.h | 1 + drivers/acpi/mipi-disco-img.c | 252 +++++++++++++++++++++++++++++++++- drivers/acpi/scan.c | 12 +- include/acpi/acpi_bus.h | 17 +++ 4 files changed, 278 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 959a2bc61916..86b670637f2f 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -282,6 +282,7 @@ static inline void acpi_init_lpit(void) { } void acpi_mipi_check_crs_csi2(acpi_handle handle); void acpi_mipi_scan_crs_csi2(void); +void acpi_mipi_init_crs_csi2_swnodes(void); void acpi_mipi_crs_csi2_cleanup(void); #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index 5ff72d83fad2..dcbaba91d4fe 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -6,12 +6,16 @@ * * Support MIPI DisCo for Imaging by parsing ACPI _CRS CSI-2 records defined in * Section 6.4.3.8.2.4 "Camera Serial Interface (CSI-2) Connection Resource - * Descriptor" of ACPI 6.5. + * Descriptor" of ACPI 6.5 and using device properties defined by the MIPI DisCo + * for Imaging specification. * * The implementation looks for the information in the ACPI namespace (CSI-2 * resource descriptors in _CRS) and constructs software nodes compatible with * Documentation/firmware-guide/acpi/dsd/graph.rst to represent the CSI-2 - * connection graph. + * connection graph. The software nodes are then populated with the data + * extracted from the _CRS CSI-2 resource descriptors and the MIPI DisCo + * for Imaging device properties present in _DSD for the ACPI device objects + * with CSI-2 connections. */ #include @@ -431,6 +435,250 @@ void acpi_mipi_scan_crs_csi2(void) prepare_crs_csi2_swnodes(csi2); } +/* + * Get the index of the next property in the property array, with a given + * maximum value. + */ +#define NEXT_PROPERTY(index, max) \ + (WARN_ON((index) > ACPI_DEVICE_SWNODE_##max) ? \ + ACPI_DEVICE_SWNODE_##max : (index)++) + +static void init_csi2_port_local(struct acpi_device *adev, + struct acpi_device_software_node_port *port, + struct fwnode_handle *port_fwnode, + unsigned int index) +{ + acpi_handle handle = acpi_device_handle(adev); + unsigned int num_link_freqs; + int ret; + + ret = fwnode_property_count_u64(port_fwnode, "mipi-img-link-frequencies"); + if (ret <= 0) + return; + + num_link_freqs = ret; + if (num_link_freqs > ACPI_DEVICE_CSI2_DATA_LANES) { + acpi_handle_info(handle, "Too many link frequencies: %u\n", + num_link_freqs); + num_link_freqs = ACPI_DEVICE_CSI2_DATA_LANES; + } + + ret = fwnode_property_read_u64_array(port_fwnode, + "mipi-img-link-frequencies", + port->link_frequencies, + num_link_freqs); + if (ret) { + acpi_handle_info(handle, "Unable to get link frequencies (%d)\n", + ret); + return; + } + + port->ep_props[NEXT_PROPERTY(index, EP_LINK_FREQUENCIES)] = + PROPERTY_ENTRY_U64_ARRAY_LEN("link-frequencies", + port->link_frequencies, + num_link_freqs); +} + +static void init_csi2_port(struct acpi_device *adev, + struct acpi_device_software_nodes *swnodes, + struct acpi_device_software_node_port *port, + struct fwnode_handle *port_fwnode, + unsigned int port_index) +{ + unsigned int ep_prop_index = ACPI_DEVICE_SWNODE_EP_CLOCK_LANES; + acpi_handle handle = acpi_device_handle(adev); + u8 val[ACPI_DEVICE_CSI2_DATA_LANES]; + int num_lanes = 0; + int ret; + + if (GRAPH_PORT_NAME(port->port_name, port->port_nr)) + return; + + swnodes->nodes[ACPI_DEVICE_SWNODE_PORT(port_index)] = + SOFTWARE_NODE(port->port_name, port->port_props, + &swnodes->nodes[ACPI_DEVICE_SWNODE_ROOT]); + + ret = fwnode_property_read_u8(port_fwnode, "mipi-img-clock-lane", val); + if (!ret) + port->ep_props[NEXT_PROPERTY(ep_prop_index, EP_CLOCK_LANES)] = + PROPERTY_ENTRY_U32("clock-lanes", val[0]); + + ret = fwnode_property_count_u8(port_fwnode, "mipi-img-data-lanes"); + if (ret > 0) { + num_lanes = ret; + + if (num_lanes > ACPI_DEVICE_CSI2_DATA_LANES) { + acpi_handle_info(handle, "Too many data lanes: %u\n", + num_lanes); + num_lanes = ACPI_DEVICE_CSI2_DATA_LANES; + } + + ret = fwnode_property_read_u8_array(port_fwnode, + "mipi-img-data-lanes", + val, num_lanes); + if (!ret) { + unsigned int i; + + for (i = 0; i < num_lanes; i++) + port->data_lanes[i] = val[i]; + + port->ep_props[NEXT_PROPERTY(ep_prop_index, EP_DATA_LANES)] = + PROPERTY_ENTRY_U32_ARRAY_LEN("data-lanes", + port->data_lanes, + num_lanes); + } + } + + ret = fwnode_property_count_u8(port_fwnode, "mipi-img-lane-polarities"); + if (ret < 0) { + acpi_handle_debug(handle, "Lane polarity bytes missing\n"); + } else if (ret * BITS_PER_TYPE(u8) < num_lanes + 1) { + acpi_handle_info(handle, "Too few lane polarity bytes (%lu vs. %d)\n", + ret * BITS_PER_TYPE(u8), num_lanes + 1); + } else { + unsigned long mask = 0; + int byte_count = ret; + unsigned int i; + + /* + * The total number of lanes is ACPI_DEVICE_CSI2_DATA_LANES + 1 + * (data lanes + clock lane). It is not expected to ever be + * greater than the number of bits in an unsigned long + * variable, but ensure that this is the case. + */ + BUILD_BUG_ON(BITS_PER_TYPE(unsigned long) <= ACPI_DEVICE_CSI2_DATA_LANES); + + if (byte_count > sizeof(mask)) { + acpi_handle_info(handle, "Too many lane polarities: %d\n", + byte_count); + byte_count = sizeof(mask); + } + fwnode_property_read_u8_array(port_fwnode, "mipi-img-lane-polarities", + val, byte_count); + + for (i = 0; i < byte_count; i++) + mask |= (unsigned long)val[i] << BITS_PER_TYPE(u8) * i; + + for (i = 0; i <= num_lanes; i++) + port->lane_polarities[i] = test_bit(i, &mask); + + port->ep_props[NEXT_PROPERTY(ep_prop_index, EP_LANE_POLARITIES)] = + PROPERTY_ENTRY_U32_ARRAY_LEN("lane-polarities", + port->lane_polarities, + num_lanes + 1); + } + + swnodes->nodes[ACPI_DEVICE_SWNODE_EP(port_index)] = + SOFTWARE_NODE("endpoint@0", swnodes->ports[port_index].ep_props, + &swnodes->nodes[ACPI_DEVICE_SWNODE_PORT(port_index)]); + + if (port->crs_csi2_local) + init_csi2_port_local(adev, port, port_fwnode, ep_prop_index); +} + +#define MIPI_IMG_PORT_PREFIX "mipi-img-port-" + +static struct fwnode_handle *get_mipi_port_handle(struct fwnode_handle *adev_fwnode, + unsigned int port_nr) +{ + char port_name[sizeof(MIPI_IMG_PORT_PREFIX) + 2]; + + if (snprintf(port_name, sizeof(port_name), "%s%u", + MIPI_IMG_PORT_PREFIX, port_nr) >= sizeof(port_name)) + return NULL; + + return fwnode_get_named_child_node(adev_fwnode, port_name); +} + +static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) +{ + struct acpi_buffer buffer = { .length = ACPI_ALLOCATE_BUFFER }; + struct acpi_device_software_nodes *swnodes = csi2->swnodes; + acpi_handle handle = csi2->handle; + struct fwnode_handle *adev_fwnode; + struct acpi_device *adev; + acpi_status status; + unsigned int i; + int ret; + + /* + * Bail out if the swnodes are not available (either they have not been + * allocated or they have been assigned to the device already). + */ + if (!swnodes) + return; + + adev = acpi_fetch_acpi_dev(handle); + if (!adev) + return; + + adev_fwnode = acpi_fwnode_handle(adev); + + status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) { + acpi_handle_info(handle, "Unable to get the path name\n"); + return; + } + + swnodes->nodes[ACPI_DEVICE_SWNODE_ROOT] = + SOFTWARE_NODE(buffer.pointer, swnodes->dev_props, NULL); + + for (i = 0; i < swnodes->num_ports; i++) { + struct acpi_device_software_node_port *port = &swnodes->ports[i]; + struct fwnode_handle *port_fwnode; + + /* + * The MIPI DisCo for Imaging specification defines _DSD device + * properties for providing CSI-2 port parameters that can be + * accessed through the generic device properties framework. To + * access them, it is first necessary to find the data node + * representing the port under the given ACPI device object. + */ + port_fwnode = get_mipi_port_handle(adev_fwnode, port->port_nr); + if (!port_fwnode) { + acpi_handle_info(handle, + "MIPI port name too long for port %u\n", + port->port_nr); + continue; + } + + init_csi2_port(adev, swnodes, port, port_fwnode, i); + + fwnode_handle_put(port_fwnode); + } + + ret = software_node_register_node_group(swnodes->nodeptrs); + if (ret < 0) { + acpi_handle_info(handle, + "Unable to register software nodes (%d)\n", ret); + return; + } + + adev->swnodes = swnodes; + adev_fwnode->secondary = software_node_fwnode(swnodes->nodes); + + /* + * Prevents the swnodes from this csi2 entry from being assigned again + * or freed prematurely. + */ + csi2->swnodes = NULL; +} + +/** + * acpi_mipi_init_crs_csi2_swnodes - Initialize _CRS CSI-2 software nodes + * + * Use MIPI DisCo for Imaging device properties to finalize the initialization + * of CSI-2 software nodes for all ACPI device objects that have been already + * enumerated. + */ +void acpi_mipi_init_crs_csi2_swnodes(void) +{ + struct crs_csi2 *csi2, *csi2_tmp; + + list_for_each_entry_safe(csi2, csi2_tmp, &acpi_mipi_crs_csi2_list, entry) + init_crs_csi2_swnodes(csi2); +} + /** * acpi_mipi_crs_csi2_cleanup - Free _CRS CSI-2 temporary data */ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 8e12c558b8db..b8df04779904 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2447,6 +2447,13 @@ static void acpi_scan_postponed_branch(acpi_handle handle) acpi_walk_namespace(ACPI_TYPE_ANY, handle, ACPI_UINT32_MAX, acpi_bus_check_add_2, NULL, NULL, (void **)&adev); + + /* + * Populate the ACPI _CRS CSI-2 software nodes for the ACPI devices that + * have been added above. + */ + acpi_mipi_init_crs_csi2_swnodes(); + acpi_bus_attach(adev, NULL); } @@ -2516,11 +2523,12 @@ int acpi_bus_scan(acpi_handle handle) return -ENODEV; /* - * Allocate ACPI _CRS CSI-2 software nodes using information extracted + * Set up ACPI _CRS CSI-2 software nodes using information extracted * from the _CRS CSI-2 resource descriptors during the ACPI namespace - * walk above. + * walk above and MIPI DisCo for Imaging device properties. */ acpi_mipi_scan_crs_csi2(); + acpi_mipi_init_crs_csi2_swnodes(); acpi_bus_attach(device, (void *)true); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index a7fa24f1af46..c299fb974e49 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -366,10 +366,24 @@ struct acpi_device_data { struct acpi_gpio_mapping; +#define ACPI_DEVICE_SWNODE_ROOT 0 + +/* + * The maximum expected number of CSI-2 data lanes. + * + * This number is not expected to ever have to be equal to or greater than the + * number of bits in an unsigned long variable, but if it needs to be increased + * above that limit, code will need to be adjusted accordingly. + */ #define ACPI_DEVICE_CSI2_DATA_LANES 8 #define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 +enum acpi_device_swnode_dev_props { + ACPI_DEVICE_SWNODE_DEV_NUM_OF, + ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES +}; + enum acpi_device_swnode_port_props { ACPI_DEVICE_SWNODE_PORT_REG, ACPI_DEVICE_SWNODE_PORT_NUM_OF, @@ -425,12 +439,14 @@ struct acpi_device_software_node_port { /** * struct acpi_device_software_nodes - Software nodes for an ACPI device + * @dev_props: Device properties. * @nodes: Software nodes for root as well as ports and endpoints. * @nodeprts: Array of software node pointers, for (un)registering them. * @ports: Information related to each port and endpoint within a port. * @num_ports: The number of ports. */ struct acpi_device_software_nodes { + struct property_entry dev_props[ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES]; struct software_node *nodes; const struct software_node **nodeptrs; struct acpi_device_software_node_port *ports; @@ -455,6 +471,7 @@ struct acpi_device { struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; + struct acpi_device_software_nodes *swnodes; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; From f533e43a2a3117cc59886cbcd66ca32e42cf1ea9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 6 Nov 2023 17:28:40 +0100 Subject: [PATCH 0136/1562] ACPI: property: Dig "rotation" property for devices with CSI2 _CRS Find the "rotation" property value for devices with _CRS CSI-2 resource descriptors and use it to add the "rotation" property to the software nodes representing the CSI-2 connection graph. That value typically comes from the _PLD (Physical Location of Device) object if it is present for the given device. This way, camera sensor drivers that know the "rotation" property do not need to care about _PLD on systems using ACPI. Signed-off-by: Sakari Ailus [ rjw: Changelog edits, file rename ] Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/mipi-disco-img.c | 17 +++++++++++++++++ include/acpi/acpi_bus.h | 1 + 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index dcbaba91d4fe..b2c7a4922804 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -595,6 +595,7 @@ static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) struct acpi_buffer buffer = { .length = ACPI_ALLOCATE_BUFFER }; struct acpi_device_software_nodes *swnodes = csi2->swnodes; acpi_handle handle = csi2->handle; + unsigned int prop_index = 0; struct fwnode_handle *adev_fwnode; struct acpi_device *adev; acpi_status status; @@ -614,6 +615,22 @@ static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) adev_fwnode = acpi_fwnode_handle(adev); + /* + * If the "rotation" property is not present, but _PLD is there, + * evaluate it to get the "rotation" value. + */ + if (!fwnode_property_present(adev_fwnode, "rotation")) { + struct acpi_pld_info *pld; + + status = acpi_get_physical_device_location(handle, &pld); + if (ACPI_SUCCESS(status)) { + swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_ROTATION)] = + PROPERTY_ENTRY_U32("rotation", + pld->rotation * 45U); + kfree(pld); + } + } + status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); if (ACPI_FAILURE(status)) { acpi_handle_info(handle, "Unable to get the path name\n"); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index c299fb974e49..1e5d2e2c3444 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -380,6 +380,7 @@ struct acpi_gpio_mapping; #define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 enum acpi_device_swnode_dev_props { + ACPI_DEVICE_SWNODE_DEV_ROTATION, ACPI_DEVICE_SWNODE_DEV_NUM_OF, ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES }; From 4cd57d6d527c3570827a6eb8bd790ae216a78ed9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 6 Nov 2023 17:31:18 +0100 Subject: [PATCH 0137/1562] ACPI: property: Replicate DT-aligned u32 properties from DisCo for Imaging MIPI DisCo for Imaging defines properties for camera sensors that functionally align with DT equivalents. Replicate these properties in the ACPI device swnodes so the code using the corresponding DT properties already does not need to be updated to deal with their MIPI counterparts directly. The replicated properties are: "mipi-img-clock-frequency" -> "clock-frequency" "mipi-img-led-max-current" -> "led-max-microamp" "mipi-img-flash-max-current" -> "flash-max-microamp" "mipi-img-flash-max-timeout" -> "flash-max-timeout-us" Signed-off-by: Sakari Ailus [ rjw: Changelog edits, removal of redundant braces ] Signed-off-by: Rafael J. Wysocki Tested-by: Sakari Ailus --- drivers/acpi/mipi-disco-img.c | 17 +++++++++++++++++ include/acpi/acpi_bus.h | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index b2c7a4922804..cad72d1fc127 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -600,6 +600,7 @@ static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) struct acpi_device *adev; acpi_status status; unsigned int i; + u32 val; int ret; /* @@ -631,6 +632,22 @@ static void init_crs_csi2_swnodes(struct crs_csi2 *csi2) } } + if (!fwnode_property_read_u32(adev_fwnode, "mipi-img-clock-frequency", &val)) + swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_CLOCK_FREQUENCY)] = + PROPERTY_ENTRY_U32("clock-frequency", val); + + if (!fwnode_property_read_u32(adev_fwnode, "mipi-img-led-max-current", &val)) + swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_LED_MAX_MICROAMP)] = + PROPERTY_ENTRY_U32("led-max-microamp", val); + + if (!fwnode_property_read_u32(adev_fwnode, "mipi-img-flash-max-current", &val)) + swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_FLASH_MAX_MICROAMP)] = + PROPERTY_ENTRY_U32("flash-max-microamp", val); + + if (!fwnode_property_read_u32(adev_fwnode, "mipi-img-flash-max-timeout-us", &val)) + swnodes->dev_props[NEXT_PROPERTY(prop_index, DEV_FLASH_MAX_TIMEOUT_US)] = + PROPERTY_ENTRY_U32("flash-max-timeout-us", val); + status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); if (ACPI_FAILURE(status)) { acpi_handle_info(handle, "Unable to get the path name\n"); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 1e5d2e2c3444..989ea623b1c2 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -381,6 +381,10 @@ struct acpi_gpio_mapping; enum acpi_device_swnode_dev_props { ACPI_DEVICE_SWNODE_DEV_ROTATION, + ACPI_DEVICE_SWNODE_DEV_CLOCK_FREQUENCY, + ACPI_DEVICE_SWNODE_DEV_LED_MAX_MICROAMP, + ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_MICROAMP, + ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_TIMEOUT_US, ACPI_DEVICE_SWNODE_DEV_NUM_OF, ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES }; From 44844db91397d3d94589f3c0c855be02daeebdb3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 9 Nov 2023 16:01:48 +0100 Subject: [PATCH 0138/1562] thermal: core: Add trip thresholds for trip crossing detection The trip crossing detection in handle_thermal_trip() does not work correctly in the cases when a trip point is crossed on the way up and then the zone temperature stays above its low temperature (that is, its temperature decreased by its hysteresis). The trip temperature may be passed by the zone temperature subsequently in that case, even multiple times, but that does not count as the trip crossing as long as the zone temperature does not fall below the trip's low temperature or, in other words, until the trip is crossed on the way down. |-----------low--------high------------| |<--------->| | hyst | | | | -|--> crossed on the way up | <---|-- crossed on the way down However, handle_thermal_trip() will invoke thermal_notify_tz_trip_up() every time the trip temperature is passed by the zone temperature on the way up regardless of whether or not the trip has been crossed on the way down yet. Moreover, it will not call thermal_notify_tz_trip_down() if the last zone temperature was between the trip's temperature and its low temperature, so some "trip crossed on the way down" events may not be reported. To address this issue, introduce trip thresholds equal to either the temperature of the given trip, or its low temperature, such that if the trip's threshold is passed by the zone temperature on the way up, its value will be set to the trip's low temperature and thermal_notify_tz_trip_up() will be called, and if the trip's threshold is passed by the zone temperature on the way down, its value will be set to the trip's temperature (high) and thermal_notify_tz_trip_down() will be called. Accordingly, if the threshold is passed on the way up, it cannot be passed on the way up again until its passed on the way down and if it is passed on the way down, it cannot be passed on the way down again until it is passed on the way up which guarantees correct triggering of trip crossing notifications. If the last temperature of the zone is invalid, the trip's threshold will be set depending of the zone's current temperature: If that temperature is above the trip's temperature, its threshold will be set to its low temperature or otherwise its threshold will be set to its (high) temperature. Because the zone temperature is initially set to invalid and tz->last_temperature is only updated by update_temperature(), this is sufficient to set the correct initial threshold values for all trips. Link: https://lore.kernel.org/all/20220718145038.1114379-4-daniel.lezcano@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 43 ++++++++++++++++++++++++++++------ include/linux/thermal.h | 2 ++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 9c17d35ccbbd..625ba07cbe2f 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -345,22 +345,51 @@ static void handle_critical_trips(struct thermal_zone_device *tz, } static void handle_thermal_trip(struct thermal_zone_device *tz, - const struct thermal_trip *trip) + struct thermal_trip *trip) { if (trip->temperature == THERMAL_TEMP_INVALID) return; - if (tz->last_temperature != THERMAL_TEMP_INVALID) { - if (tz->last_temperature < trip->temperature && - tz->temperature >= trip->temperature) + if (tz->last_temperature == THERMAL_TEMP_INVALID) { + /* Initialization. */ + trip->threshold = trip->temperature; + if (tz->temperature >= trip->threshold) + trip->threshold -= trip->hysteresis; + } else if (tz->last_temperature < trip->threshold) { + /* + * The trip threshold is equal to the trip temperature, unless + * the latter has changed in the meantime. In either case, + * the trip is crossed if the current zone temperature is at + * least equal to its temperature, but otherwise ensure that + * the threshold and the trip temperature will be equal. + */ + if (tz->temperature >= trip->temperature) { thermal_notify_tz_trip_up(tz->id, thermal_zone_trip_id(tz, trip), tz->temperature); - if (tz->last_temperature >= trip->temperature && - tz->temperature < trip->temperature - trip->hysteresis) + trip->threshold = trip->temperature - trip->hysteresis; + } else { + trip->threshold = trip->temperature; + } + } else { + /* + * The previous zone temperature was above or equal to the trip + * threshold, which would be equal to the "low temperature" of + * the trip (its temperature minus its hysteresis), unless the + * trip temperature or hysteresis had changed. In either case, + * the trip is crossed if the current zone temperature is below + * the low temperature of the trip, but otherwise ensure that + * the trip threshold will be equal to the low temperature of + * the trip. + */ + if (tz->temperature < trip->temperature - trip->hysteresis) { thermal_notify_tz_trip_down(tz->id, thermal_zone_trip_id(tz, trip), tz->temperature); + trip->threshold = trip->temperature; + } else { + trip->threshold = trip->temperature - trip->hysteresis; + } } if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT) @@ -403,7 +432,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz) void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { - const struct thermal_trip *trip; + struct thermal_trip *trip; if (atomic_read(&in_suspend)) return; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cee814d5d1ac..1f9ee869f9f9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -57,12 +57,14 @@ enum thermal_notify_event { * struct thermal_trip - representation of a point in temperature domain * @temperature: temperature value in miliCelsius * @hysteresis: relative hysteresis in miliCelsius + * @threshold: trip crossing notification threshold miliCelsius * @type: trip point type * @priv: pointer to driver data associated with this trip */ struct thermal_trip { int temperature; int hysteresis; + int threshold; enum thermal_trip_type type; void *priv; }; From 52304886ea49ee662589aff05925ef226c17a6a6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 26 Oct 2023 15:53:03 +0200 Subject: [PATCH 0139/1562] ACPI: video: Add comment about acpi_video_backlight_use_native() usage Add a comment explaining that acpi_video_backlight_use_native() MUST only be used by GPU drivers and that it must NOT be used on other places. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- include/acpi/video.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/acpi/video.h b/include/acpi/video.h index 4230392b5b0b..3d538d4178ab 100644 --- a/include/acpi/video.h +++ b/include/acpi/video.h @@ -75,6 +75,15 @@ static inline enum acpi_backlight_type acpi_video_get_backlight_type(void) return __acpi_video_get_backlight_type(false, NULL); } +/* + * This function MUST only be called by GPU drivers to check if the driver + * should register a backlight class device. This function not only checks + * if a GPU native backlight device should be registered it *also* tells + * the ACPI video-detect code that native GPU backlight control is available. + * Therefor calling this from any place other then the GPU driver is wrong! + * To check if GPU native backlight control is used in other places instead use: + * if (acpi_video_get_backlight_type() == acpi_backlight_native) { ... } + */ static inline bool acpi_video_backlight_use_native(void) { return __acpi_video_get_backlight_type(true, NULL) == acpi_backlight_native; From c7add369b4cc599db336ca67578a052c5b0f0891 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 15 Nov 2023 18:48:11 +0100 Subject: [PATCH 0140/1562] ACPI: video: Drop should_check_lcd_flag() Since commit 3dbc80a3e4c5 ("ACPI: video: Make backlight class device registration a separate step (v2)") acpi_video# backlights are no longer automatically registered. Instead they now only get registered when the GPU/KMS driver calls acpi_video_register_backlight() which it only does when it has detected an internal LCD panel. This fixes the issue of sometimes a non-working acpi_video# backlight showing up on Desktops / HDMI-sticks without an internal LCD display in a more complete and robust manner then the LCD flag check which gets enabled by the should_check_lcd_flag() helper does. Therefor the should_check_lcd_flag() helper is no longer necessary. The lcd_only flag itself is still necessary to only register a single backlight device (for the right output) on the ESPRIMO Mobile M9410 which has 2 ACPI video connector nodes with a _BCM control method, which is the issue for which the flag was originally introduced in commit e50b9be14ab0 ("ACPI / video: only register backlight for LCD device"). Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 56 +-------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index d321ca7160d9..5eded14f8853 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -67,7 +67,7 @@ MODULE_PARM_DESC(hw_changes_brightness, static bool device_id_scheme = false; module_param(device_id_scheme, bool, 0444); -static int only_lcd = -1; +static int only_lcd; module_param(only_lcd, int, 0444); static bool may_report_brightness_keys; @@ -2141,57 +2141,6 @@ static int __init intel_opregion_present(void) return opregion; } -/* Check if the chassis-type indicates there is no builtin LCD panel */ -static bool dmi_is_desktop(void) -{ - const char *chassis_type; - unsigned long type; - - chassis_type = dmi_get_system_info(DMI_CHASSIS_TYPE); - if (!chassis_type) - return false; - - if (kstrtoul(chassis_type, 10, &type) != 0) - return false; - - switch (type) { - case 0x03: /* Desktop */ - case 0x04: /* Low Profile Desktop */ - case 0x05: /* Pizza Box */ - case 0x06: /* Mini Tower */ - case 0x07: /* Tower */ - case 0x10: /* Lunch Box */ - case 0x11: /* Main Server Chassis */ - return true; - } - - return false; -} - -/* - * We're seeing a lot of bogus backlight interfaces on newer machines - * without a LCD such as desktops, servers and HDMI sticks. Checking the - * lcd flag fixes this, enable this by default on any machines which are: - * 1. Win8 ready (where we also prefer the native backlight driver, so - * normally the acpi_video code should not register there anyways); *and* - * 2.1 Report a desktop/server DMI chassis-type, or - * 2.2 Are an ACPI-reduced-hardware platform (and thus won't use the EC for - backlight control) - */ -static bool should_check_lcd_flag(void) -{ - if (!acpi_osi_is_win8()) - return false; - - if (dmi_is_desktop()) - return true; - - if (acpi_reduced_hardware()) - return true; - - return false; -} - int acpi_video_register(void) { int ret = 0; @@ -2205,9 +2154,6 @@ int acpi_video_register(void) goto leave; } - if (only_lcd == -1) - only_lcd = should_check_lcd_flag(); - dmi_check_system(video_dmi_table); ret = acpi_bus_register_driver(&acpi_video_bus); From 6d392d8daa7514a431678521c2af8be10fc31bc1 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 14 Nov 2023 14:06:11 +0200 Subject: [PATCH 0141/1562] ACPI: Run USB4 _OSC() first with query bit set The platform can deny certain tunneling from the OS and it does that by clearing the control bits it does not want the OS to get and returning with OSC_CAPABILITIES_MASK_ERROR bit set. Currently we do not handle this properly so if this happens, for example when the platform denies PCIe tunneling, we just fail the whole negotiation and revert back to what the Thunderbolt driver is doing to figure out whether the controller is running firmware connection manager or not. However, we should honor what the platform returns. For this reason run the USB4 _OSC() first with query bit set, and then use the returned control double word (that may contain some of the bits cleared by the platform) and run it second time with query bit clear. While there, remove an extra space from the assignment of the control double word. Reported-by: NaamaX Shachar Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 72e64c0718c9..569bd15f211b 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -408,7 +408,7 @@ static void acpi_bus_decode_usb_osc(const char *msg, u32 bits) static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A"; static void acpi_bus_osc_negotiate_usb_control(void) { - u32 capbuf[3]; + u32 capbuf[3], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_usb_uuid_str, .rev = 1, @@ -428,7 +428,12 @@ static void acpi_bus_osc_negotiate_usb_control(void) control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING | OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN; - capbuf[OSC_QUERY_DWORD] = 0; + /* + * Run _OSC first with query bit set, trying to get control over + * all tunneling. The platform can then clear out bits in the + * control dword that it does not want to grant to the OS. + */ + capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = control; @@ -441,8 +446,29 @@ static void acpi_bus_osc_negotiate_usb_control(void) goto out_free; } + /* + * Run _OSC again now with query bit clear and the control dword + * matching what the platform granted (which may not have all + * the control bits set). + */ + capbuf_ret = context.ret.pointer; + + capbuf[OSC_QUERY_DWORD] = 0; + capbuf[OSC_CONTROL_DWORD] = capbuf_ret[OSC_CONTROL_DWORD]; + + kfree(context.ret.pointer); + + status = acpi_run_osc(handle, &context); + if (ACPI_FAILURE(status)) + return; + + if (context.ret.length != sizeof(capbuf)) { + pr_info("USB4 _OSC: returned invalid length buffer\n"); + goto out_free; + } + osc_sb_native_usb4_control = - control & acpi_osc_ctx_get_pci_control(&context); + control & acpi_osc_ctx_get_pci_control(&context); acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control); acpi_bus_decode_usb_osc("USB4 _OSC: OS controls", From afe576a62062cf944372fff2b6510b621ec454f2 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 20 Nov 2023 16:45:38 -0500 Subject: [PATCH 0142/1562] MAINTAINERS: add an entry for the lockdown LSM While lockdown has been present in the kernel for a while, it is missing a MAINTAINERS entry for some reason. Signed-off-by: Matthew Garrett Signed-off-by: Paul Moore --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 34187ece7330..ec70e163e458 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12447,6 +12447,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: Documentation/admin-guide/LSM/LoadPin.rst F: security/loadpin/ +LOCKDOWN SECURITY MODULE +L: linux-security-module@vger.kernel.org +S: Odd Fixes +T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git +F: security/lockdown/ + LOCKING PRIMITIVES M: Peter Zijlstra M: Ingo Molnar From 1712ed62153125e62d4d1e0ca68d35387e6a6993 Mon Sep 17 00:00:00 2001 From: Jacob Satterfield Date: Fri, 3 Nov 2023 17:29:51 +0000 Subject: [PATCH 0143/1562] selinux: refactor avtab_node comparisons In four separate functions within avtab, the same comparison logic is used. The only difference is how the result is handled or whether there is a unique specifier value to be checked for or used. Extracting this functionality into the avtab_node_cmp() function unifies the comparison logic between searching and insertion and gets rid of duplicative code so that the implementation is easier to maintain. Signed-off-by: Jacob Satterfield Reviewed-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/ss/avtab.c | 101 +++++++++++++++--------------------- 1 file changed, 41 insertions(+), 60 deletions(-) diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 8751a602ead2..697eb4352439 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -96,12 +96,34 @@ avtab_insert_node(struct avtab *h, struct avtab_node **dst, return newnode; } +static int avtab_node_cmp(const struct avtab_key *key1, + const struct avtab_key *key2) +{ + u16 specified = key1->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); + + if (key1->source_type == key2->source_type && + key1->target_type == key2->target_type && + key1->target_class == key2->target_class && + (specified & key2->specified)) + return 0; + if (key1->source_type < key2->source_type) + return -1; + if (key1->source_type == key2->source_type && + key1->target_type < key2->target_type) + return -1; + if (key1->source_type == key2->source_type && + key1->target_type == key2->target_type && + key1->target_class < key2->target_class) + return -1; + return 1; +} + static int avtab_insert(struct avtab *h, const struct avtab_key *key, const struct avtab_datum *datum) { u32 hvalue; struct avtab_node *prev, *cur, *newnode; - u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); + int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return -EINVAL; @@ -110,23 +132,11 @@ static int avtab_insert(struct avtab *h, const struct avtab_key *key, for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class == cur->key.target_class && - (specified & cur->key.specified)) { - /* extended perms may not be unique */ - if (specified & AVTAB_XPERMS) - break; + cmp = avtab_node_cmp(key, &cur->key); + /* extended perms may not be unique */ + if (cmp == 0 && !(key->specified & AVTAB_XPERMS)) return -EEXIST; - } - if (key->source_type < cur->key.source_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type < cur->key.target_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class < cur->key.target_class) + if (cmp <= 0) break; } @@ -148,7 +158,7 @@ struct avtab_node *avtab_insert_nonunique(struct avtab *h, { u32 hvalue; struct avtab_node *prev, *cur; - u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); + int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return NULL; @@ -156,19 +166,8 @@ struct avtab_node *avtab_insert_nonunique(struct avtab *h, for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class == cur->key.target_class && - (specified & cur->key.specified)) - break; - if (key->source_type < cur->key.source_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type < cur->key.target_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class < cur->key.target_class) + cmp = avtab_node_cmp(key, &cur->key); + if (cmp <= 0) break; } return avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue], @@ -183,7 +182,7 @@ struct avtab_node *avtab_search_node(struct avtab *h, { u32 hvalue; struct avtab_node *cur; - u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); + int cmp; if (!h || !h->nslot) return NULL; @@ -191,20 +190,10 @@ struct avtab_node *avtab_search_node(struct avtab *h, hvalue = avtab_hash(key, h->mask); for (cur = h->htable[hvalue]; cur; cur = cur->next) { - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class == cur->key.target_class && - (specified & cur->key.specified)) + cmp = avtab_node_cmp(key, &cur->key); + if (cmp == 0) return cur; - - if (key->source_type < cur->key.source_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type < cur->key.target_type) - break; - if (key->source_type == cur->key.source_type && - key->target_type == cur->key.target_type && - key->target_class < cur->key.target_class) + if (cmp < 0) break; } return NULL; @@ -213,27 +202,19 @@ struct avtab_node *avtab_search_node(struct avtab *h, struct avtab_node* avtab_search_node_next(struct avtab_node *node, u16 specified) { + struct avtab_key tmp_key; struct avtab_node *cur; + int cmp; if (!node) return NULL; - - specified &= ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); + tmp_key = node->key; + tmp_key.specified = specified; for (cur = node->next; cur; cur = cur->next) { - if (node->key.source_type == cur->key.source_type && - node->key.target_type == cur->key.target_type && - node->key.target_class == cur->key.target_class && - (specified & cur->key.specified)) + cmp = avtab_node_cmp(&tmp_key, &cur->key); + if (cmp == 0) return cur; - - if (node->key.source_type < cur->key.source_type) - break; - if (node->key.source_type == cur->key.source_type && - node->key.target_type < cur->key.target_type) - break; - if (node->key.source_type == cur->key.source_type && - node->key.target_type == cur->key.target_type && - node->key.target_class < cur->key.target_class) + if (cmp < 0) break; } return NULL; From 4137f324cb29a689e8519d8f7f52d3443bac934b Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 21 Nov 2023 13:21:34 +0100 Subject: [PATCH 0144/1562] MAINTAINERS: spi-nor: add myself as maintainer After being a reviewer for a while, add myself as a maintainer for the spi-nor subsystem. Signed-off-by: Michael Walle Acked-by: Richard Weinberger Acked-by: Miquel Raynal Link: https://lore.kernel.org/r/20231121122134.1952738-1-michael@walle.cc Signed-off-by: Tudor Ambarus --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..79e7d727022a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20426,7 +20426,7 @@ F: drivers/pinctrl/spear/ SPI NOR SUBSYSTEM M: Tudor Ambarus M: Pratyush Yadav -R: Michael Walle +M: Michael Walle L: linux-mtd@lists.infradead.org S: Maintained W: http://www.linux-mtd.infradead.org/ From f47507988145185aef5d0e7a0e28dbf6e7776f29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Oct 2023 22:05:23 +0200 Subject: [PATCH 0145/1562] thermal: ACPI: Move the ACPI thermal library to drivers/acpi/ The ACPI thermal library contains functions that can be used to retrieve trip point temperature values through the platform firmware for various types of trip points. Each of these functions basically evaluates a specific ACPI object, checks if the value produced by it is reasonable and returns it (or THERMAL_TEMP_INVALID if anything fails). It made sense to hold it in drivers/thermal/ so long as it was only used by the code in that directory, but since it is also going to be used by the ACPI thermal driver located in drivers/acpi/, move it to the latter in order to keep the code related to evaluating ACPI objects defined in the specification proper together. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 5 +++++ drivers/acpi/Makefile | 1 + drivers/{thermal/thermal_acpi.c => acpi/thermal_lib.c} | 0 drivers/thermal/Kconfig | 4 ---- drivers/thermal/Makefile | 1 - drivers/thermal/intel/Kconfig | 2 +- drivers/thermal/intel/int340x_thermal/Kconfig | 2 +- include/linux/acpi.h | 7 +++++++ include/linux/thermal.h | 7 ------- 9 files changed, 15 insertions(+), 14 deletions(-) rename drivers/{thermal/thermal_acpi.c => acpi/thermal_lib.c} (100%) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index f819e760ff19..6f2bfcf7645c 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -61,6 +61,10 @@ config ACPI_CCA_REQUIRED config ACPI_TABLE_LIB bool +config ACPI_THERMAL_LIB + depends on THERMAL + bool + config ACPI_DEBUGGER bool "AML debugger interface" select ACPI_DEBUG @@ -327,6 +331,7 @@ config ACPI_THERMAL tristate "Thermal Zone" depends on ACPI_PROCESSOR select THERMAL + select ACPI_THERMAL_LIB default y help This driver supports ACPI thermal zones. Most mobile and diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index eaa09bf52f17..2a52083704e6 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_ACPI_TAD) += acpi_tad.o obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI) += container.o +obj-$(CONFIG_ACPI_THERMAL_LIB) += thermal_lib.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o obj-$(CONFIG_ACPI_PLATFORM_PROFILE) += platform_profile.o obj-$(CONFIG_ACPI_NFIT) += nfit/ diff --git a/drivers/thermal/thermal_acpi.c b/drivers/acpi/thermal_lib.c similarity index 100% rename from drivers/thermal/thermal_acpi.c rename to drivers/acpi/thermal_lib.c diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index c81a00fbca7d..59883502eff4 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -76,10 +76,6 @@ config THERMAL_OF Say 'Y' here if you need to build thermal infrastructure based on device tree. -config THERMAL_ACPI - depends on ACPI - bool - config THERMAL_WRITABLE_TRIPS bool "Enable writable trip points" help diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index c934cab309ae..a8318d671036 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -13,7 +13,6 @@ thermal_sys-$(CONFIG_THERMAL_NETLINK) += thermal_netlink.o # interface to/from other layers providing sensors thermal_sys-$(CONFIG_THERMAL_HWMON) += thermal_hwmon.o thermal_sys-$(CONFIG_THERMAL_OF) += thermal_of.o -thermal_sys-$(CONFIG_THERMAL_ACPI) += thermal_acpi.o # governors CFLAGS_gov_power_allocator.o := -I$(src) diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index ecd7e07eece0..b43953b5539f 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -85,7 +85,7 @@ config INTEL_BXT_PMIC_THERMAL config INTEL_PCH_THERMAL tristate "Intel PCH Thermal Reporting Driver" depends on X86 && PCI - select THERMAL_ACPI if ACPI + select ACPI_THERMAL_LIB if ACPI help Enable this to support thermal reporting on certain intel PCHs. Thermal reporting device will provide temperature reading, diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig index 300ea53e9b33..e76b13e44d03 100644 --- a/drivers/thermal/intel/int340x_thermal/Kconfig +++ b/drivers/thermal/intel/int340x_thermal/Kconfig @@ -9,7 +9,7 @@ config INT340X_THERMAL select THERMAL_GOV_USER_SPACE select ACPI_THERMAL_REL select ACPI_FAN - select THERMAL_ACPI + select ACPI_THERMAL_LIB select INTEL_SOC_DTS_IOSF_CORE select INTEL_TCC select PROC_THERMAL_MMIO_RAPL if POWERCAP diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 54189e0e5f41..b63d7811c728 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -424,6 +424,13 @@ extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); +#ifdef CONFIG_ACPI_THERMAL_LIB +int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); +int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); +#endif + #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cee814d5d1ac..35f620059456 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -294,13 +294,6 @@ int thermal_zone_get_num_trips(struct thermal_zone_device *tz); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); -#ifdef CONFIG_THERMAL_ACPI -int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); -int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); -int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); -int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); -#endif - #ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, From 6908097aa5a7bd0c66c0b7ae9dd994b6ef62be8c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Oct 2023 22:06:52 +0200 Subject: [PATCH 0146/1562] ACPI: thermal_lib: Add functions returning temperature in deci-Kelvin Because the ACPI thermal driver generally operates temperature values in deci-Kelvin, it needs the library functions returning temperature for various trip point types to use deci-Kelvin too. To address that, arrange the ACPI thermal library code in three levels of functions where the high-level ones will return temperature in milli-Celsius, as needed by the thermal core and the majority of thermal drivers, the mid-level ones will return temperature in deci-Kelvin and will be called internally by the corresponding high- level functions, and all of the mid-level functions will call the same low-level one, acpi_trip_temp(), to actually evaluate ACPI objects to retrieve themperature values from the platform firmware. Going forward, this will allow the ACPI thermal driver to use the mid-level functions to provide temperature values needed by it, so as to reduce code duplication related to evaluating trip temperature ACPI control methods. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal_lib.c | 75 ++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/thermal_lib.c b/drivers/acpi/thermal_lib.c index 43eaf0f2ff49..02bf570141af 100644 --- a/drivers/acpi/thermal_lib.c +++ b/drivers/acpi/thermal_lib.c @@ -3,8 +3,8 @@ * Copyright 2023 Linaro Limited * Copyright 2023 Intel Corporation * - * Library routines for populating a generic thermal trip point structure - * with data obtained by evaluating a specific object in the ACPI Namespace. + * Library routines for retrieving trip point temperature values from the + * platform firmware via ACPI. */ #include #include @@ -17,11 +17,11 @@ * firmware. Any values out of these boundaries may be considered * bogus and we can assume the firmware has no data to provide. */ -#define TEMP_MIN_DECIK 2180 -#define TEMP_MAX_DECIK 4480 +#define TEMP_MIN_DECIK 2180ULL +#define TEMP_MAX_DECIK 4480ULL -static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name, - int *ret_temp) +static int acpi_trip_temp(struct acpi_device *adev, char *obj_name, + int *ret_temp) { unsigned long long temp; acpi_status status; @@ -33,7 +33,7 @@ static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name, } if (temp >= TEMP_MIN_DECIK && temp <= TEMP_MAX_DECIK) { - *ret_temp = deci_kelvin_to_millicelsius(temp); + *ret_temp = temp; } else { acpi_handle_debug(adev->handle, "%s result %llu out of range\n", obj_name, temp); @@ -43,6 +43,44 @@ static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name, return 0; } +int acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp) +{ + char obj_name[] = {'_', 'A', 'C', '0' + id, '\0'}; + + if (id < 0 || id > 9) + return -EINVAL; + + return acpi_trip_temp(adev, obj_name, ret_temp); +} + +int acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp) +{ + return acpi_trip_temp(adev, "_PSV", ret_temp); +} + +int acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp) +{ + return acpi_trip_temp(adev, "_HOT", ret_temp); +} + +int acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp) +{ + return acpi_trip_temp(adev, "_CRT", ret_temp); +} + +static int thermal_temp(int error, int temp_decik, int *ret_temp) +{ + if (error) + return error; + + if (temp_decik == THERMAL_TEMP_INVALID) + *ret_temp = THERMAL_TEMP_INVALID; + else + *ret_temp = deci_kelvin_to_millicelsius(temp_decik); + + return 0; +} + /** * thermal_acpi_active_trip_temp - Retrieve active trip point temperature * @adev: Target thermal zone ACPI device object. @@ -57,12 +95,10 @@ static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name, */ int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp) { - char obj_name[] = {'_', 'A', 'C', '0' + id, '\0'}; + int temp_decik; + int ret = acpi_active_trip_temp(adev, id, &temp_decik); - if (id < 0 || id > 9) - return -EINVAL; - - return thermal_acpi_trip_temp(adev, obj_name, ret_temp); + return thermal_temp(ret, temp_decik, ret_temp); } EXPORT_SYMBOL_GPL(thermal_acpi_active_trip_temp); @@ -78,7 +114,10 @@ EXPORT_SYMBOL_GPL(thermal_acpi_active_trip_temp); */ int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp) { - return thermal_acpi_trip_temp(adev, "_PSV", ret_temp); + int temp_decik; + int ret = acpi_passive_trip_temp(adev, &temp_decik); + + return thermal_temp(ret, temp_decik, ret_temp); } EXPORT_SYMBOL_GPL(thermal_acpi_passive_trip_temp); @@ -95,7 +134,10 @@ EXPORT_SYMBOL_GPL(thermal_acpi_passive_trip_temp); */ int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp) { - return thermal_acpi_trip_temp(adev, "_HOT", ret_temp); + int temp_decik; + int ret = acpi_hot_trip_temp(adev, &temp_decik); + + return thermal_temp(ret, temp_decik, ret_temp); } EXPORT_SYMBOL_GPL(thermal_acpi_hot_trip_temp); @@ -111,6 +153,9 @@ EXPORT_SYMBOL_GPL(thermal_acpi_hot_trip_temp); */ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp) { - return thermal_acpi_trip_temp(adev, "_CRT", ret_temp); + int temp_decik; + int ret = acpi_critical_trip_temp(adev, &temp_decik); + + return thermal_temp(ret, temp_decik, ret_temp); } EXPORT_SYMBOL_GPL(thermal_acpi_critical_trip_temp); From 9c8647224e9fabb765019193aa43c054a638f808 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Oct 2023 22:12:33 +0200 Subject: [PATCH 0147/1562] ACPI: thermal: Use library functions to obtain trip point temperature values Modify the ACPI thermal driver to use functions from the ACPI thermal library to obtain trip point temperature values instead of duplicating them locally. Among other things, this requires the functions in question to be exported to it, because it can be built as a module. It effectively changes the behavior of the driver to treat temperature values out of the reasonable range (-55 centigrade to 175 centigrade) as invalid, but there is no other expected functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 5 ++++ drivers/acpi/thermal.c | 57 +++++++++++++++++--------------------- drivers/acpi/thermal_lib.c | 4 +++ 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 866c7c4ed233..a3728f70a795 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -85,6 +85,11 @@ bool acpi_scan_is_offline(struct acpi_device *adev, bool uevent); acpi_status acpi_sysfs_table_handler(u32 event, void *table, void *context); void acpi_scan_table_notify(void); +int acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); +int acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); +int acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); +int acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); + /* -------------------------------------------------------------------------- Device Node Initialization / Removal -------------------------------------------------------------------------- */ diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index f74d81abdbfc..6c29a266dbd0 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -31,6 +31,8 @@ #include #include +#include "internal.h" + #define ACPI_THERMAL_CLASS "thermal_zone" #define ACPI_THERMAL_DEVICE_NAME "Thermal Zone" #define ACPI_THERMAL_NOTIFY_TEMPERATURE 0x80 @@ -188,24 +190,19 @@ static int active_trip_index(struct acpi_thermal *tz, static long get_passive_temp(struct acpi_thermal *tz) { - unsigned long long tmp; - acpi_status status; + int temp; - status = acpi_evaluate_integer(tz->device->handle, "_PSV", NULL, &tmp); - if (ACPI_FAILURE(status)) + if (acpi_passive_trip_temp(tz->device, &temp)) return THERMAL_TEMP_INVALID; - return tmp; + return temp; } static long get_active_temp(struct acpi_thermal *tz, int index) { - char method[] = { '_', 'A', 'C', '0' + index, '\0' }; - unsigned long long tmp; - acpi_status status; + int temp; - status = acpi_evaluate_integer(tz->device->handle, method, NULL, &tmp); - if (ACPI_FAILURE(status)) + if (acpi_active_trip_temp(tz->device, index, &temp)) return THERMAL_TEMP_INVALID; /* @@ -215,10 +212,10 @@ static long get_active_temp(struct acpi_thermal *tz, int index) if (act > 0) { unsigned long long override = celsius_to_deci_kelvin(act); - if (tmp > override) - tmp = override; + if (temp > override) + return override; } - return tmp; + return temp; } static void acpi_thermal_update_trip(struct acpi_thermal *tz, @@ -339,13 +336,12 @@ static void acpi_thermal_trips_update(struct acpi_thermal *tz, u32 event) dev_name(&adev->dev), event, 0); } -static long acpi_thermal_get_critical_trip(struct acpi_thermal *tz) +static int acpi_thermal_get_critical_trip(struct acpi_thermal *tz) { - unsigned long long tmp; - acpi_status status; + int temp; if (crt > 0) { - tmp = celsius_to_deci_kelvin(crt); + temp = celsius_to_deci_kelvin(crt); goto set; } if (crt == -1) { @@ -353,38 +349,34 @@ static long acpi_thermal_get_critical_trip(struct acpi_thermal *tz) return THERMAL_TEMP_INVALID; } - status = acpi_evaluate_integer(tz->device->handle, "_CRT", NULL, &tmp); - if (ACPI_FAILURE(status)) { - acpi_handle_debug(tz->device->handle, "No critical threshold\n"); + if (acpi_critical_trip_temp(tz->device, &temp)) return THERMAL_TEMP_INVALID; - } - if (tmp <= 2732) { + + if (temp <= 2732) { /* * Below zero (Celsius) values clearly aren't right for sure, * so discard them as invalid. */ - pr_info(FW_BUG "Invalid critical threshold (%llu)\n", tmp); + pr_info(FW_BUG "Invalid critical threshold (%d)\n", temp); return THERMAL_TEMP_INVALID; } set: - acpi_handle_debug(tz->device->handle, "Critical threshold [%llu]\n", tmp); - return tmp; + acpi_handle_debug(tz->device->handle, "Critical threshold [%d]\n", temp); + return temp; } -static long acpi_thermal_get_hot_trip(struct acpi_thermal *tz) +static int acpi_thermal_get_hot_trip(struct acpi_thermal *tz) { - unsigned long long tmp; - acpi_status status; + int temp; - status = acpi_evaluate_integer(tz->device->handle, "_HOT", NULL, &tmp); - if (ACPI_FAILURE(status)) { + if (acpi_hot_trip_temp(tz->device, &temp) || temp == THERMAL_TEMP_INVALID) { acpi_handle_debug(tz->device->handle, "No hot threshold\n"); return THERMAL_TEMP_INVALID; } - acpi_handle_debug(tz->device->handle, "Hot threshold [%llu]\n", tmp); - return tmp; + acpi_handle_debug(tz->device->handle, "Hot threshold [%d]\n", temp); + return temp; } static bool passive_trip_params_init(struct acpi_thermal *tz) @@ -1142,6 +1134,7 @@ static void __exit acpi_thermal_exit(void) module_init(acpi_thermal_init); module_exit(acpi_thermal_exit); +MODULE_IMPORT_NS(ACPI_THERMAL); MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Thermal Zone Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/acpi/thermal_lib.c b/drivers/acpi/thermal_lib.c index 02bf570141af..646ff6bda6dd 100644 --- a/drivers/acpi/thermal_lib.c +++ b/drivers/acpi/thermal_lib.c @@ -52,21 +52,25 @@ int acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp) return acpi_trip_temp(adev, obj_name, ret_temp); } +EXPORT_SYMBOL_NS_GPL(acpi_active_trip_temp, ACPI_THERMAL); int acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp) { return acpi_trip_temp(adev, "_PSV", ret_temp); } +EXPORT_SYMBOL_NS_GPL(acpi_passive_trip_temp, ACPI_THERMAL); int acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp) { return acpi_trip_temp(adev, "_HOT", ret_temp); } +EXPORT_SYMBOL_NS_GPL(acpi_hot_trip_temp, ACPI_THERMAL); int acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp) { return acpi_trip_temp(adev, "_CRT", ret_temp); } +EXPORT_SYMBOL_NS_GPL(acpi_critical_trip_temp, ACPI_THERMAL); static int thermal_temp(int error, int temp_decik, int *ret_temp) { From cb21746b179c7c64faeb777a8d1d901f7d680a28 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 21 Nov 2023 09:08:26 +0200 Subject: [PATCH 0148/1562] ACPI: scan: Fix an error message in DisCo for Imaging support The recently merged DisCo for Imaging support used a wrong printk specifier in printing a message. Fix it by using %zu instead of %lu. Also use "bits" instead of "bytes" as these are indeed bytes. Fixes: a6cb0a611273 ("ACPI: scan: Extract MIPI DisCo for Imaging data into swnodes") Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/acpi/mipi-disco-img.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index cad72d1fc127..7286cf4579bc 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -533,7 +533,7 @@ static void init_csi2_port(struct acpi_device *adev, if (ret < 0) { acpi_handle_debug(handle, "Lane polarity bytes missing\n"); } else if (ret * BITS_PER_TYPE(u8) < num_lanes + 1) { - acpi_handle_info(handle, "Too few lane polarity bytes (%lu vs. %d)\n", + acpi_handle_info(handle, "Too few lane polarity bits (%zu vs. %d)\n", ret * BITS_PER_TYPE(u8), num_lanes + 1); } else { unsigned long mask = 0; From 49277a5b76373e630075ff7d32fc0f9f51294f24 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Nov 2023 21:18:40 -0500 Subject: [PATCH 0149/1562] workqueue: Move workqueue_set_unbound_cpumask() and its helpers inside CONFIG_SYSFS Commit fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") makes workqueue_set_unbound_cpumask() static as it is not used elsewhere in the kernel. However, this triggers a kernel test robot warning about 'workqueue_set_unbound_cpumask' defined but not used when CONFIG_SYS isn't defined. It happens that workqueue_set_unbound_cpumask() is only called when CONFIG_SYS is defined. Move workqueue_set_unbound_cpumask() and its helpers inside the CONFIG_SYSFS compilation block to avoid the warning. There is no functional change. Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311130831.uh0AoCd1-lkp@intel.com/ Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 102 ++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd9d34eacd78..2fc585d3d6ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4417,19 +4417,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) mutex_unlock(&ctx->wq->mutex); } -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { @@ -5833,44 +5820,6 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) return ret; } -/** - * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask - * @cpumask: the cpumask to set - * - * The low-level workqueues cpumask is a global cpumask that limits - * the affinity of all unbound workqueues. This function check the @cpumask - * and apply it to all unbound workqueues and updates all pwqs of them. - * - * Return: 0 - Success - * -EINVAL - Invalid @cpumask - * -ENOMEM - Failed to allocate memory for attrs or pwqs. - */ -static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) -{ - int ret = -EINVAL; - - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. - */ - cpumask_and(cpumask, cpumask, cpu_possible_mask); - if (!cpumask_empty(cpumask)) { - apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: - apply_wqattrs_unlock(); - } - - return ret; -} - /** * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask @@ -6027,6 +5976,19 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); +} + static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -6203,6 +6165,44 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Return: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + ret = workqueue_apply_unbound_cpumask(cpumask); + +out_unlock: + apply_wqattrs_unlock(); + } + + return ret; +} + static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { From ccd45faf4973746c4f30ea41eec864e5cf191099 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Thu, 9 Nov 2023 16:49:25 +0300 Subject: [PATCH 0150/1562] ACPI: video: check for error while searching for backlight device parent If acpi_get_parent() called in acpi_video_dev_register_backlight() fails, for example, because acpi_ut_acquire_mutex() fails inside acpi_get_parent), this can lead to incorrect (uninitialized) acpi_parent handle being passed to acpi_get_pci_dev() for detecting the parent pci device. Check acpi_get_parent() result and set parent device only in case of success. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 9661e92c10a9 ("acpi: tie ACPI backlight devices to PCI devices if possible") Signed-off-by: Nikita Kiryushin Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 5eded14f8853..f71287072719 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -1717,12 +1717,12 @@ static void acpi_video_dev_register_backlight(struct acpi_video_device *device) return; count++; - acpi_get_parent(device->dev->handle, &acpi_parent); - - pdev = acpi_get_pci_dev(acpi_parent); - if (pdev) { - parent = &pdev->dev; - pci_dev_put(pdev); + if (ACPI_SUCCESS(acpi_get_parent(device->dev->handle, &acpi_parent))) { + pdev = acpi_get_pci_dev(acpi_parent); + if (pdev) { + parent = &pdev->dev; + pci_dev_put(pdev); + } } memset(&props, 0, sizeof(struct backlight_properties)); From 709f3cbd652e50e96a9d9c62a300313b636e3f6f Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Thu, 16 Nov 2023 16:47:22 -0600 Subject: [PATCH 0151/1562] ACPI: APEI: EINJ: Refactor available_error_type_show() OSPM can discover the error injection capabilities of the platform by executing GET_ERROR_TYPE error injection action.[1] The action returns a DWORD representing a bitmap of platform supported error injections.[2] The available_error_type_show() function determines the bits set within this DWORD and provides a verbose output, from einj_error_type_string array, through /sys/kernel/debug/apei/einj/available_error_type file. The function however, assumes one to one correspondence between an error's position in the bitmap and its array entry offset. Consequently, some errors like Vendor Defined Error Type fail this assumption and will incorrectly be shown as not supported, even if their corresponding bit is set in the bitmap and they have an entry in the array. Navigate around the issue by converting einj_error_type_string into an array of structures with a predetermined mask for all error types corresponding to their bit position in the DWORD returned by GET_ERROR_TYPE action. The same breaks the aforementioned assumption resulting in all supported error types by a platform being outputted through the above available_error_type file. [1] ACPI specification 6.5, Table 18.25 [2] ACPI specification 6.5, Table 18.30 Suggested-by: Alexey Kardashevskiy Signed-off-by: Avadhut Naik Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/einj.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 013eb621dc92..506fe319379f 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -577,38 +577,39 @@ static u64 error_param2; static u64 error_param3; static u64 error_param4; static struct dentry *einj_debug_dir; -static const char * const einj_error_type_string[] = { - "0x00000001\tProcessor Correctable\n", - "0x00000002\tProcessor Uncorrectable non-fatal\n", - "0x00000004\tProcessor Uncorrectable fatal\n", - "0x00000008\tMemory Correctable\n", - "0x00000010\tMemory Uncorrectable non-fatal\n", - "0x00000020\tMemory Uncorrectable fatal\n", - "0x00000040\tPCI Express Correctable\n", - "0x00000080\tPCI Express Uncorrectable non-fatal\n", - "0x00000100\tPCI Express Uncorrectable fatal\n", - "0x00000200\tPlatform Correctable\n", - "0x00000400\tPlatform Uncorrectable non-fatal\n", - "0x00000800\tPlatform Uncorrectable fatal\n", - "0x00001000\tCXL.cache Protocol Correctable\n", - "0x00002000\tCXL.cache Protocol Uncorrectable non-fatal\n", - "0x00004000\tCXL.cache Protocol Uncorrectable fatal\n", - "0x00008000\tCXL.mem Protocol Correctable\n", - "0x00010000\tCXL.mem Protocol Uncorrectable non-fatal\n", - "0x00020000\tCXL.mem Protocol Uncorrectable fatal\n", +static struct { u32 mask; const char *str; } const einj_error_type_string[] = { + { BIT(0), "Processor Correctable" }, + { BIT(1), "Processor Uncorrectable non-fatal" }, + { BIT(2), "Processor Uncorrectable fatal" }, + { BIT(3), "Memory Correctable" }, + { BIT(4), "Memory Uncorrectable non-fatal" }, + { BIT(5), "Memory Uncorrectable fatal" }, + { BIT(6), "PCI Express Correctable" }, + { BIT(7), "PCI Express Uncorrectable non-fatal" }, + { BIT(8), "PCI Express Uncorrectable fatal" }, + { BIT(9), "Platform Correctable" }, + { BIT(10), "Platform Uncorrectable non-fatal" }, + { BIT(11), "Platform Uncorrectable fatal"}, + { BIT(12), "CXL.cache Protocol Correctable" }, + { BIT(13), "CXL.cache Protocol Uncorrectable non-fatal" }, + { BIT(14), "CXL.cache Protocol Uncorrectable fatal" }, + { BIT(15), "CXL.mem Protocol Correctable" }, + { BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" }, + { BIT(17), "CXL.mem Protocol Uncorrectable fatal" }, }; static int available_error_type_show(struct seq_file *m, void *v) { int rc; - u32 available_error_type = 0; + u32 error_type = 0; - rc = einj_get_available_error_type(&available_error_type); + rc = einj_get_available_error_type(&error_type); if (rc) return rc; for (int pos = 0; pos < ARRAY_SIZE(einj_error_type_string); pos++) - if (available_error_type & BIT(pos)) - seq_puts(m, einj_error_type_string[pos]); + if (error_type & einj_error_type_string[pos].mask) + seq_printf(m, "0x%08x\t%s\n", einj_error_type_string[pos].mask, + einj_error_type_string[pos].str); return 0; } From 71cd3c636404ceb08226b5095ca36a04eb578ca1 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Thu, 16 Nov 2023 16:47:23 -0600 Subject: [PATCH 0152/1562] fs: debugfs: Add write functionality to debugfs blobs Currently, debugfs_create_blob() creates read-only debugfs binary blob files. In some cases, however, userspace tools need to write variable length data structures into predetermined memory addresses. An example is when injecting Vendor-defined error types through the einj module. In such cases, the functionality to write to these blob files in debugfs would be desired since the mapping aspect can be handled within the modules with userspace tools only needing to write into the blob files. Implement a write callback to enable writing to these blob files, created in debugfs, by owners only. Signed-off-by: Avadhut Naik Reviewed-by: Alexey Kardashevskiy Reviewed-by: Greg Kroah-Hartman Reviewed-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- fs/debugfs/file.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index c45e8c2d62e1..00b834269aad 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1008,17 +1008,35 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, return r; } +static ssize_t write_file_blob(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct debugfs_blob_wrapper *blob = file->private_data; + struct dentry *dentry = F_DENTRY(file); + ssize_t r; + + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, + count); + + debugfs_file_put(dentry); + return r; +} + static const struct file_operations fops_blob = { .read = read_file_blob, + .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** - * debugfs_create_blob - create a debugfs file that is used to read a binary blob + * debugfs_create_blob - create a debugfs file that is used to read and write + * a binary blob * @name: a pointer to a string containing the name of the file to create. - * @mode: the read permission that the file should have (other permissions are - * masked out) + * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. @@ -1027,7 +1045,7 @@ static const struct file_operations fops_blob = { * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be - * read from. Writing is not supported. + * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is @@ -1042,7 +1060,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); From 0706526ec7704dcd046239078ac175d11a88a95e Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Thu, 16 Nov 2023 16:47:24 -0600 Subject: [PATCH 0153/1562] platform/chrome: cros_ec_debugfs: Fix permissions for panicinfo The debugfs_create_blob() function has been used to create read-only binary blobs in debugfs. The function filters out permissions, other than S_IRUSR, S_IRGRP and S_IROTH, provided while creating the blobs. The very behavior though is being changed through previous patch in the series (fs: debugfs: Add write functionality to debugfs blobs) which makes the binary blobs writable by owners. Thus, all permissions provided while creating the blobs, except S_IRUSR,S_IWUSR, S_IRGRP, S_IROTH, will be filtered by debugfs_create_blob(). As such, rectify the permissions of panicinfo file since the S_IFREG flag was anyways being filtered out by debugfs_create_blob(). Moreover, the very flag will always be set be set for the panicinfo file through __debugfs_create_file(). Signed-off-by: Avadhut Naik Reviewed-by: Greg Kroah-Hartman Reviewed-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- drivers/platform/chrome/cros_ec_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c index 091fdc154d79..6bf6f0e7b597 100644 --- a/drivers/platform/chrome/cros_ec_debugfs.c +++ b/drivers/platform/chrome/cros_ec_debugfs.c @@ -454,7 +454,7 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info) debug_info->panicinfo_blob.data = data; debug_info->panicinfo_blob.size = ret; - debugfs_create_blob("panicinfo", S_IFREG | 0444, debug_info->dir, + debugfs_create_blob("panicinfo", 0444, debug_info->dir, &debug_info->panicinfo_blob); return 0; From 22fca621bd1bbc5366e9cd941eb1c07c0963d984 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Thu, 16 Nov 2023 16:47:25 -0600 Subject: [PATCH 0154/1562] ACPI: APEI: EINJ: Add support for vendor defined error types Vendor-Defined Error types are supported by the platform apart from standard error types if bit 31 is set in the output of GET_ERROR_TYPE Error Injection Action.[1] While the errors themselves and the length of their associated "OEM Defined data structure" might vary between vendors, the physical address of this structure can be computed through vendor_extension and length fields of "SET_ERROR_TYPE_WITH_ADDRESS" and "Vendor Error Type Extension" Structures respectively.[2][3] Currently, however, the einj module only computes the physical address of Vendor Error Type Extension Structure. Neither does it compute the physical address of OEM Defined structure nor does it establish the memory mapping required for injecting Vendor-defined errors. Consequently, userspace tools have to establish the very mapping through /dev/mem, nopat kernel parameter and system calls like mmap/munmap initially before injecting Vendor-defined errors. Circumvent the issue by computing the physical address of OEM Defined data structure and establishing the required mapping with the structure. Create a new file "oem_error", if the system supports Vendor-defined errors, to export this mapping, through debugfs_create_blob(). Userspace tools can then populate their respective OEM Defined structure instances and just write to the file as part of injecting Vendor-defined Errors. Similarly, the tools can also read from the file if the system firmware provides some information through the OEM defined structure after error injection. [1] ACPI specification 6.5, section 18.6.4 [2] ACPI specification 6.5, Table 18.31 [3] ACPI specification 6.5, Table 18.32 Suggested-by: Yazen Ghannam Signed-off-by: Avadhut Naik Reviewed-by: Tony Luck Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/einj.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 506fe319379f..89fb9331c611 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -73,6 +73,7 @@ static u32 notrigger; static u32 vendor_flags; static struct debugfs_blob_wrapper vendor_blob; +static struct debugfs_blob_wrapper vendor_errors; static char vendor_dev[64]; /* @@ -182,6 +183,21 @@ static int einj_timedout(u64 *t) return 0; } +static void get_oem_vendor_struct(u64 paddr, int offset, + struct vendor_error_type_extension *v) +{ + unsigned long vendor_size; + u64 target_pa = paddr + offset + sizeof(struct vendor_error_type_extension); + + vendor_size = v->length - sizeof(struct vendor_error_type_extension); + + if (vendor_size) + vendor_errors.data = acpi_os_map_memory(target_pa, vendor_size); + + if (vendor_errors.data) + vendor_errors.size = vendor_size; +} + static void check_vendor_extension(u64 paddr, struct set_error_type_with_address *v5param) { @@ -194,6 +210,7 @@ static void check_vendor_extension(u64 paddr, v = acpi_os_map_iomem(paddr + offset, sizeof(*v)); if (!v) return; + get_oem_vendor_struct(paddr, offset, v); sbdf = v->pcie_sbdf; sprintf(vendor_dev, "%x:%x:%x.%x vendor_id=%x device_id=%x rev_id=%x\n", sbdf >> 24, (sbdf >> 16) & 0xff, @@ -596,6 +613,7 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = { { BIT(15), "CXL.mem Protocol Correctable" }, { BIT(16), "CXL.mem Protocol Uncorrectable non-fatal" }, { BIT(17), "CXL.mem Protocol Uncorrectable fatal" }, + { BIT(31), "Vendor Defined Error Types" }, }; static int available_error_type_show(struct seq_file *m, void *v) @@ -768,6 +786,10 @@ static int __init einj_init(void) einj_debug_dir, &vendor_flags); } + if (vendor_errors.size) + debugfs_create_blob("oem_error", 0600, einj_debug_dir, + &vendor_errors); + pr_info("Error INJection is initialized.\n"); return 0; @@ -793,6 +815,8 @@ static void __exit einj_exit(void) sizeof(struct einj_parameter); acpi_os_unmap_iomem(einj_param, size); + if (vendor_errors.size) + acpi_os_unmap_memory(vendor_errors.data, vendor_errors.size); } einj_exec_ctx_init(&ctx); apei_exec_post_unmap_gars(&ctx); From 9862ec7ac1cbc6eb5ee4a045b5d5b8edbb2f7e68 Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Wed, 11 Oct 2023 23:46:37 +0500 Subject: [PATCH 0155/1562] FS:JFS:UBSAN:array-index-out-of-bounds in dbAdjTree Syzkaller reported the following issue: UBSAN: array-index-out-of-bounds in fs/jfs/jfs_dmap.c:2867:6 index 196694 is out of range for type 's8[1365]' (aka 'signed char[1365]') CPU: 1 PID: 109 Comm: jfsCommit Not tainted 6.6.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:217 [inline] __ubsan_handle_out_of_bounds+0x11c/0x150 lib/ubsan.c:348 dbAdjTree+0x474/0x4f0 fs/jfs/jfs_dmap.c:2867 dbJoin+0x210/0x2d0 fs/jfs/jfs_dmap.c:2834 dbFreeBits+0x4eb/0xda0 fs/jfs/jfs_dmap.c:2331 dbFreeDmap fs/jfs/jfs_dmap.c:2080 [inline] dbFree+0x343/0x650 fs/jfs/jfs_dmap.c:402 txFreeMap+0x798/0xd50 fs/jfs/jfs_txnmgr.c:2534 txUpdateMap+0x342/0x9e0 txLazyCommit fs/jfs/jfs_txnmgr.c:2664 [inline] jfs_lazycommit+0x47a/0xb70 fs/jfs/jfs_txnmgr.c:2732 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 ================================================================================ Kernel panic - not syncing: UBSAN: panic_on_warn set ... CPU: 1 PID: 109 Comm: jfsCommit Not tainted 6.6.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 panic+0x30f/0x770 kernel/panic.c:340 check_panic_on_warn+0x82/0xa0 kernel/panic.c:236 ubsan_epilogue lib/ubsan.c:223 [inline] __ubsan_handle_out_of_bounds+0x13c/0x150 lib/ubsan.c:348 dbAdjTree+0x474/0x4f0 fs/jfs/jfs_dmap.c:2867 dbJoin+0x210/0x2d0 fs/jfs/jfs_dmap.c:2834 dbFreeBits+0x4eb/0xda0 fs/jfs/jfs_dmap.c:2331 dbFreeDmap fs/jfs/jfs_dmap.c:2080 [inline] dbFree+0x343/0x650 fs/jfs/jfs_dmap.c:402 txFreeMap+0x798/0xd50 fs/jfs/jfs_txnmgr.c:2534 txUpdateMap+0x342/0x9e0 txLazyCommit fs/jfs/jfs_txnmgr.c:2664 [inline] jfs_lazycommit+0x47a/0xb70 fs/jfs/jfs_txnmgr.c:2732 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 Kernel Offset: disabled Rebooting in 86400 seconds.. The issue is caused when the value of lp becomes greater than CTLTREESIZE which is the max size of stree. Adding a simple check solves this issue. Dave: As the function returns a void, good error handling would require a more intrusive code reorganization, so I modified Osama's patch at use WARN_ON_ONCE for lack of a cleaner option. The patch is tested via syzbot. Reported-by: syzbot+39ba34a099ac2e9bd3cb@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=39ba34a099ac2e9bd3cb Signed-off-by: Osama Muhammad Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 11c77757ead9..d55f0dd8d754 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2871,6 +2871,9 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) /* is the current value the same as the old value ? if so, * there is nothing to do. */ + if (WARN_ON_ONCE(lp >= CTLTREESIZE)) + return; + if (tp->dmt_stree[lp] == newval) return; From 27e56f59bab5ddafbcfe69ad7a4a6ea1279c1b16 Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Sat, 14 Oct 2023 00:10:28 +0500 Subject: [PATCH 0156/1562] UBSAN: array-index-out-of-bounds in dtSplitRoot Syzkaller reported the following issue: oop0: detected capacity change from 0 to 32768 UBSAN: array-index-out-of-bounds in fs/jfs/jfs_dtree.c:1971:9 index -2 is out of range for type 'struct dtslot [128]' CPU: 0 PID: 3613 Comm: syz-executor270 Not tainted 6.0.0-syzkaller-09423-g493ffd6605b2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:151 [inline] __ubsan_handle_out_of_bounds+0xdb/0x130 lib/ubsan.c:283 dtSplitRoot+0x8d8/0x1900 fs/jfs/jfs_dtree.c:1971 dtSplitUp fs/jfs/jfs_dtree.c:985 [inline] dtInsert+0x1189/0x6b80 fs/jfs/jfs_dtree.c:863 jfs_mkdir+0x757/0xb00 fs/jfs/namei.c:270 vfs_mkdir+0x3b3/0x590 fs/namei.c:4013 do_mkdirat+0x279/0x550 fs/namei.c:4038 __do_sys_mkdirat fs/namei.c:4053 [inline] __se_sys_mkdirat fs/namei.c:4051 [inline] __x64_sys_mkdirat+0x85/0x90 fs/namei.c:4051 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fcdc0113fd9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffeb8bc67d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000102 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fcdc0113fd9 RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000003 RBP: 00007fcdc00d37a0 R08: 0000000000000000 R09: 00007fcdc00d37a0 R10: 00005555559a72c0 R11: 0000000000000246 R12: 00000000f8008000 R13: 0000000000000000 R14: 00083878000000f8 R15: 0000000000000000 The issue is caused when the value of fsi becomes less than -1. The check to break the loop when fsi value becomes -1 is present but syzbot was able to produce value less than -1 which cause the error. This patch simply add the change for the values less than 0. The patch is tested via syzbot. Reported-and-tested-by: syzbot+d4b1df2e9d4ded6488ec@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=d4b1df2e9d4ded6488ec Signed-off-by: Osama Muhammad Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 92b7c533407c..f3d3e8b3f50c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -1970,7 +1970,7 @@ static int dtSplitRoot(tid_t tid, do { f = &rp->slot[fsi]; fsi = f->next; - } while (fsi != -1); + } while (fsi >= 0); f->next = n; } From fa5492ee89463a7590a1449358002ff7ef63529f Mon Sep 17 00:00:00 2001 From: Manas Ghandat Date: Wed, 25 Oct 2023 11:39:07 +0530 Subject: [PATCH 0157/1562] jfs: fix slab-out-of-bounds Read in dtSearch Currently while searching for current page in the sorted entry table of the page there is a out of bound access. Added a bound check to fix the error. Dave: Set return code to -EIO Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202310241724.Ed02yUz9-lkp@intel.com/ Signed-off-by: Manas Ghandat Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dtree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index f3d3e8b3f50c..031d8f570f58 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -633,6 +633,11 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); + if (stbl[index] < 0) { + rc = -EIO; + goto out; + } + if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = From 74ecdda68242b174920fe7c6133a856fb7d8559b Mon Sep 17 00:00:00 2001 From: Manas Ghandat Date: Tue, 17 Oct 2023 17:33:56 +0530 Subject: [PATCH 0158/1562] jfs: fix array-index-out-of-bounds in dbAdjTree Currently there is a bound check missing in the dbAdjTree while accessing the dmt_stree. To add the required check added the bool is_ctl which is required to determine the size as suggest in the following commit. https://lore.kernel.org/linux-kernel-mentees/f9475918-2186-49b8-b801-6f0f9e75f4fa@oracle.com/ Reported-by: syzbot+39ba34a099ac2e9bd3cb@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=39ba34a099ac2e9bd3cb Signed-off-by: Manas Ghandat Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 60 ++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index d55f0dd8d754..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -63,10 +63,10 @@ */ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); -static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); -static int dbBackSplit(dmtree_t * tp, int leafno); -static int dbJoin(dmtree_t * tp, int leafno, int newval); -static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl); +static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl); +static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl); +static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl); static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level); static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); @@ -2103,7 +2103,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * system. */ if (dp->tree.stree[word] == NOFREE) - dbBackSplit((dmtree_t *) & dp->tree, word); + dbBackSplit((dmtree_t *)&dp->tree, word, false); dbAllocBits(bmp, dp, blkno, nblocks); } @@ -2189,7 +2189,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the binary system of the leaves if need be. */ dbSplit(tp, word, BUDMIN, - dbMaxBud((u8 *) & dp->wmap[word])); + dbMaxBud((u8 *)&dp->wmap[word]), false); word += 1; } else { @@ -2229,7 +2229,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * system of the leaves to reflect the current * allocation (size). */ - dbSplit(tp, word, size, NOFREE); + dbSplit(tp, word, size, NOFREE, false); /* get the number of dmap words handled */ nw = BUDSIZE(size, BUDMIN); @@ -2336,7 +2336,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf for this dmap word. */ rc = dbJoin(tp, word, - dbMaxBud((u8 *) & dp->wmap[word])); + dbMaxBud((u8 *)&dp->wmap[word]), false); if (rc) return rc; @@ -2369,7 +2369,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf. */ - rc = dbJoin(tp, word, size); + rc = dbJoin(tp, word, size, false); if (rc) return rc; @@ -2521,16 +2521,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) * that it is at the front of a binary buddy system. */ if (oldval == NOFREE) { - rc = dbBackSplit((dmtree_t *) dcp, leafno); + rc = dbBackSplit((dmtree_t *)dcp, leafno, true); if (rc) { release_metapage(mp); return rc; } oldval = dcp->stree[ti]; } - dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true); } else { - rc = dbJoin((dmtree_t *) dcp, leafno, newval); + rc = dbJoin((dmtree_t *) dcp, leafno, newval, true); if (rc) { release_metapage(mp); return rc; @@ -2561,7 +2561,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) */ if (alloc) { dbJoin((dmtree_t *) dcp, leafno, - oldval); + oldval, true); } else { /* the dbJoin() above might have * caused a larger binary buddy system @@ -2571,9 +2571,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) */ if (dcp->stree[ti] == NOFREE) dbBackSplit((dmtree_t *) - dcp, leafno); + dcp, leafno, true); dbSplit((dmtree_t *) dcp, leafno, - dcp->budmin, oldval); + dcp->budmin, oldval, true); } /* release the buffer and return the error. @@ -2621,7 +2621,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ -static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl) { int budsz; int cursz; @@ -2643,7 +2643,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) while (cursz >= splitsz) { /* update the buddy's leaf with its new value. */ - dbAdjTree(tp, leafno ^ budsz, cursz); + dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl); /* on to the next size and buddy. */ @@ -2655,7 +2655,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) /* adjust the dmap tree to reflect the specified leaf's new * value. */ - dbAdjTree(tp, leafno, newval); + dbAdjTree(tp, leafno, newval, is_ctl); } @@ -2686,7 +2686,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ -static int dbBackSplit(dmtree_t * tp, int leafno) +static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) { int budsz, bud, w, bsz, size; int cursz; @@ -2737,7 +2737,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno) * system in two. */ cursz = leaf[bud] - 1; - dbSplit(tp, bud, cursz, cursz); + dbSplit(tp, bud, cursz, cursz, is_ctl); break; } } @@ -2765,7 +2765,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno) * * RETURN VALUES: none */ -static int dbJoin(dmtree_t * tp, int leafno, int newval) +static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { int budsz, buddy; s8 *leaf; @@ -2820,12 +2820,12 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) if (leafno < buddy) { /* leafno is the left buddy. */ - dbAdjTree(tp, buddy, NOFREE); + dbAdjTree(tp, buddy, NOFREE, is_ctl); } else { /* buddy is the left buddy and becomes * leafno. */ - dbAdjTree(tp, leafno, NOFREE); + dbAdjTree(tp, leafno, NOFREE, is_ctl); leafno = buddy; } @@ -2838,7 +2838,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) /* update the leaf value. */ - dbAdjTree(tp, leafno, newval); + dbAdjTree(tp, leafno, newval, is_ctl); return 0; } @@ -2859,21 +2859,23 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) * * RETURN VALUES: none */ -static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl) { int lp, pp, k; - int max; + int max, size; + + size = is_ctl ? CTLTREESIZE : TREESIZE; /* pick up the index of the leaf for this leafno. */ lp = leafno + le32_to_cpu(tp->dmt_leafidx); + if (WARN_ON_ONCE(lp >= size || lp < 0)) + return; + /* is the current value the same as the old value ? if so, * there is nothing to do. */ - if (WARN_ON_ONCE(lp >= CTLTREESIZE)) - return; - if (tp->dmt_stree[lp] == newval) return; From e0e1958f4c365e380b17ccb35617345b31ef7bf3 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 31 Oct 2023 13:39:04 +0800 Subject: [PATCH 0159/1562] jfs: fix uaf in jfs_evict_inode When the execution of diMount(ipimap) fails, the object ipimap that has been released may be accessed in diFreeSpecial(). Asynchronous ipimap release occurs when rcu_core() calls jfs_free_node(). Therefore, when diMount(ipimap) fails, sbi->ipimap should not be initialized as ipimap. Reported-and-tested-by: syzbot+01cf2dbcbe2022454388@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_mount.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 415eb65a36ff..9b5c6a20b30c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -172,15 +172,15 @@ int jfs_mount(struct super_block *sb) } jfs_info("jfs_mount: ipimap:0x%p", ipimap); - /* map further access of per fileset inodes by the fileset inode */ - sbi->ipimap = ipimap; - /* initialize fileset inode allocation map */ if ((rc = diMount(ipimap))) { jfs_err("jfs_mount: diMount failed w/rc = %d", rc); goto err_ipimap; } + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + return rc; /* From cca974daeb6c43ea971f8ceff5a7080d7d49ee30 Mon Sep 17 00:00:00 2001 From: Manas Ghandat Date: Wed, 11 Oct 2023 20:09:37 +0530 Subject: [PATCH 0160/1562] jfs: fix shift-out-of-bounds in dbJoin Currently while joining the leaf in a buddy system there is shift out of bound error in calculation of BUDSIZE. Added the required check to the BUDSIZE and fixed the documentation as well. Reported-by: syzbot+411debe54d318eaed386@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=411debe54d318eaed386 Signed-off-by: Manas Ghandat Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index cb3cda1390ad..8eec84c651bf 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,7 +2763,9 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: none + * RETURN VALUES: + * 0 - success + * -EIO - i/o error */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2790,6 +2792,10 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ + + if ((newval - tp->dmt_budmin) > BUDMIN) + return -EIO; + budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. From 1957b92aaff0fa71621e61bbd0257b9c3bb9baf2 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Tue, 21 Nov 2023 18:09:00 -0500 Subject: [PATCH 0161/1562] regmap: fix regmap_noinc_write() description Change "Write data from" -> "Write data to". Signed-off-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20231121230900.3754785-1-hugo@hugovil.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index ea6157747199..6db77d8e45f9 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2136,7 +2136,7 @@ static int regmap_noinc_readwrite(struct regmap *map, unsigned int reg, } /** - * regmap_noinc_write(): Write data from a register without incrementing the + * regmap_noinc_write(): Write data to a register without incrementing the * register number * * @map: Register map to write to From ae254858ce0745aba25d107159b580ab5fdada5b Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Tue, 14 Nov 2023 16:51:16 +0100 Subject: [PATCH 0162/1562] selinux: introduce an initial SID for early boot processes Currently, SELinux doesn't allow distinguishing between kernel threads and userspace processes that are started before the policy is first loaded - both get the label corresponding to the kernel SID. The only way a process that persists from early boot can get a meaningful label is by doing a voluntary dyntransition or re-executing itself. Reusing the kernel label for userspace processes is problematic for several reasons: 1. The kernel is considered to be a privileged domain and generally needs to have a wide range of permissions allowed to work correctly, which prevents the policy writer from effectively hardening against early boot processes that might remain running unintentionally after the policy is loaded (they represent a potential extra attack surface that should be mitigated). 2. Despite the kernel being treated as a privileged domain, the policy writer may want to impose certain special limitations on kernel threads that may conflict with the requirements of intentional early boot processes. For example, it is a good hardening practice to limit what executables the kernel can execute as usermode helpers and to confine the resulting usermode helper processes. However, a (legitimate) process surviving from early boot may need to execute a different set of executables. 3. As currently implemented, overlayfs remembers the security context of the process that created an overlayfs mount and uses it to bound subsequent operations on files using this context. If an overlayfs mount is created before the SELinux policy is loaded, these "mounter" checks are made against the kernel context, which may clash with restrictions on the kernel domain (see 2.). To resolve this, introduce a new initial SID (reusing the slot of the former "init" initial SID) that will be assigned to any userspace process started before the policy is first loaded. This is easy to do, as we can simply label any process that goes through the bprm_creds_for_exec LSM hook with the new init-SID instead of propagating the kernel SID from the parent. To provide backwards compatibility for existing policies that are unaware of this new semantic of the "init" initial SID, introduce a new policy capability "userspace_initial_context" and set the "init" SID to the same context as the "kernel" SID unless this capability is set by the policy. Another small backwards compatibility measure is needed in security_sid_to_context_core() for before the initial SELinux policy load - see the code comment for explanation. Signed-off-by: Ondrej Mosnacek Reviewed-by: Stephen Smalley [PM: edited comments based on feedback/discussion] Signed-off-by: Paul Moore --- security/selinux/hooks.c | 28 +++++++++++++++++++ .../selinux/include/initial_sid_to_string.h | 2 +- security/selinux/include/policycap.h | 1 + security/selinux/include/policycap_names.h | 1 + security/selinux/include/security.h | 6 ++++ security/selinux/ss/policydb.c | 27 ++++++++++++++++++ security/selinux/ss/services.c | 13 ++++++++- 7 files changed, 76 insertions(+), 2 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index feda711c6b7b..855589b64641 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2315,6 +2315,19 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) new_tsec->keycreate_sid = 0; new_tsec->sockcreate_sid = 0; + /* + * Before policy is loaded, label any task outside kernel space + * as SECINITSID_INIT, so that any userspace tasks surviving from + * early boot end up with a label different from SECINITSID_KERNEL + * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL). + */ + if (!selinux_initialized()) { + new_tsec->sid = SECINITSID_INIT; + /* also clear the exec_sid just in case */ + new_tsec->exec_sid = 0; + return 0; + } + if (old_tsec->exec_sid) { new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ @@ -4553,6 +4566,21 @@ static int sock_has_perm(struct sock *sk, u32 perms) if (sksec->sid == SECINITSID_KERNEL) return 0; + /* + * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that + * inherited the kernel context from early boot used to be skipped + * here, so preserve that behavior unless the capability is set. + * + * By setting the capability the policy signals that it is ready + * for this quirk to be fixed. Note that sockets created by a kernel + * thread or a usermode helper executed without a transition will + * still be skipped in this check regardless of the policycap + * setting. + */ + if (!selinux_policycap_userspace_initial_context() && + sksec->sid == SECINITSID_INIT) + return 0; + ad_net_init_from_sk(&ad, &net, sk); return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms, diff --git a/security/selinux/include/initial_sid_to_string.h b/security/selinux/include/initial_sid_to_string.h index ecc6e74fa09b..5e5f0993dac2 100644 --- a/security/selinux/include/initial_sid_to_string.h +++ b/security/selinux/include/initial_sid_to_string.h @@ -10,7 +10,7 @@ static const char *const initial_sid_to_string[] = { NULL, "file", NULL, - NULL, + "init", "any_socket", "port", "netif", diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index f35d3458e71d..c7373e6effe5 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -12,6 +12,7 @@ enum { POLICYDB_CAP_NNP_NOSUID_TRANSITION, POLICYDB_CAP_GENFS_SECLABEL_SYMLINKS, POLICYDB_CAP_IOCTL_SKIP_CLOEXEC, + POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index 49bbe120d173..28e4c9ee2399 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -14,6 +14,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "nnp_nosuid_transition", "genfs_seclabel_symlinks", "ioctl_skip_cloexec", + "userspace_initial_context", }; #endif /* _SELINUX_POLICYCAP_NAMES_H_ */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index a9de89af8fdc..074d439fe9ad 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -189,6 +189,12 @@ static inline bool selinux_policycap_ioctl_skip_cloexec(void) selinux_state.policycap[POLICYDB_CAP_IOCTL_SKIP_CLOEXEC]); } +static inline bool selinux_policycap_userspace_initial_context(void) +{ + return READ_ONCE( + selinux_state.policycap[POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT]); +} + struct selinux_policy_convert_data; struct selinux_load_state { diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index bd1e7f26d951..3b19ad28c922 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -857,6 +857,8 @@ void policydb_destroy(struct policydb *p) int policydb_load_isids(struct policydb *p, struct sidtab *s) { struct ocontext *head, *c; + bool isid_init_supported = ebitmap_get_bit(&p->policycaps, + POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT); int rc; rc = sidtab_init(s); @@ -880,6 +882,13 @@ int policydb_load_isids(struct policydb *p, struct sidtab *s) if (!name) continue; + /* + * Also ignore SECINITSID_INIT if the policy doesn't declare + * support for it + */ + if (sid == SECINITSID_INIT && !isid_init_supported) + continue; + rc = sidtab_set_initial(s, sid, &c->context[0]); if (rc) { pr_err("SELinux: unable to load initial SID %s.\n", @@ -887,6 +896,24 @@ int policydb_load_isids(struct policydb *p, struct sidtab *s) sidtab_destroy(s); return rc; } + + /* + * If the policy doesn't support the "userspace_initial_context" + * capability, set SECINITSID_INIT to the same context as + * SECINITSID_KERNEL. This ensures the same behavior as before + * the reintroduction of SECINITSID_INIT, where all tasks + * started before policy load would initially get the context + * corresponding to SECINITSID_KERNEL. + */ + if (sid == SECINITSID_KERNEL && !isid_init_supported) { + rc = sidtab_set_initial(s, SECINITSID_INIT, &c->context[0]); + if (rc) { + pr_err("SELinux: unable to load initial SID %s.\n", + name); + sidtab_destroy(s); + return rc; + } + } } return 0; } diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 1eeffc66ea7d..e88b1b6c4adb 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1322,8 +1322,19 @@ static int security_sid_to_context_core(u32 sid, char **scontext, if (!selinux_initialized()) { if (sid <= SECINITSID_NUM) { char *scontextp; - const char *s = initial_sid_to_string[sid]; + const char *s; + /* + * Before the policy is loaded, translate + * SECINITSID_INIT to "kernel", because systemd and + * libselinux < 2.6 take a getcon_raw() result that is + * both non-null and not "kernel" to mean that a policy + * is already loaded. + */ + if (sid == SECINITSID_INIT) + sid = SECINITSID_KERNEL; + + s = initial_sid_to_string[sid]; if (!s) return -EINVAL; *scontext_len = strlen(s) + 1; From 9bb6362652f3f4d74a87d572a91ee1b38e673ef6 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Wed, 25 Oct 2023 23:39:07 +0200 Subject: [PATCH 0163/1562] debugobjects: Stop accessing objects after releasing hash bucket lock After release of the hashbucket lock the tracking object can be modified or freed by a concurrent thread. Using it in such a case is error prone, even for printing the object state: 1. T1 tries to deactivate destroyed object, debugobjects detects it, hash bucket lock is released. 2. T2 preempts T1 and frees the tracking object. 3. The freed tracking object is allocated and initialized for a different to be tracked kernel object. 4. T1 resumes and reports error for wrong kernel object. Create a local copy of the tracking object before releasing the hash bucket lock and use the local copy for reporting and fixups to prevent this. Signed-off-by: Andrzej Hajda Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231025-debugobjects_fix-v3-1-2bc3bf7084c2@intel.com --- lib/debugobjects.c | 204 ++++++++++++++++++--------------------------- 1 file changed, 80 insertions(+), 124 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2a8e9d63fbe3..fb12a9bacd2f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -620,9 +620,8 @@ static void debug_objects_fill_pool(void) static void __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags; debug_objects_fill_pool(); @@ -643,24 +642,18 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_INIT; - break; - - case ODEBUG_STATE_ACTIVE: - state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "init"); - debug_object_fixup(descr->fixup_init, addr, state); - return; - - case ODEBUG_STATE_DESTROYED: - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "init"); return; default: break; } + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(&o, "init"); + + if (o.state == ODEBUG_STATE_ACTIVE) + debug_object_fixup(descr->fixup_init, addr, o.state); } /** @@ -701,11 +694,9 @@ EXPORT_SYMBOL_GPL(debug_object_init_on_stack); int debug_object_activate(void *addr, const struct debug_obj_descr *descr) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; - enum debug_obj_state state; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - int ret; if (!debug_objects_enabled) return 0; @@ -717,49 +708,38 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags); obj = lookup_object_or_alloc(addr, db, descr, false, true); - if (likely(!IS_ERR_OR_NULL(obj))) { - bool print_object = false; - + if (unlikely(!obj)) { + raw_spin_unlock_irqrestore(&db->lock, flags); + debug_objects_oom(); + return 0; + } else if (likely(!IS_ERR(obj))) { switch (obj->state) { + case ODEBUG_STATE_ACTIVE: + case ODEBUG_STATE_DESTROYED: + o = *obj; + break; case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_ACTIVE; - ret = 0; - break; - - case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, state); - return ret ? 0 : -EINVAL; - - case ODEBUG_STATE_DESTROYED: - print_object = true; - ret = -EINVAL; - break; + fallthrough; default: - ret = 0; - break; + raw_spin_unlock_irqrestore(&db->lock, flags); + return 0; } - raw_spin_unlock_irqrestore(&db->lock, flags); - if (print_object) - debug_print_object(obj, "activate"); - return ret; } raw_spin_unlock_irqrestore(&db->lock, flags); - - /* If NULL the allocation has hit OOM */ - if (!obj) { - debug_objects_oom(); - return 0; - } - - /* Object is neither static nor tracked. It's not initialized */ debug_print_object(&o, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, ODEBUG_STATE_NOTAVAILABLE); - return ret ? 0 : -EINVAL; + + switch (o.state) { + case ODEBUG_STATE_ACTIVE: + case ODEBUG_STATE_NOTAVAILABLE: + if (debug_object_fixup(descr->fixup_activate, addr, o.state)) + return 0; + fallthrough; + default: + return -EINVAL; + } } EXPORT_SYMBOL_GPL(debug_object_activate); @@ -770,10 +750,10 @@ EXPORT_SYMBOL_GPL(debug_object_activate); */ void debug_object_deactivate(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - bool print_object = false; if (!debug_objects_enabled) return; @@ -785,33 +765,24 @@ void debug_object_deactivate(void *addr, const struct debug_obj_descr *descr) obj = lookup_object(addr, db); if (obj) { switch (obj->state) { + case ODEBUG_STATE_DESTROYED: + break; case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: case ODEBUG_STATE_ACTIVE: - if (!obj->astate) - obj->state = ODEBUG_STATE_INACTIVE; - else - print_object = true; - break; - - case ODEBUG_STATE_DESTROYED: - print_object = true; - break; + if (obj->astate) + break; + obj->state = ODEBUG_STATE_INACTIVE; + fallthrough; default: - break; + raw_spin_unlock_irqrestore(&db->lock, flags); + return; } + o = *obj; } raw_spin_unlock_irqrestore(&db->lock, flags); - if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - debug_print_object(&o, "deactivate"); - } else if (print_object) { - debug_print_object(obj, "deactivate"); - } + debug_print_object(&o, "deactivate"); } EXPORT_SYMBOL_GPL(debug_object_deactivate); @@ -822,11 +793,9 @@ EXPORT_SYMBOL_GPL(debug_object_deactivate); */ void debug_object_destroy(void *addr, const struct debug_obj_descr *descr) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags; - bool print_object = false; if (!debug_objects_enabled) return; @@ -836,32 +805,31 @@ void debug_object_destroy(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); - if (!obj) - goto out_unlock; + if (!obj) { + raw_spin_unlock_irqrestore(&db->lock, flags); + return; + } switch (obj->state) { + case ODEBUG_STATE_ACTIVE: + case ODEBUG_STATE_DESTROYED: + break; case ODEBUG_STATE_NONE: case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_DESTROYED; - break; - case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "destroy"); - debug_object_fixup(descr->fixup_destroy, addr, state); - return; - - case ODEBUG_STATE_DESTROYED: - print_object = true; - break; + fallthrough; default: - break; + raw_spin_unlock_irqrestore(&db->lock, flags); + return; } -out_unlock: + + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); - if (print_object) - debug_print_object(obj, "destroy"); + debug_print_object(&o, "destroy"); + + if (o.state == ODEBUG_STATE_ACTIVE) + debug_object_fixup(descr->fixup_destroy, addr, o.state); } EXPORT_SYMBOL_GPL(debug_object_destroy); @@ -872,9 +840,8 @@ EXPORT_SYMBOL_GPL(debug_object_destroy); */ void debug_object_free(void *addr, const struct debug_obj_descr *descr) { - enum debug_obj_state state; + struct debug_obj *obj, o; struct debug_bucket *db; - struct debug_obj *obj; unsigned long flags; if (!debug_objects_enabled) @@ -885,24 +852,26 @@ void debug_object_free(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); - if (!obj) - goto out_unlock; + if (!obj) { + raw_spin_unlock_irqrestore(&db->lock, flags); + return; + } switch (obj->state) { case ODEBUG_STATE_ACTIVE: - state = obj->state; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "free"); - debug_object_fixup(descr->fixup_free, addr, state); - return; + break; default: hlist_del(&obj->node); raw_spin_unlock_irqrestore(&db->lock, flags); free_object(obj); return; } -out_unlock: + + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(&o, "free"); + + debug_object_fixup(descr->fixup_free, addr, o.state); } EXPORT_SYMBOL_GPL(debug_object_free); @@ -954,10 +923,10 @@ void debug_object_active_state(void *addr, const struct debug_obj_descr *descr, unsigned int expect, unsigned int next) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; - bool print_object = false; if (!debug_objects_enabled) return; @@ -970,28 +939,19 @@ debug_object_active_state(void *addr, const struct debug_obj_descr *descr, if (obj) { switch (obj->state) { case ODEBUG_STATE_ACTIVE: - if (obj->astate == expect) - obj->astate = next; - else - print_object = true; - break; - + if (obj->astate != expect) + break; + obj->astate = next; + raw_spin_unlock_irqrestore(&db->lock, flags); + return; default: - print_object = true; break; } + o = *obj; } raw_spin_unlock_irqrestore(&db->lock, flags); - if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - debug_print_object(&o, "active_state"); - } else if (print_object) { - debug_print_object(obj, "active_state"); - } + debug_print_object(&o, "active_state"); } EXPORT_SYMBOL_GPL(debug_object_active_state); @@ -999,12 +959,10 @@ EXPORT_SYMBOL_GPL(debug_object_active_state); static void __debug_check_no_obj_freed(const void *address, unsigned long size) { unsigned long flags, oaddr, saddr, eaddr, paddr, chunks; - const struct debug_obj_descr *descr; - enum debug_obj_state state; + int cnt, objs_checked = 0; + struct debug_obj *obj, o; struct debug_bucket *db; struct hlist_node *tmp; - struct debug_obj *obj; - int cnt, objs_checked = 0; saddr = (unsigned long) address; eaddr = saddr + size; @@ -1026,12 +984,10 @@ repeat: switch (obj->state) { case ODEBUG_STATE_ACTIVE: - descr = obj->descr; - state = obj->state; + o = *obj; raw_spin_unlock_irqrestore(&db->lock, flags); - debug_print_object(obj, "free"); - debug_object_fixup(descr->fixup_free, - (void *) oaddr, state); + debug_print_object(&o, "free"); + debug_object_fixup(o.descr->fixup_free, (void *)oaddr, o.state); goto repeat; default: hlist_del(&obj->node); From a89299c40911ee29c6ec4fb66f9c598cd947265b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:25 +0100 Subject: [PATCH 0164/1562] time: Make sysfs_get_uname() function visible in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is defined globally in clocksource.c and used conditionally in clockevent.c, which the declaration hidden when clockevent support is disabled. This causes a harmless warning in the definition: kernel/time/clocksource.c:1324:9: warning: no previous prototype for 'sysfs_get_uname' [-Wmissing-prototypes] 1324 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) Move the declaration out of the #ifdef so it is always visible. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231108125843.3806765-5-arnd@kernel.org --- kernel/time/tick-internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 649f2b48e8f0..481b7ab65e2c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); -extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* Broadcasting support */ # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST @@ -197,3 +196,5 @@ void hrtimers_resume_local(void); #else #define JIFFIES_SHIFT 8 #endif + +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); From a2ee7581afd59015b8f9ae01fad131aed9f26f01 Mon Sep 17 00:00:00 2001 From: Jeff Brasen Date: Fri, 10 Nov 2023 00:03:21 +0530 Subject: [PATCH 0165/1562] ACPI: thermal: Add Thermal fast Sampling Period (_TFP) support Add support of "Thermal fast Sampling Period (_TFP)" for passive cooling. As per the ACPI specification (ACPI 6.5, Section 11.4.17 "_TFP (Thermal fast Sampling Period)", _TFP overrides _TSP ("Thermal Sampling Period" if both are present in a Thermal zone. Signed-off-by: Jeff Brasen Co-developed-by: Sumit Gupta Signed-off-by: Sumit Gupta [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 6c29a266dbd0..ee28ca93d983 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -92,7 +92,7 @@ struct acpi_thermal_passive { struct acpi_thermal_trip trip; unsigned long tc1; unsigned long tc2; - unsigned long tsp; + unsigned long delay; }; struct acpi_thermal_active { @@ -396,11 +396,17 @@ static bool passive_trip_params_init(struct acpi_thermal *tz) tz->trips.passive.tc2 = tmp; + status = acpi_evaluate_integer(tz->device->handle, "_TFP", NULL, &tmp); + if (ACPI_SUCCESS(status)) { + tz->trips.passive.delay = tmp; + return true; + } + status = acpi_evaluate_integer(tz->device->handle, "_TSP", NULL, &tmp); if (ACPI_FAILURE(status)) return false; - tz->trips.passive.tsp = tmp; + tz->trips.passive.delay = tmp * 100; return true; } @@ -896,7 +902,7 @@ static int acpi_thermal_add(struct acpi_device *device) acpi_trip = &tz->trips.passive.trip; if (acpi_thermal_trip_valid(acpi_trip)) { - passive_delay = tz->trips.passive.tsp * 100; + passive_delay = tz->trips.passive.delay; trip->type = THERMAL_TRIP_PASSIVE; trip->temperature = acpi_thermal_temp(tz, acpi_trip->temp_dk); From 8a399e2f60037ed07a55278e39b20e43dea4f0c2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:24 +0000 Subject: [PATCH 0166/1562] slub: Keep track of whether slub is on the per-node partial list Now we rely on the "frozen" bit to see if we should manipulate the slab->slab_list, which will be changed in the following patch. Instead we introduce another way to keep track of whether slub is on the per-node partial list, here we reuse the PG_workingset bit. We have to use the atomic set_bit() and clear_bit() variants and change slab_unlock() to bit_spin_unlock() because when cmpxchg is not available and PG_lock is used, there may be concurrent operations on the two bits. Thanks to Mark Brown for reporting a hang and testing of a previous version where the non-atomic operations were used. Suggested-by: Matthew Wilcox Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 03384cd965c5..6efcbf79fd2d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -522,7 +522,7 @@ static __always_inline void slab_unlock(struct slab *slab) struct page *page = slab_page(slab); VM_BUG_ON_PAGE(PageTail(page), page); - __bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &page->flags); } static inline bool @@ -2116,6 +2116,25 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) free_slab(s, slab); } +/* + * SLUB reuses PG_workingset bit to keep track of whether it's on + * the per-node partial list. + */ +static inline bool slab_test_node_partial(const struct slab *slab) +{ + return folio_test_workingset((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_node_partial(struct slab *slab) +{ + set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + +static inline void slab_clear_node_partial(struct slab *slab) +{ + clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + /* * Management of partially allocated slabs. */ @@ -2127,6 +2146,7 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); + slab_set_node_partial(slab); } static inline void add_partial(struct kmem_cache_node *n, @@ -2141,6 +2161,7 @@ static inline void remove_partial(struct kmem_cache_node *n, { lockdep_assert_held(&n->list_lock); list_del(&slab->slab_list); + slab_clear_node_partial(slab); n->nr_partial--; } @@ -4833,6 +4854,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); + slab_clear_node_partial(slab); n->nr_partial--; dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) From 4c2ba6a0ed1944c17b957157bd1e686be4ea968a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Sep 2023 14:58:02 +0200 Subject: [PATCH 0167/1562] ACPI: processor: Provide empty stub of acpi_proc_quirk_mwait_check() Commit 0a0e2ea642f6 ("ACPI: processor: Move MWAIT quirk out of acpi_processor.c") added acpi_proc_quirk_mwait_check() that is only defined for x86 and is unlikely to be defined for any other architectures, so put it under #ifdef CONFIG_X86 and provide an empty stub implementation of it for the other cases. This is kind of orthogonal to [1], because if any architectures other than x86 decide to use the processor _OSC, they will see the reported build error. Link: https://lore.kernel.org/lkml/c7a05a44-c0be-46c2-a21d-b242524d482b@roeck-us.net Link: https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?h=remove-ia64&id=a0334bf78b95532cec54f56b53e8ae1bfe7e1ca1 # [1] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 866c7c4ed233..db666f13c2ef 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -148,8 +148,11 @@ int acpi_wakeup_device_init(void); #ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC void acpi_early_processor_control_setup(void); void acpi_early_processor_set_pdc(void); - +#ifdef CONFIG_X86 void acpi_proc_quirk_mwait_check(void); +#else +static inline void acpi_proc_quirk_mwait_check(void) {} +#endif bool processor_physically_present(acpi_handle handle); #else static inline void acpi_early_processor_control_setup(void) {} From 80b4ff1d2c9bc7e20b82d18535a27fa32dffa1dd Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 22 Nov 2023 11:02:44 -0500 Subject: [PATCH 0168/1562] selftests: remove the LSM_ID_IMA check in lsm/lsm_list_modules_test The IMA LSM ID token was removed as IMA isn't yet a proper LSM, but we forgot to remove the check from the selftest. Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202311221047.a9Dww3vY-lkp@intel.com/ Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- tools/testing/selftests/lsm/lsm_list_modules_test.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c index 445c02f09c74..9df29b1e3497 100644 --- a/tools/testing/selftests/lsm/lsm_list_modules_test.c +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -101,9 +101,6 @@ TEST(correct_lsm_list_modules) case LSM_ID_TOMOYO: name = "tomoyo"; break; - case LSM_ID_IMA: - name = "ima"; - break; case LSM_ID_APPARMOR: name = "apparmor"; break; From 56d2eeda87995245300836ee4dbd13b002311782 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Thu, 9 Nov 2023 21:08:59 +0300 Subject: [PATCH 0169/1562] ACPI: LPIT: Avoid u32 multiplication overflow In lpit_update_residency() there is a possibility of overflow in multiplication, if tsc_khz is large enough (> UINT_MAX/1000). Change multiplication to mul_u32_u32(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: eeb2d80d502a ("ACPI / LPIT: Add Low Power Idle Table (LPIT) support") Signed-off-by: Nikita Kiryushin Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_lpit.c b/drivers/acpi/acpi_lpit.c index c5598b6d5db8..794962c5c88e 100644 --- a/drivers/acpi/acpi_lpit.c +++ b/drivers/acpi/acpi_lpit.c @@ -105,7 +105,7 @@ static void lpit_update_residency(struct lpit_residency_info *info, return; info->frequency = lpit_native->counter_frequency ? - lpit_native->counter_frequency : tsc_khz * 1000; + lpit_native->counter_frequency : mul_u32_u32(tsc_khz, 1000U); if (!info->frequency) info->frequency = 1; From 4c58e9d85c24b5281a2d39a3e6510b5f3b7fc687 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 1 Nov 2023 09:45:00 -0500 Subject: [PATCH 0170/1562] opp: ti: Use device_get_match_data() Use preferred device_get_match_data() instead of of_match_device() to get the driver match data. With this, adjust the includes to explicitly include the correct headers. As this driver only does DT based matching, of_match_device() will never return NULL if we've gotten to probe(). Therefore, the NULL check and error return for it can be dropped. Signed-off-by: Rob Herring Signed-off-by: Viresh Kumar --- drivers/opp/ti-opp-supply.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/opp/ti-opp-supply.c b/drivers/opp/ti-opp-supply.c index 8f3f13fbbb25..e3b97cd1fbbf 100644 --- a/drivers/opp/ti-opp-supply.c +++ b/drivers/opp/ti-opp-supply.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -373,23 +374,15 @@ static int ti_opp_supply_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct device *cpu_dev = get_cpu_device(0); - const struct of_device_id *match; const struct ti_opp_supply_of_data *of_data; int ret = 0; - match = of_match_device(ti_opp_supply_of_match, dev); - if (!match) { - /* We do not expect this to happen */ - dev_err(dev, "%s: Unable to match device\n", __func__); - return -ENODEV; - } - if (!match->data) { + of_data = device_get_match_data(dev); + if (!of_data) { /* Again, unlikely.. but mistakes do happen */ dev_err(dev, "%s: Bad data in match\n", __func__); return -EINVAL; } - of_data = match->data; - dev_set_drvdata(dev, (void *)of_data); /* If we need optimized voltage */ From 50181c0cff31281b9f1071575ffba8a102375ece Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 15:01:19 +0100 Subject: [PATCH 0171/1562] sched/pelt: Avoid underestimation of task utilization Lukasz Luba reported that a thread's util_est can significantly decrease as a result of sharing the CPU with other threads. The use case can be easily reproduced with a periodic task TA that runs 1ms and sleeps 100us. When the task is alone on the CPU, its max utilization and its util_est is around 888. If another similar task starts to run on the same CPU, TA will have to share the CPU runtime and its maximum utilization will decrease around half the CPU capacity (512) then TA's util_est will follow this new maximum trend which is only the result of sharing the CPU with others tasks. Such situation can be detected with runnable_avg wich is close or equal to util_avg when TA is alone, but increases above util_avg when TA shares the CPU with other threads and wait on the runqueue. [ We prefer an util_est that overestimate rather than under estimate because in 1st case we will not provide enough performance to the task which will remain under-provisioned, whereas in the other case we will create some idle time which will enable to reduce contention and as a result reduces the util_est so the overestimate will be transient whereas the underestimate will remain. ] [ mingo: Refined the changelog, added comments from the LKML discussion. ] Reported-by: Lukasz Luba Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/lkml/CAKfTPtDd-HhF-YiNTtL9i5k0PfJbF819Yxu4YquzfXgwi7voyw@mail.gmail.com/#t Link: https://lore.kernel.org/r/20231122140119.472110-1-vincent.guittot@linaro.org Cc: Hongyan Xia --- kernel/sched/fair.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 07f555857698..53dea95ad8c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4774,6 +4774,11 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } +static inline unsigned long task_runnable(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.runnable_avg); +} + static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); @@ -4892,6 +4897,14 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; + /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + /* * Update Task's estimated utilization * From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 14:39:03 +0100 Subject: [PATCH 0172/1562] sched/cpufreq: Rework schedutil governor performance estimation The current method to take into account uclamp hints when estimating the target frequency can end in a situation where the selected target frequency is finally higher than uclamp hints, whereas there are no real needs. Such cases mainly happen because we are currently mixing the traditional scheduler utilization signal with the uclamp performance hints. By adding these 2 metrics, we loose an important information when it comes to select the target frequency, and we have to make some assumptions which can't fit all cases. Rework the interface between the scheduler and schedutil governor in order to propagate all information down to the cpufreq governor. effective_cpu_util() interface changes and now returns the actual utilization of the CPU with 2 optional inputs: - The minimum performance for this CPU; typically the capacity to handle the deadline task and the interrupt pressure. But also uclamp_min request when available. - The maximum targeting performance for this CPU which reflects the maximum level that we would like to not exceed. By default it will be the CPU capacity but can be reduced because of some performance hints set with uclamp. The value can be lower than actual utilization and/or min performance level. A new sugov_effective_cpu_perf() interface is also available to compute the final performance level that is targeted for the CPU, after applying some cpufreq headroom and taking into account all inputs. With these 2 functions, schedutil is now able to decide when it must go above uclamp hints. It now also has a generic way to get the min performance level. The dependency between energy model and cpufreq governor and its headroom policy doesn't exist anymore. eenv_pd_max_util() asks schedutil for the targeted performance after applying the impact of the waking task. [ mingo: Refined the changelog & C comments. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org --- include/linux/energy_model.h | 1 - kernel/sched/core.c | 90 ++++++++++++++------------------ kernel/sched/cpufreq_schedutil.c | 35 +++++++++---- kernel/sched/fair.c | 22 ++++++-- kernel/sched/sched.h | 24 +++------ 5 files changed, 89 insertions(+), 83 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b9caa01dfac4..adec808b371a 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, scale_cpu = arch_scale_cpu_capacity(cpu); ps = &pd->table[pd->nr_perf_states - 1]; - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); freq = map_util_freq(max_util, ps->frequency, scale_cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2de77a6d5ef8..db4be4921e7f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7467,18 +7467,13 @@ int sched_core_idle_cpu(int cpu) * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p) + unsigned long *min, + unsigned long *max) { - unsigned long dl_util, util, irq, max; + unsigned long util, irq, scale; struct rq *rq = cpu_rq(cpu); - max = arch_scale_cpu_capacity(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } + scale = arch_scale_cpu_capacity(cpu); /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -7486,45 +7481,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * update_irq_load_avg(). */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } /* * Because the time spend on RT/DL tasks is visible as 'lost' time to * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. */ util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); + util += cpu_util_dl(rq); /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. */ - if (util + dl_util >= max) - return max; + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; + if (util >= scale) + return scale; /* * There is still idle time; further improve the number by using the @@ -7535,28 +7534,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * U' = irq + --------- * U * max */ - util = scale_irq_capacity(util, irq, max); + util = scale_irq_capacity(util, irq, scale); util += irq; - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return min(scale, util); } unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5888176354e2..f3acf2cf26ed 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,7 +47,7 @@ struct sugov_cpu { u64 last_update; unsigned long util; - unsigned long bw_dl; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -143,7 +143,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - util = map_util_perf(util); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) @@ -153,14 +152,30 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) +{ + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; + + /* + * Ensure at least minimum performance while providing more compute + * capacity when possible. + */ + return max(min, max); +} + static void sugov_get_util(struct sugov_cpu *sg_cpu) { - unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); - struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, - FREQUENCY_UTIL, NULL); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -306,7 +321,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) sg_cpu->sg_policy->limits_changed = true; } @@ -407,8 +422,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), max_cap); + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 53dea95ad8c9..34fe6e9490c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,7 +7793,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, for_each_cpu(cpu, pd_cpus) { unsigned long util = cpu_util(cpu, p, -1, 0); - busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } eenv->pd_busy_time = min(eenv->pd_cap, busy_time); @@ -7816,7 +7816,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; unsigned long util = cpu_util(cpu, p, dst_cpu, 1); - unsigned long eff_util; + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping @@ -7825,7 +7825,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util = max(max_util, eff_util); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8a70d51ffa33..c1574cd388e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2994,24 +2994,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -/** - * enum cpu_util_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency - * @ENERGY_UTIL: Utilization used during energy calculation - * - * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time - * need to be aggregated differently depending on the usage made of them. This - * enum is used within effective_cpu_util() to differentiate the types of - * utilization expected by the callers, and adjust the aggregation accordingly. - */ -enum cpu_util_type { - FREQUENCY_UTIL, - ENERGY_UTIL, -}; - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p); + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max); + /* * Verify the fitness of task @p to run on @cpu taking into account the From f12560779f9d734446508f3df17f5632e9aaa2c8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 14:39:04 +0100 Subject: [PATCH 0173/1562] sched/cpufreq: Rework iowait boost Use the max value that has already been computed inside sugov_get_util() to cap the iowait boost and remove dependency with uclamp_rq_util_with() which is not used anymore. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-3-vincent.guittot@linaro.org --- kernel/sched/cpufreq_schedutil.c | 29 ++++++++------- kernel/sched/sched.h | 60 -------------------------------- 2 files changed, 14 insertions(+), 75 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f3acf2cf26ed..4ee8ad70be99 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -169,11 +169,12 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, return max(min, max); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); sg_cpu->bw_min = min; sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } @@ -266,18 +267,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap) { - unsigned long boost; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return 0; if (!sg_cpu->iowait_boost_pending) { /* @@ -286,7 +285,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return 0; } } @@ -296,10 +295,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON @@ -329,6 +325,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap, unsigned int flags) { + unsigned long boost; + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -337,8 +335,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) return false; - sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time, max_cap); + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); return true; } @@ -439,9 +437,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); + unsigned long boost; - sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time, max_cap); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); util = max(j_sg_cpu->util, util); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c1574cd388e7..e58a54bda77d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3058,59 +3058,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (uclamp_rq_is_idle(rq)) - goto out; - } - - min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)); - max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - /* Is the rq being capped/throttled by uclamp_max? */ static inline bool uclamp_rq_is_capped(struct rq *rq) { @@ -3148,13 +3095,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p, return SCHED_CAPACITY_SCALE; } -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } static inline bool uclamp_is_used(void) From 388a1fb7da6aaa1970c7e2a7d7fcd983a87a8484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Nov 2023 11:07:56 +0100 Subject: [PATCH 0174/1562] perf: Fix the nr_addr_filters fix Thomas reported that commit 652ffc2104ec ("perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file") made the entire attribute group vanish, instead of only the nr_addr_filters attribute. Additionally a stray return. Insufficient coffee was involved with both writing and merging the patch. Fixes: 652ffc2104ec ("perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file") Reported-by: Thomas Richter Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Richter Link: https://lkml.kernel.org/r/20231122100756.GP8262@noisy.programming.kicks-ass.net --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f0c45ab8d7d..59b332cce9e7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11417,12 +11417,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); - if (!pmu->nr_addr_filters) + if (n == 2 && !pmu->nr_addr_filters) return 0; return a->mode; - - return 0; } static struct attribute_group pmu_dev_attr_group = { From b14b2d56168c1bcf00fccb5a2fe746e64ed970cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 07:59:57 +0100 Subject: [PATCH 0175/1562] ACPI: thermal_lib: include "internal.h" for function prototypes The newly added functions are declared in a header that is not included before the definition: drivers/acpi/thermal_lib.c:46:5: error: no previous prototype for 'acpi_active_trip_temp' [-Werror=missing-prototypes] 46 | int acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp) | ^~~~~~~~~~~~~~~~~~~~~ drivers/acpi/thermal_lib.c:57:5: error: no previous prototype for 'acpi_passive_trip_temp' [-Werror=missing-prototypes] 57 | int acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp) | ^~~~~~~~~~~~~~~~~~~~~~ drivers/acpi/thermal_lib.c:63:5: error: no previous prototype for 'acpi_hot_trip_temp' [-Werror=missing-prototypes] 63 | int acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp) | ^~~~~~~~~~~~~~~~~~ drivers/acpi/thermal_lib.c:69:5: error: no previous prototype for 'acpi_critical_trip_temp' [-Werror=missing-prototypes] 69 | int acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp) | ^~~~~~~~~~~~~~~~~~~~~~~ Fixes: 6908097aa5a7 ("ACPI: thermal_lib: Add functions returning temperature in deci-Kelvin") Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/thermal_lib.c b/drivers/acpi/thermal_lib.c index 646ff6bda6dd..4e0519ca9739 100644 --- a/drivers/acpi/thermal_lib.c +++ b/drivers/acpi/thermal_lib.c @@ -9,6 +9,7 @@ #include #include #include +#include "internal.h" /* * Minimum temperature for full military grade is 218°K (-55°C) and From cf35791476fcb3230b98a42241a56242d60ebdd3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:35 -0800 Subject: [PATCH 0176/1562] perf/x86/intel/uncore: Generic uncore_get_uncores and MMIO format of SPR Factor out SPR_UNCORE_MMIO_COMMON_FORMAT which can be reused by Granite Rapids in the following patch. Granite Rapids have more uncore units than Sapphire Rapids. Add new parameters to support adjustable uncore units. No functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 34 +++++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 8250f0f59c2b..fc6587016af7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6079,13 +6079,16 @@ static struct uncore_event_desc spr_uncore_imc_events[] = { { /* end: all zeroes */ }, }; +#define SPR_UNCORE_MMIO_COMMON_FORMAT() \ + SPR_UNCORE_COMMON_FORMAT(), \ + .ops = &spr_uncore_mmio_ops + static struct intel_uncore_type spr_uncore_imc = { - SPR_UNCORE_COMMON_FORMAT(), + SPR_UNCORE_MMIO_COMMON_FORMAT(), .name = "imc", .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, - .ops = &spr_uncore_mmio_ops, .event_descs = spr_uncore_imc_events, }; @@ -6412,7 +6415,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, static struct intel_uncore_type ** uncore_get_uncores(enum uncore_access_type type_id, int num_extra, - struct intel_uncore_type **extra) + struct intel_uncore_type **extra, int max_num_types, + struct intel_uncore_type **uncores) { struct intel_uncore_type **types, **start_types; int i; @@ -6421,9 +6425,9 @@ uncore_get_uncores(enum uncore_access_type type_id, int num_extra, /* Only copy the customized features */ for (; *types; types++) { - if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES) + if ((*types)->type_id >= max_num_types) continue; - uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); + uncore_type_customized_copy(*types, uncores[(*types)->type_id]); } for (i = 0; i < num_extra; i++, types++) @@ -6470,7 +6474,9 @@ void spr_uncore_cpu_init(void) uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, UNCORE_SPR_MSR_EXTRA_UNCORES, - spr_msr_uncores); + spr_msr_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA); if (type) { @@ -6552,7 +6558,9 @@ int spr_uncore_pci_init(void) spr_update_device_location(UNCORE_SPR_M3UPI); uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, UNCORE_SPR_PCI_EXTRA_UNCORES, - spr_pci_uncores); + spr_pci_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); return 0; } @@ -6560,12 +6568,16 @@ void spr_uncore_mmio_init(void) { int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true); - if (ret) - uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); - else { + if (ret) { + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); + } else { uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, UNCORE_SPR_MMIO_EXTRA_UNCORES, - spr_mmio_uncores); + spr_mmio_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2; } From b560e0cd882b11921c84307efe139f1247434c5e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:36 -0800 Subject: [PATCH 0177/1562] perf/x86/uncore: Use u64 to replace unsigned for the uncore offsets array The current perf doesn't save the complete address of an uncore unit. The complete address of each unit is calculated by the base address + offset. The type of the base address is u64, while the type of offset is unsigned. In the old platforms (without the discovery table method), the base address and offset are hard coded in the driver. Perf can always use the lowest address as the base address. Everything works well. In the new platforms (starting from SPR), the discovery table provides a complete address for all uncore units. To follow the current framework/codes, when parsing the discovery table, the complete address of the first box is stored as a base address. The offset of the following units is calculated by the complete address of the unit minus the base address (the address of the first unit). On GNR, the latter units may have a lower address compared to the first unit. So the offset is a negative value. The upper 32 bits are lost when casting a negative u64 to an unsigned type. Use u64 to replace unsigned for the uncore offsets array to correct the above case. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-2-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.h | 6 +++--- arch/x86/events/intel/uncore_discovery.c | 5 +++-- arch/x86/events/intel/uncore_discovery.h | 2 +- arch/x86/events/intel/uncore_nhmex.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index c30fb5bb1222..7428ecaddf72 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -72,9 +72,9 @@ struct intel_uncore_type { unsigned single_fixed:1; unsigned pair_ctr_ctl:1; union { - unsigned *msr_offsets; - unsigned *pci_offsets; - unsigned *mmio_offsets; + u64 *msr_offsets; + u64 *pci_offsets; + u64 *mmio_offsets; }; unsigned *box_ids; struct event_constraint unconstrainted; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index cb488e41807c..9a698a92962a 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -125,7 +125,8 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit, int die, bool parsed) { struct intel_uncore_discovery_type *type; - unsigned int *box_offset, *ids; + unsigned int *ids; + u64 *box_offset; int i; if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) { @@ -153,7 +154,7 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit, if (!type) return; - box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL); if (!box_offset) return; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 6ee80ad3423e..22e769a81103 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -125,7 +125,7 @@ struct intel_uncore_discovery_type { u8 ctr_offset; /* Counter 0 offset */ u16 num_boxes; /* number of boxes for the uncore block */ unsigned int *ids; /* Box IDs */ - unsigned int *box_offset; /* Box offset */ + u64 *box_offset; /* Box offset */ }; bool intel_uncore_has_discovery_tables(int *ignore); diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 173e2674be6e..56eea2c66cfb 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -306,7 +306,7 @@ static const struct attribute_group nhmex_uncore_cbox_format_group = { }; /* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { +static u64 nhmex_cbox_msr_offsets[] = { 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index fc6587016af7..344319ab6dd5 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5278,7 +5278,7 @@ void snr_uncore_mmio_init(void) /* ICX uncore support */ -static unsigned icx_cha_msr_offsets[] = { +static u64 icx_cha_msr_offsets[] = { 0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310, 0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e, 0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a, @@ -5326,7 +5326,7 @@ static struct intel_uncore_type icx_uncore_chabox = { .format_group = &snr_uncore_chabox_format_group, }; -static unsigned icx_msr_offsets[] = { +static u64 icx_msr_offsets[] = { 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0, }; @@ -6184,7 +6184,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { */ #define SPR_UNCORE_UPI_NUM_BOXES 4 -static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = { +static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = { 0, 0x8000, 0x10000, 0x18000 }; From 632c4bf6d007862307440b177d9fee829857e8bb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:37 -0800 Subject: [PATCH 0178/1562] perf/x86/intel/uncore: Support Granite Rapids The same as Sapphire Rapids, Granite Rapids also supports the discovery table feature. All the basic uncore PMON information can be retrieved from the discovery table which resides in the BIOS. There are 4 new units are added on Granite Rapids, b2cmi, b2cxl, ubox, and mdf_sbo. The layout of the counters is exactly the same as the generic uncore counters. Only add a name for the new units. All the details can be retrieved from the discovery table. The description of the new units can be found at https://www.intel.com/content/www/us/en/secure/content-details/772943/content-details.html The other units, e.g., cha, iio, irp, pcu, and imc, are the same as Sapphire Rapids. Ignore the upi and b2upi units in the discovery table, which are broken for now. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-3-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 10 ++++ arch/x86/events/intel/uncore.h | 4 ++ arch/x86/events/intel/uncore_snbep.c | 87 ++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 01023aa5125b..7fb1c54c9879 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1814,6 +1814,14 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = { .uncore_units_ignore = spr_uncore_units_ignore, }; +static const struct intel_uncore_init_fun gnr_uncore_init __initconst = { + .cpu_init = gnr_uncore_cpu_init, + .pci_init = gnr_uncore_pci_init, + .mmio_init = gnr_uncore_mmio_init, + .use_discovery = true, + .uncore_units_ignore = gnr_uncore_units_ignore, +}; + static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, @@ -1865,6 +1873,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), {}, diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 7428ecaddf72..4838502d89ae 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -593,6 +593,7 @@ extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; extern int spr_uncore_units_ignore[]; +extern int gnr_uncore_units_ignore[]; /* uncore_snb.c */ int snb_uncore_pci_init(void); @@ -634,6 +635,9 @@ void icx_uncore_mmio_init(void); int spr_uncore_pci_init(void); void spr_uncore_cpu_init(void); void spr_uncore_mmio_init(void); +int gnr_uncore_pci_init(void); +void gnr_uncore_cpu_init(void); +void gnr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 344319ab6dd5..ab31cda797df 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6584,3 +6584,90 @@ void spr_uncore_mmio_init(void) } /* end of SPR uncore support */ + +/* GNR uncore support */ + +#define UNCORE_GNR_NUM_UNCORE_TYPES 23 +#define UNCORE_GNR_TYPE_15 15 +#define UNCORE_GNR_B2UPI 18 +#define UNCORE_GNR_TYPE_21 21 +#define UNCORE_GNR_TYPE_22 22 + +int gnr_uncore_units_ignore[] = { + UNCORE_SPR_UPI, + UNCORE_GNR_TYPE_15, + UNCORE_GNR_B2UPI, + UNCORE_GNR_TYPE_21, + UNCORE_GNR_TYPE_22, + UNCORE_IGNORE_END +}; + +static struct intel_uncore_type gnr_uncore_ubox = { + .name = "ubox", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type gnr_uncore_b2cmi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "b2cmi", +}; + +static struct intel_uncore_type gnr_uncore_b2cxl = { + SPR_UNCORE_MMIO_COMMON_FORMAT(), + .name = "b2cxl", +}; + +static struct intel_uncore_type gnr_uncore_mdf_sbo = { + .name = "mdf_sbo", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { + &spr_uncore_chabox, + &spr_uncore_iio, + &spr_uncore_irp, + NULL, + &spr_uncore_pcu, + &gnr_uncore_ubox, + &spr_uncore_imc, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &gnr_uncore_b2cmi, + &gnr_uncore_b2cxl, + NULL, + NULL, + &gnr_uncore_mdf_sbo, + NULL, + NULL, +}; + +void gnr_uncore_cpu_init(void) +{ + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); +} + +int gnr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); + return 0; +} + +void gnr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); +} + +/* end of GNR uncore support */ From 388d76175bd9bbad52bbff25c88361d9e5c6615e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:38 -0800 Subject: [PATCH 0179/1562] perf/x86/intel/uncore: Support IIO free-running counters on GNR The free-running counters for IIO uncore blocks on Granite Rapids are similar to Sapphire Rapids. The key difference is the offset of the registers. The number of the IIO uncore blocks can also be retrieved from the discovery table. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ab31cda797df..aeaa8efe3c62 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6648,11 +6648,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { NULL, }; +static struct freerunning_counters gnr_iio_freerunning[] = { + [SPR_IIO_MSR_IOCLK] = { 0x290e, 0x01, 0x10, 1, 48 }, + [SPR_IIO_MSR_BW_IN] = { 0x360e, 0x10, 0x80, 8, 48 }, + [SPR_IIO_MSR_BW_OUT] = { 0x2e0e, 0x10, 0x80, 8, 48 }, +}; + void gnr_uncore_cpu_init(void) { - uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, 0, NULL, + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, + UNCORE_SPR_MSR_EXTRA_UNCORES, + spr_msr_uncores, UNCORE_GNR_NUM_UNCORE_TYPES, gnr_uncores); + spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); + spr_uncore_iio_free_running.freerunning = gnr_iio_freerunning; } int gnr_uncore_pci_init(void) From cb4a6ccf35839895da63fcf6134d6fbd13224805 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:39 -0800 Subject: [PATCH 0180/1562] perf/x86/intel/uncore: Support Sierra Forest and Grand Ridge The same as Granite Rapids, the Sierra Forest and Grand Ridge also supports the discovery table feature and the same type of the uncore units. The difference of the available units and counters can be retrieved from the discovery table automatically. Just add the CPU model ID. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-5-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7fb1c54c9879..7927c0b832fa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1877,6 +1877,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); From bb1f9e39c1bf7349405a48d2c77087dff6cea32b Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 24 Nov 2023 20:49:01 +0200 Subject: [PATCH 0181/1562] docs: mtd: spi-nor: add sections about flash additions and testing Add sections about how to propose a new flash addition and about the minimum testing requirements. Reviewed-by: Michael Walle Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20231124184902.1194235-2-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- Documentation/driver-api/mtd/spi-nor.rst | 202 +++++++++++++++++++++++ 1 file changed, 202 insertions(+) diff --git a/Documentation/driver-api/mtd/spi-nor.rst b/Documentation/driver-api/mtd/spi-nor.rst index c22f8c0f7950..628096e578c7 100644 --- a/Documentation/driver-api/mtd/spi-nor.rst +++ b/Documentation/driver-api/mtd/spi-nor.rst @@ -63,3 +63,205 @@ The main API is spi_nor_scan(). Before you call the hook, a driver should initialize the necessary fields for spi_nor{}. Please see drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to spi-fsl-qspi.c when you want to write a new driver for a SPI NOR controller. + +How to propose a new flash addition +----------------------------------- + +Most SPI NOR flashes comply with the JEDEC JESD216 +Serial Flash Discoverable Parameter (SFDP) standard. SFDP describes +the functional and feature capabilities of serial flash devices in a +standard set of internal read-only parameter tables. + +The SPI NOR driver queries the SFDP tables in order to determine the +flash's parameters and settings. If the flash defines the SFDP tables +it's likely that you won't need a flash entry at all, and instead +rely on the generic flash driver which probes the flash solely based +on its SFDP data. All one has to do is to specify the "jedec,spi-nor" +compatible in the device tree. + +There are cases however where you need to define an explicit flash +entry. This typically happens when the flash has settings or support +that is not covered by the SFDP tables (e.g. Block Protection), or +when the flash contains mangled SFDP data. If the later, one needs +to implement the ``spi_nor_fixups`` hooks in order to amend the SFDP +parameters with the correct values. + +Minimum testing requirements +----------------------------- + +Do all the tests from below and paste them in the commit's comments +section, after the ``---`` marker. + +1) Specify the controller that you used to test the flash and specify + the frequency at which the flash was operated, e.g.:: + + This flash is populated on the X board and was tested at Y + frequency using the Z (put compatible) SPI controller. + +2) Dump the sysfs entries and print the md5/sha1/sha256 SFDP checksum:: + + root@1:~# cat /sys/bus/spi/devices/spi0.0/spi-nor/partname + sst26vf064b + root@1:~# cat /sys/bus/spi/devices/spi0.0/spi-nor/jedec_id + bf2643 + root@1:~# cat /sys/bus/spi/devices/spi0.0/spi-nor/manufacturer + sst + root@1:~# xxd -p /sys/bus/spi/devices/spi0.0/spi-nor/sfdp + 53464450060102ff00060110300000ff81000106000100ffbf0001180002 + 0001fffffffffffffffffffffffffffffffffd20f1ffffffff0344eb086b + 083b80bbfeffffffffff00ffffff440b0c200dd80fd810d820914824806f + 1d81ed0f773830b030b0f7ffffff29c25cfff030c080ffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffff0004fff37f0000f57f0000f9ff + 7d00f57f0000f37f0000ffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + ffffbf2643ffb95ffdff30f260f332ff0a122346ff0f19320f1919ffffff + ffffffff00669938ff05013506040232b03072428de89888a585c09faf5a + ffff06ec060c0003080bffffffffff07ffff0202ff060300fdfd040700fc + 0300fefe0202070e + root@1:~# sha256sum /sys/bus/spi/devices/spi0.0/spi-nor/sfdp + 428f34d0461876f189ac97f93e68a05fa6428c6650b3b7baf736a921e5898ed1 /sys/bus/spi/devices/spi0.0/spi-nor/sfdp + + Please dump the SFDP tables using ``xxd -p``. It enables us to do + the reverse operation and convert the hexdump to binary with + ``xxd -rp``. Dumping the SFDP data with ``hexdump -Cv`` is accepted, + but less desirable. + +3) Dump debugfs data:: + + root@1:~# cat /sys/kernel/debug/spi-nor/spi0.0/capabilities + Supported read modes by the flash + 1S-1S-1S + opcode 0x03 + mode cycles 0 + dummy cycles 0 + 1S-1S-1S (fast read) + opcode 0x0b + mode cycles 0 + dummy cycles 8 + 1S-1S-2S + opcode 0x3b + mode cycles 0 + dummy cycles 8 + 1S-2S-2S + opcode 0xbb + mode cycles 4 + dummy cycles 0 + 1S-1S-4S + opcode 0x6b + mode cycles 0 + dummy cycles 8 + 1S-4S-4S + opcode 0xeb + mode cycles 2 + dummy cycles 4 + 4S-4S-4S + opcode 0x0b + mode cycles 2 + dummy cycles 4 + + Supported page program modes by the flash + 1S-1S-1S + opcode 0x02 + + root@1:~# cat /sys/kernel/debug/spi-nor/spi0.0/params + name sst26vf064b + id bf 26 43 bf 26 43 + size 8.00 MiB + write size 1 + page size 256 + address nbytes 3 + flags HAS_LOCK | HAS_16BIT_SR | SOFT_RESET | SWP_IS_VOLATILE + + opcodes + read 0xeb + dummy cycles 6 + erase 0x20 + program 0x02 + 8D extension none + + protocols + read 1S-4S-4S + write 1S-1S-1S + register 1S-1S-1S + + erase commands + 20 (4.00 KiB) [0] + d8 (8.00 KiB) [1] + d8 (32.0 KiB) [2] + d8 (64.0 KiB) [3] + c7 (8.00 MiB) + + sector map + region (in hex) | erase mask | flags + ------------------+------------+---------- + 00000000-00007fff | [01 ] | + 00008000-0000ffff | [0 2 ] | + 00010000-007effff | [0 3] | + 007f0000-007f7fff | [0 2 ] | + 007f8000-007fffff | [01 ] | + +4) Use `mtd-utils `__ + and verify that erase, read and page program operations work fine:: + + root@1:~# dd if=/dev/urandom of=./spi_test bs=1M count=2 + 2+0 records in + 2+0 records out + 2097152 bytes (2.1 MB, 2.0 MiB) copied, 0.848566 s, 2.5 MB/s + + root@1:~# mtd_debug erase /dev/mtd0 0 2097152 + Erased 2097152 bytes from address 0x00000000 in flash + + root@1:~# mtd_debug read /dev/mtd0 0 2097152 spi_read + Copied 2097152 bytes from address 0x00000000 in flash to spi_read + + root@1:~# hexdump spi_read + 0000000 ffff ffff ffff ffff ffff ffff ffff ffff + * + 0200000 + + root@1:~# sha256sum spi_read + 4bda3a28f4ffe603c0ec1258c0034d65a1a0d35ab7bd523a834608adabf03cc5 spi_read + + root@1:~# mtd_debug write /dev/mtd0 0 2097152 spi_test + Copied 2097152 bytes from spi_test to address 0x00000000 in flash + + root@1:~# mtd_debug read /dev/mtd0 0 2097152 spi_read + Copied 2097152 bytes from address 0x00000000 in flash to spi_read + + root@1:~# sha256sum spi* + c444216a6ba2a4a66cccd60a0dd062bce4b865dd52b200ef5e21838c4b899ac8 spi_read + c444216a6ba2a4a66cccd60a0dd062bce4b865dd52b200ef5e21838c4b899ac8 spi_test + + If the flash comes erased by default and the previous erase was ignored, + we won't catch it, thus test the erase again:: + + root@1:~# mtd_debug erase /dev/mtd0 0 2097152 + Erased 2097152 bytes from address 0x00000000 in flash + + root@1:~# mtd_debug read /dev/mtd0 0 2097152 spi_read + Copied 2097152 bytes from address 0x00000000 in flash to spi_read + + root@1:~# sha256sum spi* + 4bda3a28f4ffe603c0ec1258c0034d65a1a0d35ab7bd523a834608adabf03cc5 spi_read + c444216a6ba2a4a66cccd60a0dd062bce4b865dd52b200ef5e21838c4b899ac8 spi_test + + Dump some other relevant data:: + + root@1:~# mtd_debug info /dev/mtd0 + mtd.type = MTD_NORFLASH + mtd.flags = MTD_CAP_NORFLASH + mtd.size = 8388608 (8M) + mtd.erasesize = 4096 (4K) + mtd.writesize = 1 + mtd.oobsize = 0 + regions = 0 From 9b3eae3486c86304e047829cfe0073b66dc02b36 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 24 Nov 2023 20:49:02 +0200 Subject: [PATCH 0182/1562] docs: mtd: spi-nor: drop obsolete info The architecture description is obsolete, it no longer applies to the current SPI NOR framework state, remove it. Reviewed-by: Michael Walle Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20231124184902.1194235-3-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- Documentation/driver-api/mtd/spi-nor.rst | 62 ------------------------ 1 file changed, 62 deletions(-) diff --git a/Documentation/driver-api/mtd/spi-nor.rst b/Documentation/driver-api/mtd/spi-nor.rst index 628096e578c7..148fa4288760 100644 --- a/Documentation/driver-api/mtd/spi-nor.rst +++ b/Documentation/driver-api/mtd/spi-nor.rst @@ -2,68 +2,6 @@ SPI NOR framework ================= -Part I - Why do we need this framework? ---------------------------------------- - -SPI bus controllers (drivers/spi/) only deal with streams of bytes; the bus -controller operates agnostic of the specific device attached. However, some -controllers (such as Freescale's QuadSPI controller) cannot easily handle -arbitrary streams of bytes, but rather are designed specifically for SPI NOR. - -In particular, Freescale's QuadSPI controller must know the NOR commands to -find the right LUT sequence. Unfortunately, the SPI subsystem has no notion of -opcodes, addresses, or data payloads; a SPI controller simply knows to send or -receive bytes (Tx and Rx). Therefore, we must define a new layering scheme under -which the controller driver is aware of the opcodes, addressing, and other -details of the SPI NOR protocol. - -Part II - How does the framework work? --------------------------------------- - -This framework just adds a new layer between the MTD and the SPI bus driver. -With this new layer, the SPI NOR controller driver does not depend on the -m25p80 code anymore. - -Before this framework, the layer is like:: - - MTD - ------------------------ - m25p80 - ------------------------ - SPI bus driver - ------------------------ - SPI NOR chip - -After this framework, the layer is like:: - - MTD - ------------------------ - SPI NOR framework - ------------------------ - m25p80 - ------------------------ - SPI bus driver - ------------------------ - SPI NOR chip - -With the SPI NOR controller driver (Freescale QuadSPI), it looks like:: - - MTD - ------------------------ - SPI NOR framework - ------------------------ - fsl-quadSPI - ------------------------ - SPI NOR chip - -Part III - How can drivers use the framework? ---------------------------------------------- - -The main API is spi_nor_scan(). Before you call the hook, a driver should -initialize the necessary fields for spi_nor{}. Please see -drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to spi-fsl-qspi.c -when you want to write a new driver for a SPI NOR controller. - How to propose a new flash addition ----------------------------------- From 0257e5a3c26b3810831359d39c0821397af8bf29 Mon Sep 17 00:00:00 2001 From: Wenyu Huang Date: Sat, 25 Nov 2023 02:05:27 +0000 Subject: [PATCH 0183/1562] sched/doc: Update documentation after renames and synchronize Chinese version Update the documentation after these changes, which didn't entirely propagate the changes: e23edc86b09d ("sched/fair: Rename check_preempt_curr() to wakeup_preempt()") 03b7fad167ef ("sched: Add task_struct pointer to sched_class::set_curr_task") 2f88c8e802c8 ("sched/eevdf/doc: Modify the documented knob to base_slice_ns as well") [ mingo: Reworked the changelog. ] Signed-off-by: Wenyu Huang Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- Documentation/scheduler/sched-design-CFS.rst | 8 ++++---- .../translations/zh_CN/scheduler/sched-design-CFS.rst | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index f68919800f05..6cffffe26500 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -180,7 +180,7 @@ This is the (partial) list of the hooks: compat_yield sysctl is turned on; in that case, it places the scheduling entity at the right-most end of the red-black tree. - - check_preempt_curr(...) + - wakeup_preempt(...) This function checks if a task that entered the runnable state should preempt the currently running task. @@ -189,10 +189,10 @@ This is the (partial) list of the hooks: This function chooses the most appropriate task eligible to run next. - - set_curr_task(...) + - set_next_task(...) - This function is called when a task changes its scheduling class or changes - its task group. + This function is called when a task changes its scheduling class, changes + its task group or is scheduled. - task_tick(...) diff --git a/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst b/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst index 3076402406c4..abc6709ec3b2 100644 --- a/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst +++ b/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst @@ -80,7 +80,7 @@ p->se.vruntime。一旦p->se.vruntime变得足够大,其它的任务将成为 CFS使用纳秒粒度的计时,不依赖于任何jiffies或HZ的细节。因此CFS并不像之前的调度器那样 有“时间片”的概念,也没有任何启发式的设计。唯一可调的参数(你需要打开CONFIG_SCHED_DEBUG)是: - /sys/kernel/debug/sched/min_granularity_ns + /sys/kernel/debug/sched/base_slice_ns 它可以用来将调度器从“桌面”模式(也就是低时延)调节为“服务器”(也就是高批处理)模式。 它的默认设置是适合桌面的工作负载。SCHED_BATCH也被CFS调度器模块处理。 @@ -147,7 +147,7 @@ array)。 这个函数的行为基本上是出队,紧接着入队,除非compat_yield sysctl被开启。在那种情况下, 它将调度实体放在红黑树的最右端。 - - check_preempt_curr(...) + - wakeup_preempt(...) 这个函数检查进入可运行状态的任务能否抢占当前正在运行的任务。 @@ -155,9 +155,9 @@ array)。 这个函数选择接下来最适合运行的任务。 - - set_curr_task(...) + - set_next_task(...) - 这个函数在任务改变调度类或改变任务组时被调用。 + 这个函数在任务改变调度类,改变任务组时,或者任务被调度时被调用。 - task_tick(...) From 8e6a43961f24cf841d3c0d199521d0b284d948b9 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Fri, 17 Nov 2023 10:10:05 -0600 Subject: [PATCH 0184/1562] spi: sprd: adi: Use devm_register_restart_handler() Use device life-cycle managed register function to simplify probe error path and eliminate need for explicit remove function. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20231117161006.87734-5-afd@ti.com Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index bf01feedbf93..58c3badd9c79 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -138,8 +138,7 @@ struct sprd_adi_data { u32 slave_offset; u32 slave_addr_size; int (*read_check)(u32 val, u32 reg); - int (*restart)(struct notifier_block *this, - unsigned long mode, void *cmd); + int (*restart)(struct sys_off_data *data); void (*wdg_rst)(void *p); }; @@ -150,7 +149,6 @@ struct sprd_adi { struct hwspinlock *hwlock; unsigned long slave_vbase; unsigned long slave_pbase; - struct notifier_block restart_handler; const struct sprd_adi_data *data; }; @@ -370,11 +368,9 @@ static void sprd_adi_set_wdt_rst_mode(void *p) #endif } -static int sprd_adi_restart(struct notifier_block *this, unsigned long mode, - void *cmd, struct sprd_adi_wdg *wdg) +static int sprd_adi_restart(struct sprd_adi *sadi, unsigned long mode, + const char *cmd, struct sprd_adi_wdg *wdg) { - struct sprd_adi *sadi = container_of(this, struct sprd_adi, - restart_handler); u32 val, reboot_mode = 0; if (!cmd) @@ -448,8 +444,7 @@ static int sprd_adi_restart(struct notifier_block *this, unsigned long mode, return NOTIFY_DONE; } -static int sprd_adi_restart_sc9860(struct notifier_block *this, - unsigned long mode, void *cmd) +static int sprd_adi_restart_sc9860(struct sys_off_data *data) { struct sprd_adi_wdg wdg = { .base = PMIC_WDG_BASE, @@ -458,7 +453,7 @@ static int sprd_adi_restart_sc9860(struct notifier_block *this, .wdg_clk = PMIC_CLK_EN, }; - return sprd_adi_restart(this, mode, cmd, &wdg); + return sprd_adi_restart(data->cb_data, data->mode, data->cmd, &wdg); } static void sprd_adi_hw_init(struct sprd_adi *sadi) @@ -590,9 +585,9 @@ static int sprd_adi_probe(struct platform_device *pdev) } if (sadi->data->restart) { - sadi->restart_handler.notifier_call = sadi->data->restart; - sadi->restart_handler.priority = 128; - ret = register_restart_handler(&sadi->restart_handler); + ret = devm_register_restart_handler(&pdev->dev, + sadi->data->restart, + sadi); if (ret) { dev_err(&pdev->dev, "can not register restart handler\n"); goto put_ctlr; @@ -606,14 +601,6 @@ put_ctlr: return ret; } -static void sprd_adi_remove(struct platform_device *pdev) -{ - struct spi_controller *ctlr = dev_get_drvdata(&pdev->dev); - struct sprd_adi *sadi = spi_controller_get_devdata(ctlr); - - unregister_restart_handler(&sadi->restart_handler); -} - static struct sprd_adi_data sc9860_data = { .slave_offset = ADI_10BIT_SLAVE_OFFSET, .slave_addr_size = ADI_10BIT_SLAVE_ADDR_SIZE, @@ -657,7 +644,6 @@ static struct platform_driver sprd_adi_driver = { .of_match_table = sprd_adi_of_match, }, .probe = sprd_adi_probe, - .remove_new = sprd_adi_remove, }; module_platform_driver(sprd_adi_driver); From bdf1abd17ed209ccbb24f15002f32ef21145da91 Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Mon, 6 Nov 2023 18:06:25 -0500 Subject: [PATCH 0185/1562] ima: Reword IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY When the machine keyring is enabled, it may be used as a trust source for the .ima keyring. Add a reference to this in IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY. Signed-off-by: Eric Snowberg Signed-off-by: Mimi Zohar --- security/integrity/ima/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index a6bd817efc1a..a0a767dc5c04 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -243,7 +243,7 @@ config IMA_APPRAISE_MODSIG to accept such signatures. config IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY - bool "Permit keys validly signed by a built-in or secondary CA cert (EXPERIMENTAL)" + bool "Permit keys validly signed by a built-in, machine (if configured) or secondary (EXPERIMENTAL)" depends on SYSTEM_TRUSTED_KEYRING depends on SECONDARY_TRUSTED_KEYRING depends on INTEGRITY_ASYMMETRIC_KEYS @@ -251,14 +251,14 @@ config IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY default n help Keys may be added to the IMA or IMA blacklist keyrings, if the - key is validly signed by a CA cert in the system built-in or - secondary trusted keyrings. The key must also have the - digitalSignature usage set. + key is validly signed by a CA cert in the system built-in, + machine (if configured), or secondary trusted keyrings. The + key must also have the digitalSignature usage set. Intermediate keys between those the kernel has compiled in and the IMA keys to be added may be added to the system secondary keyring, provided they are validly signed by a key already resident in the - built-in or secondary trusted keyrings. + built-in, machine (if configured) or secondary trusted keyrings. config IMA_BLACKLIST_KEYRING bool "Create IMA machine owner blacklist keyrings (EXPERIMENTAL)" From f17167bea279d07314ee2629e7ce2dd5a754fec7 Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Mon, 6 Nov 2023 18:06:26 -0500 Subject: [PATCH 0186/1562] ima: Remove EXPERIMENTAL from Kconfig Remove the EXPERIMENTAL from the IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY Kconfig now that digitalSignature usage enforcement is set. Signed-off-by: Eric Snowberg link: https://lore.kernel.org/all/20230508220708.2888510-4-eric.snowberg@oracle.com/ Acked-by: Jarkko Sakkinen Reviewed-by: Mimi Zohar Signed-off-by: Mimi Zohar --- security/integrity/ima/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index a0a767dc5c04..b98bfe9efd0c 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -243,7 +243,7 @@ config IMA_APPRAISE_MODSIG to accept such signatures. config IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY - bool "Permit keys validly signed by a built-in, machine (if configured) or secondary (EXPERIMENTAL)" + bool "Permit keys validly signed by a built-in, machine (if configured) or secondary" depends on SYSTEM_TRUSTED_KEYRING depends on SECONDARY_TRUSTED_KEYRING depends on INTEGRITY_ASYMMETRIC_KEYS From b4af096b5df5dd131ab796c79cedc7069d8f4882 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 8 Nov 2023 07:36:27 +0000 Subject: [PATCH 0187/1562] KEYS: encrypted: Add check for strsep Add check for strsep() in order to transfer the error. Fixes: cd3bc044af48 ("KEYS: encrypted: Instantiate key with user-provided decrypted data") Signed-off-by: Chen Ni Signed-off-by: Mimi Zohar --- security/keys/encrypted-keys/encrypted.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 8af2136069d2..76f55dd13cb8 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -237,6 +237,10 @@ static int datablob_parse(char *datablob, const char **format, break; } *decrypted_data = strsep(&datablob, " \t"); + if (!*decrypted_data) { + pr_info("encrypted_key: decrypted_data is missing\n"); + break; + } ret = 0; break; case Opt_load: From 073d3d2ca7d462afc8159ca0175675b9b7b4f162 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Oct 2023 12:40:04 +0530 Subject: [PATCH 0188/1562] OPP: Level zero is valid The level zero can be used by some OPPs to drop performance state vote for the device. It is perfectly fine to allow the same. _set_opp_level() considers it as an invalid value currently and returns early. In order to support this properly, initialize the level field with U32_MAX, which denotes unused level field. Reported-by: Stephan Gerhold Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 24 ++++++++++++++++++++---- drivers/opp/of.c | 8 +++++++- include/linux/pm_opp.h | 5 ++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 84f345c69ea5..f2e2aa07b431 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -201,7 +201,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq_indexed); * @opp: opp for which level value has to be returned for * * Return: level read from device tree corresponding to the opp, else - * return 0. + * return U32_MAX. */ unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp) { @@ -221,7 +221,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_level); * @index: index of the required opp * * Return: performance state read from device tree corresponding to the - * required opp, else return 0. + * required opp, else return U32_MAX. */ unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp, unsigned int index) @@ -808,6 +808,14 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, struct dev_pm_opp *opp; opp = _find_key_ceil(dev, &temp, 0, true, _read_level, NULL); + + /* False match */ + if (temp == OPP_LEVEL_UNSET) { + dev_err(dev, "%s: OPP levels aren't available\n", __func__); + dev_pm_opp_put(opp); + return ERR_PTR(-ENODEV); + } + *level = temp; return opp; } @@ -1049,12 +1057,18 @@ static int _set_opp_bw(const struct opp_table *opp_table, static int _set_performance_state(struct device *dev, struct device *pd_dev, struct dev_pm_opp *opp, int i) { - unsigned int pstate = likely(opp) ? opp->required_opps[i]->level: 0; + unsigned int pstate = 0; int ret; if (!pd_dev) return 0; + if (likely(opp)) { + pstate = opp->required_opps[i]->level; + if (pstate == OPP_LEVEL_UNSET) + return 0; + } + ret = dev_pm_domain_set_performance_state(pd_dev, pstate); if (ret) { dev_err(dev, "Failed to set performance state of %s: %d (%d)\n", @@ -1135,7 +1149,7 @@ static int _set_opp_level(struct device *dev, struct opp_table *opp_table, int ret = 0; if (opp) { - if (!opp->level) + if (opp->level == OPP_LEVEL_UNSET) return 0; level = opp->level; @@ -1867,6 +1881,8 @@ struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table) INIT_LIST_HEAD(&opp->node); + opp->level = OPP_LEVEL_UNSET; + return opp; } diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 81fa27599d58..85fad7ca0007 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1393,8 +1393,14 @@ int of_get_required_opp_performance_state(struct device_node *np, int index) opp = _find_opp_of_np(opp_table, required_np); if (opp) { - pstate = opp->level; + if (opp->level == OPP_LEVEL_UNSET) { + pr_err("%s: OPP levels aren't available for %pOF\n", + __func__, np); + } else { + pstate = opp->level; + } dev_pm_opp_put(opp); + } dev_pm_opp_put_opp_table(opp_table); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index ccd97bcef269..af53101a1383 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -92,9 +92,12 @@ struct dev_pm_opp_config { struct device ***virt_devs; }; +#define OPP_LEVEL_UNSET U32_MAX + /** * struct dev_pm_opp_data - The data to use to initialize an OPP. - * @level: The performance level for the OPP. + * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if + * level field isn't used. * @freq: The clock rate in Hz for the OPP. * @u_volt: The voltage in uV for the OPP. */ From 6d366d0e544676bf608769b9520644e3f654ff99 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Oct 2023 15:45:21 +0530 Subject: [PATCH 0189/1562] OPP: Use _set_opp_level() for single genpd case There are two genpd (as required-opp) cases that we need to handle, devices with a single genpd and ones with multiple genpds. The multiple genpds case is clear, where the OPP core calls dev_pm_domain_attach_by_name() for them and uses the virtual devices returned by this helper to call dev_pm_domain_set_performance_state() later to change the performance state. The single genpd case however requires special handling as we need to use the same `dev` structure (instead of a virtual one provided by genpd core) for setting the performance state via dev_pm_domain_set_performance_state(). As we move towards more generic code to take care of the required OPPs, where we will recursively call dev_pm_opp_set_opp() for all the required OPPs, the above special case becomes a problem. It doesn't make sense for a device's DT entry to have both "opp-level" and single "required-opps" entry pointing to a genpd's OPP, as that would make the OPP core call dev_pm_domain_set_performance_state() for two different values for the same device structure. And so we can reuse the 'opp->level" field in such a case and call _set_opp_level() for the device. Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 6 ++++-- drivers/opp/of.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index f2e2aa07b431..aeb216f7e978 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1088,10 +1088,12 @@ static int _opp_set_required_opps_generic(struct device *dev, static int _opp_set_required_opps_genpd(struct device *dev, struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down) { - struct device **genpd_virt_devs = - opp_table->genpd_virt_devs ? opp_table->genpd_virt_devs : &dev; + struct device **genpd_virt_devs = opp_table->genpd_virt_devs; int index, target, delta, ret; + if (!genpd_virt_devs) + return 0; + /* Scaling up? Set required OPPs in normal order, else reverse */ if (!scaling_down) { index = 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 85fad7ca0007..4cdeeab5ceee 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -296,7 +296,7 @@ void _of_clear_opp(struct opp_table *opp_table, struct dev_pm_opp *opp) of_node_put(opp->np); } -static int _link_required_opps(struct dev_pm_opp *opp, +static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *opp_table, struct opp_table *required_table, int index) { struct device_node *np; @@ -314,6 +314,31 @@ static int _link_required_opps(struct dev_pm_opp *opp, return -ENODEV; } + /* + * There are two genpd (as required-opp) cases that we need to handle, + * devices with a single genpd and ones with multiple genpds. + * + * The single genpd case requires special handling as we need to use the + * same `dev` structure (instead of a virtual one provided by genpd + * core) for setting the performance state. + * + * It doesn't make sense for a device's DT entry to have both + * "opp-level" and single "required-opps" entry pointing to a genpd's + * OPP, as that would make the OPP core call + * dev_pm_domain_set_performance_state() for two different values for + * the same device structure. Lets treat single genpd configuration as a + * case where the OPP's level is directly available without required-opp + * link in the DT. + * + * Just update the `level` with the right value, which + * dev_pm_opp_set_opp() will take care of in the normal path itself. + */ + if (required_table->is_genpd && opp_table->required_opp_count == 1 && + !opp_table->genpd_virt_devs) { + if (!WARN_ON(opp->level != OPP_LEVEL_UNSET)) + opp->level = opp->required_opps[0]->level; + } + return 0; } @@ -338,7 +363,7 @@ static int _of_opp_alloc_required_opps(struct opp_table *opp_table, if (IS_ERR_OR_NULL(required_table)) continue; - ret = _link_required_opps(opp, required_table, i); + ret = _link_required_opps(opp, opp_table, required_table, i); if (ret) goto free_required_opps; } @@ -359,7 +384,7 @@ static int lazy_link_required_opps(struct opp_table *opp_table, int ret; list_for_each_entry(opp, &opp_table->opp_list, node) { - ret = _link_required_opps(opp, new_table, index); + ret = _link_required_opps(opp, opp_table, new_table, index); if (ret) return ret; } From e37440e7e2c2760475d60c5556b59c8880a7fd63 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Oct 2023 14:17:48 +0530 Subject: [PATCH 0190/1562] OPP: Call dev_pm_opp_set_opp() for required OPPs Configuring the required OPP was never properly implemented, we just took an exception for genpds and configured them directly, while leaving out all other required OPP types. Now that a standard call to dev_pm_opp_set_opp() takes care of configuring the opp->level too, the special handling for genpds can be avoided by simply calling dev_pm_opp_set_opp() for the required OPPs, which shall eventually configure the corresponding level for genpds. This also makes it possible for us to configure other type of required OPPs (no concrete users yet though), via the same path. This is how other frameworks take care of parent nodes, like clock, regulators, etc, where we recursively call the same helper. In order to call dev_pm_opp_set_opp() for the virtual genpd devices, they must share the OPP table of the genpd. Call _add_opp_dev() for them to get that done. This commit also extends the struct dev_pm_opp_config to pass required devices, for non-genpd cases, which can be used to call dev_pm_opp_set_opp() for the non-genpd required devices. Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 168 ++++++++++++++++++++--------------------- drivers/opp/of.c | 17 +++-- drivers/opp/opp.h | 8 +- include/linux/pm_opp.h | 7 +- 4 files changed, 100 insertions(+), 100 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index aeb216f7e978..e08375ed50aa 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1054,48 +1054,22 @@ static int _set_opp_bw(const struct opp_table *opp_table, return 0; } -static int _set_performance_state(struct device *dev, struct device *pd_dev, - struct dev_pm_opp *opp, int i) +/* This is only called for PM domain for now */ +static int _set_required_opps(struct device *dev, struct opp_table *opp_table, + struct dev_pm_opp *opp, bool up) { - unsigned int pstate = 0; - int ret; - - if (!pd_dev) - return 0; - - if (likely(opp)) { - pstate = opp->required_opps[i]->level; - if (pstate == OPP_LEVEL_UNSET) - return 0; - } - - ret = dev_pm_domain_set_performance_state(pd_dev, pstate); - if (ret) { - dev_err(dev, "Failed to set performance state of %s: %d (%d)\n", - dev_name(pd_dev), pstate, ret); - } - - return ret; -} - -static int _opp_set_required_opps_generic(struct device *dev, - struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down) -{ - dev_err(dev, "setting required-opps isn't supported for non-genpd devices\n"); - return -ENOENT; -} - -static int _opp_set_required_opps_genpd(struct device *dev, - struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down) -{ - struct device **genpd_virt_devs = opp_table->genpd_virt_devs; + struct device **devs = opp_table->required_devs; int index, target, delta, ret; - if (!genpd_virt_devs) + if (!devs) return 0; + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(opp_table)) + return -EBUSY; + /* Scaling up? Set required OPPs in normal order, else reverse */ - if (!scaling_down) { + if (up) { index = 0; target = opp_table->required_opp_count; delta = 1; @@ -1106,9 +1080,11 @@ static int _opp_set_required_opps_genpd(struct device *dev, } while (index != target) { - ret = _set_performance_state(dev, genpd_virt_devs[index], opp, index); - if (ret) - return ret; + if (devs[index]) { + ret = dev_pm_opp_set_opp(devs[index], opp->required_opps[index]); + if (ret) + return ret; + } index += delta; } @@ -1116,34 +1092,6 @@ static int _opp_set_required_opps_genpd(struct device *dev, return 0; } -/* This is only called for PM domain for now */ -static int _set_required_opps(struct device *dev, struct opp_table *opp_table, - struct dev_pm_opp *opp, bool up) -{ - /* required-opps not fully initialized yet */ - if (lazy_linking_pending(opp_table)) - return -EBUSY; - - if (opp_table->set_required_opps) - return opp_table->set_required_opps(dev, opp_table, opp, up); - - return 0; -} - -/* Update set_required_opps handler */ -void _update_set_required_opps(struct opp_table *opp_table) -{ - /* Already set */ - if (opp_table->set_required_opps) - return; - - /* All required OPPs will belong to genpd or none */ - if (opp_table->required_opp_tables[0]->is_genpd) - opp_table->set_required_opps = _opp_set_required_opps_genpd; - else - opp_table->set_required_opps = _opp_set_required_opps_generic; -} - static int _set_opp_level(struct device *dev, struct opp_table *opp_table, struct dev_pm_opp *opp) { @@ -2406,19 +2354,13 @@ static void _opp_detach_genpd(struct opp_table *opp_table) { int index; - if (!opp_table->genpd_virt_devs) - return; - for (index = 0; index < opp_table->required_opp_count; index++) { - if (!opp_table->genpd_virt_devs[index]) + if (!opp_table->required_devs[index]) continue; - dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false); - opp_table->genpd_virt_devs[index] = NULL; + dev_pm_domain_detach(opp_table->required_devs[index], false); + opp_table->required_devs[index] = NULL; } - - kfree(opp_table->genpd_virt_devs); - opp_table->genpd_virt_devs = NULL; } /* @@ -2445,14 +2387,14 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev, int index = 0, ret = -EINVAL; const char * const *name = names; - if (opp_table->genpd_virt_devs) - return 0; + if (!opp_table->required_devs) { + dev_err(dev, "Required OPPs not available, can't attach genpd\n"); + return -EINVAL; + } - opp_table->genpd_virt_devs = kcalloc(opp_table->required_opp_count, - sizeof(*opp_table->genpd_virt_devs), - GFP_KERNEL); - if (!opp_table->genpd_virt_devs) - return -ENOMEM; + /* Checking only the first one is enough ? */ + if (opp_table->required_devs[0]) + return 0; while (*name) { if (index >= opp_table->required_opp_count) { @@ -2468,13 +2410,25 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev, goto err; } - opp_table->genpd_virt_devs[index] = virt_dev; + /* + * Add the virtual genpd device as a user of the OPP table, so + * we can call dev_pm_opp_set_opp() on it directly. + * + * This will be automatically removed when the OPP table is + * removed, don't need to handle that here. + */ + if (!_add_opp_dev(virt_dev, opp_table->required_opp_tables[index])) { + ret = -ENOMEM; + goto err; + } + + opp_table->required_devs[index] = virt_dev; index++; name++; } if (virt_devs) - *virt_devs = opp_table->genpd_virt_devs; + *virt_devs = opp_table->required_devs; return 0; @@ -2484,10 +2438,42 @@ err: } +static int _opp_set_required_devs(struct opp_table *opp_table, + struct device *dev, + struct device **required_devs) +{ + int i; + + if (!opp_table->required_devs) { + dev_err(dev, "Required OPPs not available, can't set required devs\n"); + return -EINVAL; + } + + /* Another device that shares the OPP table has set the required devs ? */ + if (opp_table->required_devs[0]) + return 0; + + for (i = 0; i < opp_table->required_opp_count; i++) + opp_table->required_devs[i] = required_devs[i]; + + return 0; +} + +static void _opp_put_required_devs(struct opp_table *opp_table) +{ + int i; + + for (i = 0; i < opp_table->required_opp_count; i++) + opp_table->required_devs[i] = NULL; +} + static void _opp_clear_config(struct opp_config_data *data) { - if (data->flags & OPP_CONFIG_GENPD) + if (data->flags & OPP_CONFIG_REQUIRED_DEVS) + _opp_put_required_devs(data->opp_table); + else if (data->flags & OPP_CONFIG_GENPD) _opp_detach_genpd(data->opp_table); + if (data->flags & OPP_CONFIG_REGULATOR) _opp_put_regulators(data->opp_table); if (data->flags & OPP_CONFIG_SUPPORTED_HW) @@ -2601,12 +2587,22 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config) /* Attach genpds */ if (config->genpd_names) { + if (config->required_devs) + goto err; + ret = _opp_attach_genpd(opp_table, dev, config->genpd_names, config->virt_devs); if (ret) goto err; data->flags |= OPP_CONFIG_GENPD; + } else if (config->required_devs) { + ret = _opp_set_required_devs(opp_table, dev, + config->required_devs); + if (ret) + goto err; + + data->flags |= OPP_CONFIG_REQUIRED_DEVS; } ret = xa_alloc(&opp_configs, &id, data, XA_LIMIT(1, INT_MAX), diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 4cdeeab5ceee..5a7e294e56b7 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -165,7 +165,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, struct opp_table **required_opp_tables; struct device_node *required_np, *np; bool lazy = false; - int count, i; + int count, i, size; /* Traversing the first OPP node is all we need */ np = of_get_next_available_child(opp_np, NULL); @@ -179,12 +179,13 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, if (count <= 0) goto put_np; - required_opp_tables = kcalloc(count, sizeof(*required_opp_tables), - GFP_KERNEL); + size = sizeof(*required_opp_tables) + sizeof(*opp_table->required_devs); + required_opp_tables = kcalloc(count, size, GFP_KERNEL); if (!required_opp_tables) goto put_np; opp_table->required_opp_tables = required_opp_tables; + opp_table->required_devs = (void *)(required_opp_tables + count); opp_table->required_opp_count = count; for (i = 0; i < count; i++) { @@ -208,8 +209,6 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, mutex_lock(&opp_table_lock); list_add(&opp_table->lazy, &lazy_opp_tables); mutex_unlock(&opp_table_lock); - } else { - _update_set_required_opps(opp_table); } goto put_np; @@ -332,9 +331,14 @@ static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *opp_tab * * Just update the `level` with the right value, which * dev_pm_opp_set_opp() will take care of in the normal path itself. + * + * There is another case though, where a genpd's OPP table has + * required-opps set to a parent genpd. The OPP core expects the user to + * set the respective required `struct device` pointer via + * dev_pm_opp_set_config(). */ if (required_table->is_genpd && opp_table->required_opp_count == 1 && - !opp_table->genpd_virt_devs) { + !opp_table->required_devs[0]) { if (!WARN_ON(opp->level != OPP_LEVEL_UNSET)) opp->level = opp->required_opps[0]->level; } @@ -447,7 +451,6 @@ static void lazy_link_required_opp_table(struct opp_table *new_table) /* All required opp-tables found, remove from lazy list */ if (!lazy) { - _update_set_required_opps(opp_table); list_del_init(&opp_table->lazy); list_for_each_entry(opp, &opp_table->opp_list, node) diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 08366f90f16b..23dcb2fbf8c3 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -35,6 +35,7 @@ extern struct list_head opp_tables; #define OPP_CONFIG_PROP_NAME BIT(3) #define OPP_CONFIG_SUPPORTED_HW BIT(4) #define OPP_CONFIG_GENPD BIT(5) +#define OPP_CONFIG_REQUIRED_DEVS BIT(6) /** * struct opp_config_data - data for set config operations @@ -160,9 +161,9 @@ enum opp_table_access { * @rate_clk_single: Currently configured frequency for single clk. * @current_opp: Currently configured OPP for the table. * @suspend_opp: Pointer to OPP to be used during device suspend. - * @genpd_virt_devs: List of virtual devices for multiple genpd support. * @required_opp_tables: List of device OPP tables that are required by OPPs in * this table. + * @required_devs: List of devices for required OPP tables. * @required_opp_count: Number of required devices. * @supported_hw: Array of version number to support. * @supported_hw_count: Number of elements in supported_hw array. @@ -180,7 +181,6 @@ enum opp_table_access { * @path_count: Number of interconnect paths * @enabled: Set to true if the device's resources are enabled/configured. * @is_genpd: Marks if the OPP table belongs to a genpd. - * @set_required_opps: Helper responsible to set required OPPs. * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. * @@ -211,8 +211,8 @@ struct opp_table { struct dev_pm_opp *current_opp; struct dev_pm_opp *suspend_opp; - struct device **genpd_virt_devs; struct opp_table **required_opp_tables; + struct device **required_devs; unsigned int required_opp_count; unsigned int *supported_hw; @@ -229,8 +229,6 @@ struct opp_table { unsigned int path_count; bool enabled; bool is_genpd; - int (*set_required_opps)(struct device *dev, - struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down); #ifdef CONFIG_DEBUG_FS struct dentry *dentry; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index af53101a1383..81dff7facdc9 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -74,8 +74,10 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table, * @supported_hw_count: Number of elements in the array. * @regulator_names: Array of pointers to the names of the regulator, NULL terminated. * @genpd_names: Null terminated array of pointers containing names of genpd to - * attach. - * @virt_devs: Pointer to return the array of virtual devices. + * attach. Mutually exclusive with required_devs. + * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually + * exclusive with required_devs. + * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs. * * This structure contains platform specific OPP configurations for the device. */ @@ -90,6 +92,7 @@ struct dev_pm_opp_config { const char * const *regulator_names; const char * const *genpd_names; struct device ***virt_devs; + struct device **required_devs; }; #define OPP_LEVEL_UNSET U32_MAX From 925141432fa4d8325b7156e88e53d740b12d0b0e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 16 Nov 2023 15:59:35 +0530 Subject: [PATCH 0191/1562] OPP: Don't set OPP recursively for a parent genpd Like other frameworks (clk, regulator, etc.) genpd core too takes care of propagation to performance state to parent genpds. The OPP core shouldn't attempt the same, or it may result in undefined behavior. Add checks at various places to take care of the same. Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 16 +++++++++++++++- drivers/opp/of.c | 7 +++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index e08375ed50aa..4f1ca84d9ed0 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2392,6 +2392,12 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev, return -EINVAL; } + /* Genpd core takes care of propagation to parent genpd */ + if (opp_table->is_genpd) { + dev_err(dev, "%s: Operation not supported for genpds\n", __func__); + return -EOPNOTSUPP; + } + /* Checking only the first one is enough ? */ if (opp_table->required_devs[0]) return 0; @@ -2453,8 +2459,16 @@ static int _opp_set_required_devs(struct opp_table *opp_table, if (opp_table->required_devs[0]) return 0; - for (i = 0; i < opp_table->required_opp_count; i++) + for (i = 0; i < opp_table->required_opp_count; i++) { + /* Genpd core takes care of propagation to parent genpd */ + if (required_devs[i] && opp_table->is_genpd && + opp_table->required_opp_tables[i]->is_genpd) { + dev_err(dev, "%s: Operation not supported for genpds\n", __func__); + return -EOPNOTSUPP; + } + opp_table->required_devs[i] = required_devs[i]; + } return 0; } diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 5a7e294e56b7..f9f0b22bccbb 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -339,8 +339,11 @@ static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *opp_tab */ if (required_table->is_genpd && opp_table->required_opp_count == 1 && !opp_table->required_devs[0]) { - if (!WARN_ON(opp->level != OPP_LEVEL_UNSET)) - opp->level = opp->required_opps[0]->level; + /* Genpd core takes care of propagation to parent genpd */ + if (!opp_table->is_genpd) { + if (!WARN_ON(opp->level != OPP_LEVEL_UNSET)) + opp->level = opp->required_opps[0]->level; + } } return 0; From 19cc8b1819a40410c50a3efab6cf27b73298deb5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 28 Nov 2023 12:31:38 +0530 Subject: [PATCH 0192/1562] OPP: Check for invalid OPP in dev_pm_opp_find_level_ceil() _find_key_ceil() may return an error and that must be checked before passing the same to dev_pm_opp_put(). Fixes: 41907aa4ae37 ("OPP: Level zero is valid") Reported-by: Dan Carpenter Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 4f1ca84d9ed0..c022d548067d 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -808,6 +808,8 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, struct dev_pm_opp *opp; opp = _find_key_ceil(dev, &temp, 0, true, _read_level, NULL); + if (IS_ERR(opp)) + return opp; /* False match */ if (temp == OPP_LEVEL_UNSET) { From 4e6d4687f7645e3b4a83d915974e8749c24bf2e2 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:19 +0100 Subject: [PATCH 0193/1562] thermal: gov_power_allocator: Rename trip_max_desired_temperature Refactor the code and rename the last passive trip point field. There is a comment describing the field properly. Use shorter field name so as to allow to clarify the code. This change is not expected to alter the general functionality. Signed-off-by: Lukasz Luba [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 40 ++++++++++++--------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 83d4f451b1a9..97a8a6e4e1b0 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -59,9 +59,8 @@ static inline s64 div_frac(s64 x, s64 y) * governor switches on when this trip point is crossed. * If the thermal zone only has one passive trip point, * @trip_switch_on should be NULL. - * @trip_max_desired_temperature: last passive trip point of the thermal - * zone. The temperature we are - * controlling for. + * @trip_max: last passive trip point of the thermal zone. The + * temperature we are controlling for. */ struct power_allocator_params { bool allocated_tzp; @@ -69,7 +68,7 @@ struct power_allocator_params { s32 prev_err; u32 sustainable_power; const struct thermal_trip *trip_switch_on; - const struct thermal_trip *trip_max_desired_temperature; + const struct thermal_trip *trip_max; }; /** @@ -93,7 +92,7 @@ static u32 estimate_sustainable_power(struct thermal_zone_device *tz) struct thermal_cooling_device *cdev = instance->cdev; u32 min_power; - if (instance->trip != params->trip_max_desired_temperature) + if (instance->trip != params->trip_max) continue; if (!cdev_is_power_actor(cdev)) @@ -379,8 +378,7 @@ static int allocate_power(struct thermal_zone_device *tz, { struct thermal_instance *instance; struct power_allocator_params *params = tz->governor_data; - const struct thermal_trip *trip_max_desired_temperature = - params->trip_max_desired_temperature; + const struct thermal_trip *trip_max = params->trip_max; u32 *req_power, *max_power, *granted_power, *extra_actor_power; u32 *weighted_req_power; u32 total_req_power, max_allocatable_power, total_weighted_req_power; @@ -390,7 +388,7 @@ static int allocate_power(struct thermal_zone_device *tz, num_actors = 0; total_weight = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if ((instance->trip == trip_max_desired_temperature) && + if ((instance->trip == trip_max) && cdev_is_power_actor(instance->cdev)) { num_actors++; total_weight += instance->weight; @@ -429,7 +427,7 @@ static int allocate_power(struct thermal_zone_device *tz, int weight; struct thermal_cooling_device *cdev = instance->cdev; - if (instance->trip != trip_max_desired_temperature) + if (instance->trip != trip_max) continue; if (!cdev_is_power_actor(cdev)) @@ -465,7 +463,7 @@ static int allocate_power(struct thermal_zone_device *tz, total_granted_power = 0; i = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if (instance->trip != trip_max_desired_temperature) + if (instance->trip != trip_max) continue; if (!cdev_is_power_actor(instance->cdev)) @@ -531,13 +529,13 @@ static void get_governor_trips(struct thermal_zone_device *tz, if (last_passive) { params->trip_switch_on = first_passive; - params->trip_max_desired_temperature = last_passive; + params->trip_max = last_passive; } else if (first_passive) { params->trip_switch_on = NULL; - params->trip_max_desired_temperature = first_passive; + params->trip_max = first_passive; } else { params->trip_switch_on = NULL; - params->trip_max_desired_temperature = last_active; + params->trip_max = last_active; } } @@ -556,8 +554,8 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update) list_for_each_entry(instance, &tz->thermal_instances, tz_node) { struct thermal_cooling_device *cdev = instance->cdev; - if (instance->trip != params->trip_max_desired_temperature || - (!cdev_is_power_actor(instance->cdev))) + if (instance->trip != params->trip_max || + !cdev_is_power_actor(instance->cdev)) continue; instance->target = 0; @@ -642,12 +640,10 @@ static int power_allocator_bind(struct thermal_zone_device *tz) get_governor_trips(tz, params); - if (params->trip_max_desired_temperature) { - int temp = params->trip_max_desired_temperature->temperature; - + if (params->trip_max) estimate_pid_constants(tz, tz->tzp->sustainable_power, - params->trip_switch_on, temp); - } + params->trip_switch_on, + params->trip_max->temperature); reset_pid_controller(params); @@ -688,7 +684,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, * We get called for every trip point but we only need to do * our calculations once */ - if (trip != params->trip_max_desired_temperature) + if (trip != params->trip_max) return 0; trip = params->trip_switch_on; @@ -702,7 +698,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, tz->passive = 1; - return allocate_power(tz, params->trip_max_desired_temperature->temperature); + return allocate_power(tz, params->trip_max->temperature); } static struct thermal_governor thermal_gov_power_allocator = { From e83747c2f8e3cc5e284e37a8921099f1901d79d8 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:20 +0100 Subject: [PATCH 0194/1562] thermal: gov_power_allocator: Set up trip points earlier Set up the trip points at the beginning of the binding function. This simplifies the code a bit and allows for further cleanups. Also add a check to fail the binding if the last passive trip point is not found. Signed-off-by: Lukasz Luba [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 97a8a6e4e1b0..0dfc5b5ab523 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -617,14 +617,24 @@ static int power_allocator_bind(struct thermal_zone_device *tz) int ret; struct power_allocator_params *params; - ret = check_power_actors(tz); - if (ret) - return ret; - params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; + get_governor_trips(tz, params); + if (!params->trip_max) { + dev_warn(&tz->device, "power_allocator: missing trip_max\n"); + kfree(params); + return -EINVAL; + } + + ret = check_power_actors(tz); + if (ret) { + dev_warn(&tz->device, "power_allocator: binding failed\n"); + kfree(params); + return ret; + } + if (!tz->tzp) { tz->tzp = kzalloc(sizeof(*tz->tzp), GFP_KERNEL); if (!tz->tzp) { @@ -638,12 +648,9 @@ static int power_allocator_bind(struct thermal_zone_device *tz) if (!tz->tzp->sustainable_power) dev_warn(&tz->device, "power_allocator: sustainable_power will be estimated\n"); - get_governor_trips(tz, params); - - if (params->trip_max) - estimate_pid_constants(tz, tz->tzp->sustainable_power, - params->trip_switch_on, - params->trip_max->temperature); + estimate_pid_constants(tz, tz->tzp->sustainable_power, + params->trip_switch_on, + params->trip_max->temperature); reset_pid_controller(params); From c7568e78411a67b28fe23acda934cd26d9dc42a3 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:21 +0100 Subject: [PATCH 0195/1562] thermal: gov_power_allocator: Check the cooling devices only for trip_max The throttling logic only cares about the last passive trip point and the cooling devices attached to it. Therefore, there is no need to bail out if other trip points have cooling devices which are not a supported by the IPA. Check the cooling devices only for 'trip_max' during the binding. Signed-off-by: Lukasz Luba [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 0dfc5b5ab523..4b9ef04577a9 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -578,6 +578,7 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update) * check_power_actors() - Check all cooling devices and warn when they are * not power actors * @tz: thermal zone to operate on + * @params: power allocator private data * * Check all cooling devices in the @tz and warn every time they are missing * power actor API. The warning should help to investigate the issue, which @@ -586,12 +587,16 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update) * Return: 0 on success, -EINVAL if any cooling device does not implement * the power actor API. */ -static int check_power_actors(struct thermal_zone_device *tz) +static int check_power_actors(struct thermal_zone_device *tz, + struct power_allocator_params *params) { struct thermal_instance *instance; int ret = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (instance->trip != params->trip_max) + continue; + if (!cdev_is_power_actor(instance->cdev)) { dev_warn(&tz->device, "power_allocator: %s is not a power actor\n", instance->cdev->type); @@ -628,7 +633,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz) return -EINVAL; } - ret = check_power_actors(tz); + ret = check_power_actors(tz, params); if (ret) { dev_warn(&tz->device, "power_allocator: binding failed\n"); kfree(params); From 499cc391b41c8ccf5f5eae4c85e1725a037f138f Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:22 +0100 Subject: [PATCH 0196/1562] thermal: gov_power_allocator: Rearrange local variables Rearrange the order of local variable definitions in multiple functions so as to follow the kernel coding style in that respect. Also, move local variable definitions located in nested code blocks to the beginning of each function to improve the visibility of all local variables in use. This change is not expected to alter the general functionality. Signed-off-by: Lukasz Luba [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 39 ++++++++++++++------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 4b9ef04577a9..79621b42ead3 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -84,13 +84,14 @@ struct power_allocator_params { */ static u32 estimate_sustainable_power(struct thermal_zone_device *tz) { - u32 sustainable_power = 0; - struct thermal_instance *instance; struct power_allocator_params *params = tz->governor_data; + struct thermal_cooling_device *cdev; + struct thermal_instance *instance; + u32 sustainable_power = 0; + u32 min_power; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - struct thermal_cooling_device *cdev = instance->cdev; - u32 min_power; + cdev = instance->cdev; if (instance->trip != params->trip_max) continue; @@ -211,10 +212,10 @@ static u32 pid_controller(struct thermal_zone_device *tz, int control_temp, u32 max_allocatable_power) { + struct power_allocator_params *params = tz->governor_data; s64 p, i, d, power_range; s32 err, max_power_frac; u32 sustainable_power; - struct power_allocator_params *params = tz->governor_data; max_power_frac = int_to_frac(max_allocatable_power); @@ -373,20 +374,20 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors, } } -static int allocate_power(struct thermal_zone_device *tz, - int control_temp) +static int allocate_power(struct thermal_zone_device *tz, int control_temp) { - struct thermal_instance *instance; + u32 total_req_power, max_allocatable_power, total_weighted_req_power; + u32 *req_power, *max_power, *granted_power, *extra_actor_power; struct power_allocator_params *params = tz->governor_data; const struct thermal_trip *trip_max = params->trip_max; - u32 *req_power, *max_power, *granted_power, *extra_actor_power; - u32 *weighted_req_power; - u32 total_req_power, max_allocatable_power, total_weighted_req_power; u32 total_granted_power, power_range; - int i, num_actors, total_weight, ret = 0; + struct thermal_cooling_device *cdev; + struct thermal_instance *instance; + u32 *weighted_req_power; + int i, weight, ret = 0; + int total_weight = 0; + int num_actors = 0; - num_actors = 0; - total_weight = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { if ((instance->trip == trip_max) && cdev_is_power_actor(instance->cdev)) { @@ -424,8 +425,7 @@ static int allocate_power(struct thermal_zone_device *tz, max_allocatable_power = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - int weight; - struct thermal_cooling_device *cdev = instance->cdev; + cdev = instance->cdev; if (instance->trip != trip_max) continue; @@ -547,12 +547,13 @@ static void reset_pid_controller(struct power_allocator_params *params) static void allow_maximum_power(struct thermal_zone_device *tz, bool update) { - struct thermal_instance *instance; struct power_allocator_params *params = tz->governor_data; + struct thermal_cooling_device *cdev; + struct thermal_instance *instance; u32 req_power; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - struct thermal_cooling_device *cdev = instance->cdev; + cdev = instance->cdev; if (instance->trip != params->trip_max || !cdev_is_power_actor(instance->cdev)) @@ -619,8 +620,8 @@ static int check_power_actors(struct thermal_zone_device *tz, */ static int power_allocator_bind(struct thermal_zone_device *tz) { - int ret; struct power_allocator_params *params; + int ret; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) From 30e1178c100df8c8560f4d338fdb6f4fcd27e676 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:23 +0100 Subject: [PATCH 0197/1562] thermal: gov_power_allocator: Use shorter paths to access data when possible The 'cdev' pointer in allow_maximum_power() is valid, so there is no need to use 'instance->cdev' instead of it. This change is not expected to alter the general functionality. Signed-off-by: Lukasz Luba [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 79621b42ead3..0f7f8278eacc 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -560,7 +560,7 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update) continue; instance->target = 0; - mutex_lock(&instance->cdev->lock); + mutex_lock(&cdev->lock); /* * Call for updating the cooling devices local stats and avoid * periods of dozen of seconds when those have not been @@ -569,9 +569,9 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update) cdev->ops->get_requested_power(cdev, &req_power); if (update) - __thermal_cdev_update(instance->cdev); + __thermal_cdev_update(cdev); - mutex_unlock(&instance->cdev->lock); + mutex_unlock(&cdev->lock); } } From 0458d536ae97c5107b81810778d050da04d83fa2 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:24 +0100 Subject: [PATCH 0198/1562] thermal: gov_power_allocator: Remove excessive local variables Local variable 'ret' in allocate_power() is only used in the return statement, so drop it. Local variable 'trip_max' in allocate_power() is only used for caching the params->trip_max value which may as well be accessed directly as needed, so drop it either. This change is not expected to alter the general functionality. Signed-off-by: Lukasz Luba [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 0f7f8278eacc..e6d2f0fe8d2f 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -379,17 +379,16 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) u32 total_req_power, max_allocatable_power, total_weighted_req_power; u32 *req_power, *max_power, *granted_power, *extra_actor_power; struct power_allocator_params *params = tz->governor_data; - const struct thermal_trip *trip_max = params->trip_max; u32 total_granted_power, power_range; struct thermal_cooling_device *cdev; struct thermal_instance *instance; u32 *weighted_req_power; - int i, weight, ret = 0; int total_weight = 0; int num_actors = 0; + int i, weight; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if ((instance->trip == trip_max) && + if ((instance->trip == params->trip_max) && cdev_is_power_actor(instance->cdev)) { num_actors++; total_weight += instance->weight; @@ -427,7 +426,7 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) list_for_each_entry(instance, &tz->thermal_instances, tz_node) { cdev = instance->cdev; - if (instance->trip != trip_max) + if (instance->trip != params->trip_max) continue; if (!cdev_is_power_actor(cdev)) @@ -463,7 +462,7 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) total_granted_power = 0; i = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if (instance->trip != trip_max) + if (instance->trip != params->trip_max) continue; if (!cdev_is_power_actor(instance->cdev)) @@ -484,7 +483,7 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) kfree(req_power); - return ret; + return 0; } /** From 401888e7206778db54b79e6b3c25a2f1461413e6 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 25 Oct 2023 20:22:25 +0100 Subject: [PATCH 0199/1562] thermal: gov_power_allocator: Rearrange initialization of local variables Rearrange the initialization of local variables in allocate_power() so as to improve code clarity and the visibility of the initial values. This change is not expected to alter the general functionality. Signed-off-by: Lukasz Luba [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index e6d2f0fe8d2f..785fff14223d 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -376,16 +376,19 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors, static int allocate_power(struct thermal_zone_device *tz, int control_temp) { - u32 total_req_power, max_allocatable_power, total_weighted_req_power; u32 *req_power, *max_power, *granted_power, *extra_actor_power; struct power_allocator_params *params = tz->governor_data; - u32 total_granted_power, power_range; struct thermal_cooling_device *cdev; struct thermal_instance *instance; + u32 total_weighted_req_power = 0; + u32 max_allocatable_power = 0; + u32 total_granted_power = 0; + u32 total_req_power = 0; u32 *weighted_req_power; + u32 power_range, weight; int total_weight = 0; int num_actors = 0; - int i, weight; + int i = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { if ((instance->trip == params->trip_max) && @@ -418,11 +421,6 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) extra_actor_power = &req_power[3 * num_actors]; weighted_req_power = &req_power[4 * num_actors]; - i = 0; - total_weighted_req_power = 0; - total_req_power = 0; - max_allocatable_power = 0; - list_for_each_entry(instance, &tz->thermal_instances, tz_node) { cdev = instance->cdev; @@ -459,7 +457,6 @@ static int allocate_power(struct thermal_zone_device *tz, int control_temp) total_weighted_req_power, power_range, granted_power, extra_actor_power); - total_granted_power = 0; i = 0; list_for_each_entry(instance, &tz->thermal_instances, tz_node) { if (instance->trip != params->trip_max) From 877c737db9355acaa1ec2fd2b8dbdaff82605df7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 27 Nov 2023 14:51:05 -0500 Subject: [PATCH 0200/1562] cgroup/cpuset: Expose cpuset.cpus.isolated The root-only cpuset.cpus.isolated control file shows the current set of isolated CPUs in isolated partitions. This control file is currently exposed only with the cgroup_debug boot command line option which also adds the ".__DEBUG__." prefix. This is actually a useful control file if users want to find out which CPUs are currently in an isolated state by the cpuset controller. Remove CFTYPE_DEBUG flag for this control file and make it available by default without any prefix. The test_cpuset_prs.sh test script and the cgroup-v2.rst documentation file are also updated accordingly. Minor code change is also made in test_cpuset_prs.sh to avoid false test failure when running on debug kernel. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 7 ++++ kernel/cgroup/cpuset.c | 2 +- .../selftests/cgroup/test_cpuset_prs.sh | 32 +++++++++++-------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index cf5651a11df8..30f6ff2eba47 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2316,6 +2316,13 @@ Cpuset Interface Files treated to have an implicit value of "cpuset.cpus" in the formation of local partition. + cpuset.cpus.isolated + A read-only and root cgroup only multiple values file. + + This file shows the set of all isolated CPUs used in existing + isolated partitions. It will be empty if no isolated partition + is created. + cpuset.cpus.partition A read-write single value file which exists on non-root cpuset-enabled cgroups. This flag is owned by the parent cgroup diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1bad4007ff4b..2a16df86c55c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3974,7 +3974,7 @@ static struct cftype dfl_files[] = { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, - .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7b7c4c2b6d85..b5eb1be2248c 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -508,7 +508,7 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions - ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=$DIR/cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" @@ -593,17 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file, # if available, and compare that with the expected value. # # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may -# not be included in the *cpuset.cpus.isolated control file which contains +# not be included in the cpuset.cpus.isolated control file which contains # only CPUs in isolated partitions. # # $1 - expected isolated cpu list(s) {,} # - expected sched/domains value -# - *cpuset.cpus.isolated value = if not defined +# - cpuset.cpus.isolated value = if not defined # check_isolcpus() { @@ -611,7 +611,7 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains - ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=${CGROUP2}/cpuset.cpus.isolated if [[ $EXPECT_VAL = . ]] then EXPECT_VAL= @@ -692,14 +692,18 @@ test_fail() null_isolcpus_check() { [[ $VERBOSE -gt 0 ]] || return 0 - pause 0.02 - check_isolcpus "." - if [[ $? -ne 0 ]] - then - echo "Unexpected isolated CPUs: $ISOLCPUS" - dump_states - exit 1 - fi + # Retry a few times before printing error + RETRY=0 + while [[ $RETRY -lt 5 ]] + do + pause 0.01 + check_isolcpus "." + [[ $? -eq 0 ]] && return 0 + ((RETRY++)) + done + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 } # @@ -776,7 +780,7 @@ run_state_test() # NEWLIST=$(cat cpuset.cpus.effective) RETRY=0 - while [[ $NEWLIST != $CPULIST && $RETRY -lt 5 ]] + while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]] do # Wait a bit longer & recheck a few times pause 0.01 From 5068d84054b766efe7c6202fc71b2350d1c326f1 Mon Sep 17 00:00:00 2001 From: Yiwei Lin Date: Fri, 17 Nov 2023 16:01:06 +0800 Subject: [PATCH 0201/1562] sched/fair: Update min_vruntime for reweight_entity() correctly Since reweight_entity() may have chance to change the weight of cfs_rq->curr entity, we should also update_min_vruntime() if this is the case Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Yiwei Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lore.kernel.org/r/20231117080106.12890-1-s921975628@gmail.com --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34fe6e9490c2..bcea3d55d95d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3815,17 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (!curr) { - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ + if (!curr) __enqueue_entity(cfs_rq, se); - update_min_vruntime(cfs_rq); - } + + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); } } From 418146e39891ef1fb2284dee4cabbfe616cd21cf Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Mon, 20 Nov 2023 09:36:32 -0800 Subject: [PATCH 0202/1562] freezer,sched: Clean saved_state when restoring it during thaw Clean saved_state after using it during thaw. Cleaning the saved_state allows us to avoid some unnecessary branches in ttwu_state_match. Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231120-freezer-state-multiple-thaws-v1-2-f2e1dd7ce5a2@quicinc.com --- kernel/freezer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/freezer.c b/kernel/freezer.c index c450fa8b8b5e..43b1d1b94d9e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -187,6 +187,7 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); + p->saved_state = TASK_RUNNING; return 1; } From 0f9e0d7928d8e88d57b1482effab70edb9741ce1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 30 Nov 2023 11:52:46 +0530 Subject: [PATCH 0203/1562] perf/x86/amd: Reject branch stack for IBS events The AMD IBS PMU doesn't handle branch stacks, so it should not accept events with brstack. Signed-off-by: Namhyung Kim Signed-off-by: Ravi Bangoria Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231130062246.290-1-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 6911c5399d02..e91970b01d62 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -287,6 +287,9 @@ static int perf_ibs_init(struct perf_event *event) if (config & ~perf_ibs->config_mask) return -EINVAL; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + ret = validate_group(event); if (ret) return ret; From 2082b6956ce95b66b03983f3059744f559493d98 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Thu, 9 Nov 2023 08:39:53 +0300 Subject: [PATCH 0204/1562] mtd: rawnand: meson: handle OOB buffer according OOB layout In case of MTD_OPS_AUTO_OOB mode, MTD/NAND layer fills/reads OOB buffer according current OOB layout so we need to follow it in the driver. Signed-off-by: Arseniy Krasnov Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231109053953.3863664-1-avkrasnov@salutedevices.com --- drivers/mtd/nand/raw/meson_nand.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index 7e16a13fb438..cdb58aca59c0 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -511,7 +511,7 @@ static void meson_nfc_set_user_byte(struct nand_chip *nand, u8 *oob_buf) __le64 *info; int i, count; - for (i = 0, count = 0; i < nand->ecc.steps; i++, count += 2) { + for (i = 0, count = 0; i < nand->ecc.steps; i++, count += (2 + nand->ecc.bytes)) { info = &meson_chip->info_buf[i]; *info |= oob_buf[count]; *info |= oob_buf[count + 1] << 8; @@ -524,7 +524,7 @@ static void meson_nfc_get_user_byte(struct nand_chip *nand, u8 *oob_buf) __le64 *info; int i, count; - for (i = 0, count = 0; i < nand->ecc.steps; i++, count += 2) { + for (i = 0, count = 0; i < nand->ecc.steps; i++, count += (2 + nand->ecc.bytes)) { info = &meson_chip->info_buf[i]; oob_buf[count] = *info; oob_buf[count + 1] = *info >> 8; From 39cefd85098d12439586824c39f8e1948fac186d Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 29 Nov 2023 17:31:55 +0100 Subject: [PATCH 0205/1562] spi: introduce SPI_TRANS_FAIL_IO for error reporting The default message transfer implementation - spi_transfer_one_message - invokes the specific device driver's transfer_one(), then waits for completion. However, there is no mechanism for the device driver to report failure in the middle of the transfer. Introduce SPI_TRANS_FAIL_IO for drivers to report transfer failure. Signed-off-by: Nam Cao Acked-by: Linus Walleij Link: https://lore.kernel.org/r/4b420dac528e60f122adde16851da88e4798c1ea.1701274975.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi.c | 3 +++ include/linux/spi/spi.h | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8ead7acb99f3..a4b8c07c5951 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1361,6 +1361,9 @@ static int spi_transfer_wait(struct spi_controller *ctlr, "SPI transfer timed out\n"); return -ETIMEDOUT; } + + if (xfer->error & SPI_TRANS_FAIL_IO) + return -EIO; } return 0; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5..aa25ae04c5c3 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -461,10 +461,13 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem - * can issue the next transfer. Note: transfer_one and - * transfer_one_message are mutually exclusive; when both - * are set, the generic subsystem does not call your - * transfer_one callback. + * can issue the next transfer. If the transfer fails, the + * driver must set the flag SPI_TRANS_FAIL_IO to + * spi_transfer->error first, before calling + * spi_finalize_current_transfer(). + * Note: transfer_one and transfer_one_message are mutually + * exclusive; when both are set, the generic subsystem does + * not call your transfer_one callback. * @handle_err: the subsystem calls the driver to handle an error that occurs * in the generic implementation of transfer_one_message(). * @mem_ops: optimized/dedicated operations for interactions with SPI memory. @@ -1040,6 +1043,7 @@ struct spi_transfer { unsigned len; #define SPI_TRANS_FAIL_NO_START BIT(0) +#define SPI_TRANS_FAIL_IO BIT(1) u16 error; dma_addr_t tx_dma; From 9b2ef250b31d46f7ef522bd1bd84942f998bb3f9 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 29 Nov 2023 17:31:56 +0100 Subject: [PATCH 0206/1562] spi: spl022: switch to use default spi_transfer_one_message() Except for polling mode, this driver's transfer_one_message() makes use of interrupt handler and tasklet. This is problematic because spi_transfer_delay_exec(), who may sleep, is called in interrupt handler and tasklet. This causes the following warning: BUG: sleeping function called from invalid context at drivers/spi/spi.c:1428 Switch to use the default spi_transfer_one_message() instead, which calls spi_transfer_delay_exec() appropriately. Signed-off-by: Nam Cao Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/ae1940abd6ff6a9e77b4373cff60007c641a0c6c.1701274975.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 368 +++++++--------------------------------- 1 file changed, 64 insertions(+), 304 deletions(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index d1b6110b38fc..1e3bd6f3303a 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -338,7 +338,6 @@ struct vendor_data { * @clk: outgoing clock "SPICLK" for the SPI bus * @host: SPI framework hookup * @host_info: controller-specific data from machine setup - * @pump_transfers: Tasklet used in Interrupt Transfer mode * @cur_msg: Pointer to current spi_message being processed * @cur_transfer: Pointer to current spi_transfer * @cur_chip: pointer to current clients chip(assigned from controller_state) @@ -372,9 +371,6 @@ struct pl022 { struct clk *clk; struct spi_controller *host; struct pl022_ssp_controller *host_info; - /* Message per-transfer pump */ - struct tasklet_struct pump_transfers; - struct spi_message *cur_msg; struct spi_transfer *cur_transfer; struct chip_data *cur_chip; bool next_msg_cs_active; @@ -437,93 +433,23 @@ struct chip_data { * (vendor extension). Each of the 5 LSB in the register controls one chip * select signal. */ -static void internal_cs_control(struct pl022 *pl022, u32 command) +static void internal_cs_control(struct pl022 *pl022, bool enable) { u32 tmp; tmp = readw(SSP_CSR(pl022->virtbase)); - if (command == SSP_CHIP_SELECT) + if (enable) tmp &= ~BIT(pl022->cur_cs); else tmp |= BIT(pl022->cur_cs); writew(tmp, SSP_CSR(pl022->virtbase)); } -static void pl022_cs_control(struct pl022 *pl022, u32 command) +static void pl022_cs_control(struct spi_device *spi, bool enable) { + struct pl022 *pl022 = spi_controller_get_devdata(spi->controller); if (pl022->vendor->internal_cs_ctrl) - internal_cs_control(pl022, command); - else if (pl022->cur_gpiod) - /* - * This needs to be inverted since with GPIOLIB in - * control, the inversion will be handled by - * GPIOLIB's active low handling. The "command" - * passed into this function will be SSP_CHIP_SELECT - * which is enum:ed to 0, so we need the inverse - * (1) to activate chip select. - */ - gpiod_set_value(pl022->cur_gpiod, !command); -} - -/** - * giveback - current spi_message is over, schedule next message and call - * callback of this message. Assumes that caller already - * set message->status; dma and pio irqs are blocked - * @pl022: SSP driver private data structure - */ -static void giveback(struct pl022 *pl022) -{ - struct spi_transfer *last_transfer; - pl022->next_msg_cs_active = false; - - last_transfer = list_last_entry(&pl022->cur_msg->transfers, - struct spi_transfer, transfer_list); - - /* Delay if requested before any change in chip select */ - /* - * FIXME: This runs in interrupt context. - * Is this really smart? - */ - spi_transfer_delay_exec(last_transfer); - - if (!last_transfer->cs_change) { - struct spi_message *next_msg; - - /* - * cs_change was not set. We can keep the chip select - * enabled if there is message in the queue and it is - * for the same spi device. - * - * We cannot postpone this until pump_messages, because - * after calling msg->complete (below) the driver that - * sent the current message could be unloaded, which - * could invalidate the cs_control() callback... - */ - /* get a pointer to the next message, if any */ - next_msg = spi_get_next_queued_message(pl022->host); - - /* - * see if the next and current messages point - * to the same spi device. - */ - if (next_msg && next_msg->spi != pl022->cur_msg->spi) - next_msg = NULL; - if (!next_msg || pl022->cur_msg->state == STATE_ERROR) - pl022_cs_control(pl022, SSP_CHIP_DESELECT); - else - pl022->next_msg_cs_active = true; - - } - - pl022->cur_msg = NULL; - pl022->cur_transfer = NULL; - pl022->cur_chip = NULL; - - /* disable the SPI/SSP operation */ - writew((readw(SSP_CR1(pl022->virtbase)) & - (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); - - spi_finalize_current_message(pl022->host); + internal_cs_control(pl022, enable); } /** @@ -757,30 +683,6 @@ static void readwriter(struct pl022 *pl022) */ } -/** - * next_transfer - Move to the Next transfer in the current spi message - * @pl022: SSP driver private data structure - * - * This function moves though the linked list of spi transfers in the - * current spi message and returns with the state of current spi - * message i.e whether its last transfer is done(STATE_DONE) or - * Next transfer is ready(STATE_RUNNING) - */ -static void *next_transfer(struct pl022 *pl022) -{ - struct spi_message *msg = pl022->cur_msg; - struct spi_transfer *trans = pl022->cur_transfer; - - /* Move to next transfer */ - if (trans->transfer_list.next != &msg->transfers) { - pl022->cur_transfer = - list_entry(trans->transfer_list.next, - struct spi_transfer, transfer_list); - return STATE_RUNNING; - } - return STATE_DONE; -} - /* * This DMA functionality is only compiled in if we have * access to the generic DMA devices/DMA engine. @@ -800,7 +702,6 @@ static void unmap_free_dma_scatter(struct pl022 *pl022) static void dma_callback(void *data) { struct pl022 *pl022 = data; - struct spi_message *msg = pl022->cur_msg; BUG_ON(!pl022->sgt_rx.sgl); @@ -845,13 +746,7 @@ static void dma_callback(void *data) unmap_free_dma_scatter(pl022); - /* Update total bytes transferred */ - msg->actual_length += pl022->cur_transfer->len; - /* Move to next transfer */ - msg->state = next_transfer(pl022); - if (msg->state != STATE_DONE && pl022->cur_transfer->cs_change) - pl022_cs_control(pl022, SSP_CHIP_DESELECT); - tasklet_schedule(&pl022->pump_transfers); + spi_finalize_current_transfer(pl022->host); } static void setup_dma_scatter(struct pl022 *pl022, @@ -1189,6 +1084,9 @@ err_no_rxchan: static void terminate_dma(struct pl022 *pl022) { + if (!pl022->dma_running) + return; + struct dma_chan *rxchan = pl022->dma_rx_channel; struct dma_chan *txchan = pl022->dma_tx_channel; @@ -1200,8 +1098,7 @@ static void terminate_dma(struct pl022 *pl022) static void pl022_dma_remove(struct pl022 *pl022) { - if (pl022->dma_running) - terminate_dma(pl022); + terminate_dma(pl022); if (pl022->dma_tx_channel) dma_release_channel(pl022->dma_tx_channel); if (pl022->dma_rx_channel) @@ -1225,6 +1122,10 @@ static inline int pl022_dma_probe(struct pl022 *pl022) return 0; } +static inline void terminate_dma(struct pl022 *pl022) +{ +} + static inline void pl022_dma_remove(struct pl022 *pl022) { } @@ -1246,16 +1147,7 @@ static inline void pl022_dma_remove(struct pl022 *pl022) static irqreturn_t pl022_interrupt_handler(int irq, void *dev_id) { struct pl022 *pl022 = dev_id; - struct spi_message *msg = pl022->cur_msg; u16 irq_status = 0; - - if (unlikely(!msg)) { - dev_err(&pl022->adev->dev, - "bad message state in interrupt handler"); - /* Never fail */ - return IRQ_HANDLED; - } - /* Read the Interrupt Status Register */ irq_status = readw(SSP_MIS(pl022->virtbase)); @@ -1287,10 +1179,8 @@ static irqreturn_t pl022_interrupt_handler(int irq, void *dev_id) writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); writew((readw(SSP_CR1(pl022->virtbase)) & (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); - msg->state = STATE_ERROR; - - /* Schedule message queue handler */ - tasklet_schedule(&pl022->pump_transfers); + pl022->cur_transfer->error |= SPI_TRANS_FAIL_IO; + spi_finalize_current_transfer(pl022->host); return IRQ_HANDLED; } @@ -1318,13 +1208,7 @@ static irqreturn_t pl022_interrupt_handler(int irq, void *dev_id) "number of bytes on a 16bit bus?)\n", (u32) (pl022->rx - pl022->rx_end)); } - /* Update total bytes transferred */ - msg->actual_length += pl022->cur_transfer->len; - /* Move to next transfer */ - msg->state = next_transfer(pl022); - if (msg->state != STATE_DONE && pl022->cur_transfer->cs_change) - pl022_cs_control(pl022, SSP_CHIP_DESELECT); - tasklet_schedule(&pl022->pump_transfers); + spi_finalize_current_transfer(pl022->host); return IRQ_HANDLED; } @@ -1361,98 +1245,20 @@ static int set_up_next_transfer(struct pl022 *pl022, return 0; } -/** - * pump_transfers - Tasklet function which schedules next transfer - * when running in interrupt or DMA transfer mode. - * @data: SSP driver private data structure - * - */ -static void pump_transfers(unsigned long data) +static int do_interrupt_dma_transfer(struct pl022 *pl022) { - struct pl022 *pl022 = (struct pl022 *) data; - struct spi_message *message = NULL; - struct spi_transfer *transfer = NULL; - struct spi_transfer *previous = NULL; + int ret; - /* Get current state information */ - message = pl022->cur_msg; - transfer = pl022->cur_transfer; - - /* Handle for abort */ - if (message->state == STATE_ERROR) { - message->status = -EIO; - giveback(pl022); - return; - } - - /* Handle end of message */ - if (message->state == STATE_DONE) { - message->status = 0; - giveback(pl022); - return; - } - - /* Delay if requested at end of transfer before CS change */ - if (message->state == STATE_RUNNING) { - previous = list_entry(transfer->transfer_list.prev, - struct spi_transfer, - transfer_list); - /* - * FIXME: This runs in interrupt context. - * Is this really smart? - */ - spi_transfer_delay_exec(previous); - - /* Reselect chip select only if cs_change was requested */ - if (previous->cs_change) - pl022_cs_control(pl022, SSP_CHIP_SELECT); - } else { - /* STATE_START */ - message->state = STATE_RUNNING; - } - - if (set_up_next_transfer(pl022, transfer)) { - message->state = STATE_ERROR; - message->status = -EIO; - giveback(pl022); - return; - } - /* Flush the FIFOs and let's go! */ - flush(pl022); - - if (pl022->cur_chip->enable_dma) { - if (configure_dma(pl022)) { - dev_dbg(&pl022->adev->dev, - "configuration of DMA failed, fall back to interrupt mode\n"); - goto err_config_dma; - } - return; - } - -err_config_dma: - /* enable all interrupts except RX */ - writew(ENABLE_ALL_INTERRUPTS & ~SSP_IMSC_MASK_RXIM, SSP_IMSC(pl022->virtbase)); -} - -static void do_interrupt_dma_transfer(struct pl022 *pl022) -{ /* * Default is to enable all interrupts except RX - * this will be enabled once TX is complete */ u32 irqflags = (u32)(ENABLE_ALL_INTERRUPTS & ~SSP_IMSC_MASK_RXIM); - /* Enable target chip, if not already active */ - if (!pl022->next_msg_cs_active) - pl022_cs_control(pl022, SSP_CHIP_SELECT); + ret = set_up_next_transfer(pl022, pl022->cur_transfer); + if (ret) + return ret; - if (set_up_next_transfer(pl022, pl022->cur_transfer)) { - /* Error path */ - pl022->cur_msg->state = STATE_ERROR; - pl022->cur_msg->status = -EIO; - giveback(pl022); - return; - } /* If we're using DMA, set up DMA here */ if (pl022->cur_chip->enable_dma) { /* Configure DMA transfer */ @@ -1469,6 +1275,7 @@ err_config_dma: writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), SSP_CR1(pl022->virtbase)); writew(irqflags, SSP_IMSC(pl022->virtbase)); + return 1; } static void print_current_status(struct pl022 *pl022) @@ -1495,111 +1302,67 @@ static void print_current_status(struct pl022 *pl022) } -static void do_polling_transfer(struct pl022 *pl022) +static int do_polling_transfer(struct pl022 *pl022) { - struct spi_message *message = NULL; - struct spi_transfer *transfer = NULL; - struct spi_transfer *previous = NULL; + int ret; unsigned long time, timeout; - message = pl022->cur_msg; + /* Configuration Changing Per Transfer */ + ret = set_up_next_transfer(pl022, pl022->cur_transfer); + if (ret) + return ret; + /* Flush FIFOs and enable SSP */ + flush(pl022); + writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), + SSP_CR1(pl022->virtbase)); - while (message->state != STATE_DONE) { - /* Handle for abort */ - if (message->state == STATE_ERROR) - break; - transfer = pl022->cur_transfer; + dev_dbg(&pl022->adev->dev, "polling transfer ongoing ...\n"); - /* Delay if requested at end of transfer */ - if (message->state == STATE_RUNNING) { - previous = - list_entry(transfer->transfer_list.prev, - struct spi_transfer, transfer_list); - spi_transfer_delay_exec(previous); - if (previous->cs_change) - pl022_cs_control(pl022, SSP_CHIP_SELECT); - } else { - /* STATE_START */ - message->state = STATE_RUNNING; - if (!pl022->next_msg_cs_active) - pl022_cs_control(pl022, SSP_CHIP_SELECT); + timeout = jiffies + msecs_to_jiffies(SPI_POLLING_TIMEOUT); + while (pl022->tx < pl022->tx_end || pl022->rx < pl022->rx_end) { + time = jiffies; + readwriter(pl022); + if (time_after(time, timeout)) { + dev_warn(&pl022->adev->dev, + "%s: timeout!\n", __func__); + print_current_status(pl022); + return -ETIMEDOUT; } - - /* Configuration Changing Per Transfer */ - if (set_up_next_transfer(pl022, transfer)) { - /* Error path */ - message->state = STATE_ERROR; - break; - } - /* Flush FIFOs and enable SSP */ - flush(pl022); - writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), - SSP_CR1(pl022->virtbase)); - - dev_dbg(&pl022->adev->dev, "polling transfer ongoing ...\n"); - - timeout = jiffies + msecs_to_jiffies(SPI_POLLING_TIMEOUT); - while (pl022->tx < pl022->tx_end || pl022->rx < pl022->rx_end) { - time = jiffies; - readwriter(pl022); - if (time_after(time, timeout)) { - dev_warn(&pl022->adev->dev, - "%s: timeout!\n", __func__); - message->state = STATE_TIMEOUT; - print_current_status(pl022); - goto out; - } - cpu_relax(); - } - - /* Update total byte transferred */ - message->actual_length += pl022->cur_transfer->len; - /* Move to next transfer */ - message->state = next_transfer(pl022); - if (message->state != STATE_DONE - && pl022->cur_transfer->cs_change) - pl022_cs_control(pl022, SSP_CHIP_DESELECT); + cpu_relax(); } -out: - /* Handle end of message */ - if (message->state == STATE_DONE) - message->status = 0; - else if (message->state == STATE_TIMEOUT) - message->status = -EAGAIN; - else - message->status = -EIO; - giveback(pl022); - return; + return 0; } -static int pl022_transfer_one_message(struct spi_controller *host, - struct spi_message *msg) +static int pl022_transfer_one(struct spi_controller *host, struct spi_device *spi, + struct spi_transfer *transfer) { struct pl022 *pl022 = spi_controller_get_devdata(host); - /* Initial message state */ - pl022->cur_msg = msg; - msg->state = STATE_START; - - pl022->cur_transfer = list_entry(msg->transfers.next, - struct spi_transfer, transfer_list); + pl022->cur_transfer = transfer; /* Setup the SPI using the per chip configuration */ - pl022->cur_chip = spi_get_ctldata(msg->spi); - pl022->cur_cs = spi_get_chipselect(msg->spi, 0); + pl022->cur_chip = spi_get_ctldata(spi); + pl022->cur_cs = spi_get_chipselect(spi, 0); /* This is always available but may be set to -ENOENT */ - pl022->cur_gpiod = spi_get_csgpiod(msg->spi, 0); + pl022->cur_gpiod = spi_get_csgpiod(spi, 0); restore_state(pl022); flush(pl022); if (pl022->cur_chip->xfer_type == POLLING_TRANSFER) - do_polling_transfer(pl022); + return do_polling_transfer(pl022); else - do_interrupt_dma_transfer(pl022); + return do_interrupt_dma_transfer(pl022); +} - return 0; +static void pl022_handle_err(struct spi_controller *ctlr, struct spi_message *message) +{ + struct pl022 *pl022 = spi_controller_get_devdata(ctlr); + + terminate_dma(pl022); + writew(DISABLE_ALL_INTERRUPTS, SSP_IMSC(pl022->virtbase)); + writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); } static int pl022_unprepare_transfer_hardware(struct spi_controller *host) @@ -2138,7 +1901,9 @@ static int pl022_probe(struct amba_device *adev, const struct amba_id *id) host->cleanup = pl022_cleanup; host->setup = pl022_setup; host->auto_runtime_pm = true; - host->transfer_one_message = pl022_transfer_one_message; + host->transfer_one = pl022_transfer_one; + host->set_cs = pl022_cs_control; + host->handle_err = pl022_handle_err; host->unprepare_transfer_hardware = pl022_unprepare_transfer_hardware; host->rt = platform_info->rt; host->dev.of_node = dev->of_node; @@ -2175,10 +1940,6 @@ static int pl022_probe(struct amba_device *adev, const struct amba_id *id) goto err_no_clk; } - /* Initialize transfer pump */ - tasklet_init(&pl022->pump_transfers, pump_transfers, - (unsigned long)pl022); - /* Disable SSP */ writew((readw(SSP_CR1(pl022->virtbase)) & (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); @@ -2261,7 +2022,6 @@ pl022_remove(struct amba_device *adev) pl022_dma_remove(pl022); amba_release_regions(adev); - tasklet_disable(&pl022->pump_transfers); } #ifdef CONFIG_PM_SLEEP From cff49d58f57e5667c10a0db85d7461790bb85cf8 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 29 Nov 2023 14:43:10 +0800 Subject: [PATCH 0207/1562] spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP This commit updates the SPI subsystem, particularly affecting "SPI MEM" drivers and core parts, by replacing the -ENOTSUPP error code with -EOPNOTSUPP. The key motivations for this change are as follows: 1. The spi-nor driver currently uses EOPNOTSUPP, whereas calls to spi-mem might return ENOTSUPP. This update aims to unify the error reporting within the SPI subsystem for clarity and consistency. 2. The use of ENOTSUPP has been flagged by checkpatch as inappropriate, mainly being reserved for NFS-related errors. To align with kernel coding standards and recommendations, this change is being made. 3. By using EOPNOTSUPP, we provide more specific context to the error, indicating that a particular operation is not supported. This helps differentiate from the more generic ENOTSUPP error, allowing drivers to better handle and respond to different error scenarios. Risks and Considerations: While this change is primarily intended as a code cleanup and error code unification, there is a minor risk of breaking user-space applications that rely on specific return codes for unsupported operations. However, this risk is considered low, as such use-cases are unlikely to be common or critical. Nevertheless, developers and users should be aware of this change, especially if they have scripts or tools that specifically handle SPI error codes. This commit does not introduce any functional changes to the SPI subsystem or the affected drivers. Signed-off-by: "Chia-Lin Kao (AceLan)" Acked-by: Tudor Ambarus Reviewed-by: Mika Westerberg Acked-by: Miquel Raynal Acked-by: Michael Walle Link: https://lore.kernel.org/r/20231129064311.272422-1-acelan.kao@canonical.com Signed-off-by: Mark Brown --- drivers/mtd/nand/spi/core.c | 2 +- drivers/mtd/spi-nor/core.c | 2 +- drivers/spi/atmel-quadspi.c | 2 +- drivers/spi/spi-ath79.c | 2 +- drivers/spi/spi-bcm-qspi.c | 2 +- drivers/spi/spi-mem.c | 6 +++--- drivers/spi/spi-npcm-fiu.c | 2 +- drivers/spi/spi-ti-qspi.c | 4 ++-- drivers/spi/spi-wpcm-fiu.c | 2 +- include/linux/spi/spi-mem.h | 2 ++ 10 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index 849ccfedbc72..e0b6715e5dfe 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -974,7 +974,7 @@ static int spinand_manufacturer_match(struct spinand_device *spinand, spinand->manufacturer = manufacturer; return 0; } - return -ENOTSUPP; + return -EOPNOTSUPP; } static int spinand_id_detect(struct spinand_device *spinand) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 1c443fe568cf..87cb2047df80 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -3146,7 +3146,7 @@ int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable) int ret; ret = params->set_4byte_addr_mode(nor, enable); - if (ret && ret != -ENOTSUPP) + if (ret && ret != -EOPNOTSUPP) return ret; if (enable) { diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 3d1252566134..370c4d1572ed 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -272,7 +272,7 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op) if (atmel_qspi_is_compatible(op, &atmel_qspi_modes[i])) return i; - return -ENOTSUPP; + return -EOPNOTSUPP; } static bool atmel_qspi_supports_op(struct spi_mem *mem, diff --git a/drivers/spi/spi-ath79.c b/drivers/spi/spi-ath79.c index c9f1d1e1dcf7..b7ada981464a 100644 --- a/drivers/spi/spi-ath79.c +++ b/drivers/spi/spi-ath79.c @@ -146,7 +146,7 @@ static int ath79_exec_mem_op(struct spi_mem *mem, /* Only use for fast-read op. */ if (op->cmd.opcode != 0x0b || op->data.dir != SPI_MEM_DATA_IN || op->addr.nbytes != 3 || op->dummy.nbytes != 1) - return -ENOTSUPP; + return -EOPNOTSUPP; /* disable GPIO mode */ ath79_spi_wr(sp, AR71XX_SPI_REG_FS, 0); diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index ef08fcac2f6d..d96222e6d7d2 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -1199,7 +1199,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem, if (!op->data.nbytes || !op->addr.nbytes || op->addr.nbytes > 4 || op->data.dir != SPI_MEM_DATA_IN) - return -ENOTSUPP; + return -EOPNOTSUPP; buf = op->data.buf.in; addr = op->addr.val; diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c index edd7430d4c05..2dc8ceb85374 100644 --- a/drivers/spi/spi-mem.c +++ b/drivers/spi/spi-mem.c @@ -323,7 +323,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) return ret; if (!spi_mem_internal_supports_op(mem, op)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (ctlr->mem_ops && ctlr->mem_ops->exec_op && !spi_get_csgpiod(mem->spi, 0)) { ret = spi_mem_access_start(mem); @@ -339,7 +339,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) * read path) and expect the core to use the regular SPI * interface in other cases. */ - if (!ret || ret != -ENOTSUPP) + if (!ret || ret != -ENOTSUPP || ret != -EOPNOTSUPP) return ret; } @@ -559,7 +559,7 @@ spi_mem_dirmap_create(struct spi_mem *mem, if (ret) { desc->nodirmap = true; if (!spi_mem_supports_op(desc->mem, &desc->info.op_tmpl)) - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; else ret = 0; } diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c index 03db9f016a11..f3bb8bbc192f 100644 --- a/drivers/spi/spi-npcm-fiu.c +++ b/drivers/spi/spi-npcm-fiu.c @@ -556,7 +556,7 @@ static int npcm_fiu_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) op->data.nbytes); if (fiu->spix_mode || op->addr.nbytes > 4) - return -ENOTSUPP; + return -EOPNOTSUPP; if (fiu->clkrate != chip->clkrate) { ret = clk_set_rate(fiu->clk, chip->clkrate); diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c index fdc092a05284..a6a89c59c418 100644 --- a/drivers/spi/spi-ti-qspi.c +++ b/drivers/spi/spi-ti-qspi.c @@ -613,12 +613,12 @@ static int ti_qspi_exec_mem_op(struct spi_mem *mem, /* Only optimize read path. */ if (!op->data.nbytes || op->data.dir != SPI_MEM_DATA_IN || !op->addr.nbytes || op->addr.nbytes > 4) - return -ENOTSUPP; + return -EOPNOTSUPP; /* Address exceeds MMIO window size, fall back to regular mode. */ from = op->addr.val; if (from + op->data.nbytes > qspi->mmap_size) - return -ENOTSUPP; + return -EOPNOTSUPP; mutex_lock(&qspi->list_lock); diff --git a/drivers/spi/spi-wpcm-fiu.c b/drivers/spi/spi-wpcm-fiu.c index 852ffe013d32..d76f7b5a9b97 100644 --- a/drivers/spi/spi-wpcm-fiu.c +++ b/drivers/spi/spi-wpcm-fiu.c @@ -361,7 +361,7 @@ static int wpcm_fiu_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) wpcm_fiu_stall_host(fiu, false); - return -ENOTSUPP; + return -EOPNOTSUPP; } static int wpcm_fiu_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op) diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 6b0a7dc48a4b..f866d5c8ed32 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -233,6 +233,8 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem) * limitations) * @supports_op: check if an operation is supported by the controller * @exec_op: execute a SPI memory operation + * not all driver provides supports_op(), so it can return -EOPNOTSUPP + * if the op is not supported by the driver/controller * @get_name: get a custom name for the SPI mem device from the controller. * This might be needed if the controller driver has been ported * to use the SPI mem layer and a custom name is used to keep From 7a030abc0185b30a3fd19a7431347c6f5a82c588 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 29 Nov 2023 14:43:11 +0800 Subject: [PATCH 0208/1562] mtd: spi-nor: Stop reporting warning message when soft reset is not suported When the software reset command isn't supported, we now stop reporting the warning message to avoid unnecessary warnings and potential confusion. Reviewed-by: Dhruva Gole Reviewed-by: Michael Walle Reviewed-by: Mika Westerberg Acked-by: Pratyush Yadav Signed-off-by: "Chia-Lin Kao (AceLan)" Reviewed-by: Miquel Raynal Acked-by: Tudor Ambarus Link: https://lore.kernel.org/r/20231129064311.272422-2-acelan.kao@canonical.com Signed-off-by: Mark Brown --- drivers/mtd/spi-nor/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 87cb2047df80..96a207751cf2 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -3237,7 +3237,8 @@ static void spi_nor_soft_reset(struct spi_nor *nor) ret = spi_mem_exec_op(nor->spimem, &op); if (ret) { - dev_warn(nor->dev, "Software reset failed: %d\n", ret); + if (ret != -EOPNOTSUPP) + dev_warn(nor->dev, "Software reset failed: %d\n", ret); return; } From 7a36b901a6eb0e9945341db71ed3c45c7721cfa9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 Nov 2023 20:57:43 +0100 Subject: [PATCH 0209/1562] ACPI: OSL: Use a threaded interrupt handler for SCI In the current arrangement, all of the acpi_ev_sci_xrupt_handler() code is run as an interrupt handler for the SCI, in interrupt context. Among other things, this causes it to run with local interrupts off which can be problematic if many GPEs are enabled and they are located in the I/O address space, for example (because in that case local interrupts will be off for the duration of all of the GPE hardware accesses carried out while handling an SCI combined and that may be quite a bit of time in extreme scenarios). However, there is no particular reason why the code in question really needs to run in interrupt context and in particular, it has no specific reason to run with local interrupts off. The only real requirement is to prevent multiple instences of it from running in parallel with each other, but that can be achieved regardless. For this reason, use request_threaded_irq() instead of request_irq() for the ACPI SCI and pass IRQF_ONESHOT to it in flags to indicate that the interrupt needs to be masked while its handling thread is running so as to prevent it from re-triggering while it is being handled (and in particular until the final handled/not handled outcome is determined). While at it, drop a redundant local variable from acpi_irq(). Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello Tested-by: Mika Westerberg --- drivers/acpi/osl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index c09cc3c68633..d56dda795118 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -544,11 +544,7 @@ acpi_os_predefined_override(const struct acpi_predefined_names *init_val, static irqreturn_t acpi_irq(int irq, void *dev_id) { - u32 handled; - - handled = (*acpi_irq_handler) (acpi_irq_context); - - if (handled) { + if ((*acpi_irq_handler)(acpi_irq_context)) { acpi_irq_handled++; return IRQ_HANDLED; } else { @@ -582,7 +578,8 @@ acpi_os_install_interrupt_handler(u32 gsi, acpi_osd_handler handler, acpi_irq_handler = handler; acpi_irq_context = context; - if (request_irq(irq, acpi_irq, IRQF_SHARED, "acpi", acpi_irq)) { + if (request_threaded_irq(irq, NULL, acpi_irq, IRQF_SHARED | IRQF_ONESHOT, + "acpi", acpi_irq)) { pr_err("SCI (IRQ%d) allocation failed\n", irq); acpi_irq_handler = NULL; return AE_NOT_ACQUIRED; From 1692cf434ba13ee212495b5af795b6a07e986ce4 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 27 Nov 2023 10:52:45 -0800 Subject: [PATCH 0210/1562] perf/x86/intel/uncore: Fix NULL pointer dereference issue in upi_fill_topology() Get logical socket id instead of physical id in discover_upi_topology() to avoid out-of-bound access on 'upi = &type->topology[nid][idx];' line that leads to NULL pointer dereference in upi_fill_topology() Fixes: f680b6e6062e ("perf/x86/intel/uncore: Enable UPI topology discovery for Icelake Server") Reported-by: Kyle Meyer Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Kyle Meyer Link: https://lore.kernel.org/r/20231127185246.2371939-2-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index aeaa8efe3c62..1efbacfff47d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5596,7 +5596,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i struct pci_dev *ubox = NULL; struct pci_dev *dev = NULL; u32 nid, gid; - int i, idx, ret = -EPERM; + int i, idx, lgc_pkg, ret = -EPERM; struct intel_uncore_topology *upi; unsigned int devfn; @@ -5614,8 +5614,13 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i for (i = 0; i < 8; i++) { if (nid != GIDNIDMAP(gid, i)) continue; + lgc_pkg = topology_phys_to_logical_pkg(i); + if (lgc_pkg < 0) { + ret = -EPERM; + goto err; + } for (idx = 0; idx < type->num_boxes; idx++) { - upi = &type->topology[nid][idx]; + upi = &type->topology[lgc_pkg][idx]; devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), ubox->bus->number, @@ -5626,6 +5631,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i goto err; } } + break; } } err: From fdd041028f2294228e10610b4fca6a1a83ac683d Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 27 Nov 2023 10:52:46 -0800 Subject: [PATCH 0211/1562] perf/x86/intel/uncore: Factor out topology_gidnid_map() The same code is used for retrieving package ID procedure from GIDNIDMAP register. Factor out topology_gidnid_map() to avoid code duplication. Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20231127185246.2371939-3-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 77 +++++++++++++++------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 1efbacfff47d..a96496bef678 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1396,6 +1396,29 @@ err: return ret; } +static int topology_gidnid_map(int nodeid, u32 gidnid) +{ + int i, die_id = -1; + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == GIDNIDMAP(gidnid, i)) { + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + if (die_id < 0) + die_id = -ENODEV; + break; + } + } + + return die_id; +} + /* * build pci bus to socket mapping */ @@ -1435,22 +1458,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool break; } - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == GIDNIDMAP(config, i)) { - if (topology_max_die_per_package() > 1) - die_id = i; - else - die_id = topology_phys_to_logical_pkg(i); - if (die_id < 0) - die_id = -ENODEV; - map->pbus_to_dieid[bus] = die_id; - break; - } - } + map->pbus_to_dieid[bus] = topology_gidnid_map(nodeid, config); raw_spin_unlock(&pci2phy_map_lock); } else { segment = pci_domain_nr(ubox_dev->bus); @@ -5596,7 +5604,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i struct pci_dev *ubox = NULL; struct pci_dev *dev = NULL; u32 nid, gid; - int i, idx, lgc_pkg, ret = -EPERM; + int idx, lgc_pkg, ret = -EPERM; struct intel_uncore_topology *upi; unsigned int devfn; @@ -5611,27 +5619,22 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i break; } - for (i = 0; i < 8; i++) { - if (nid != GIDNIDMAP(gid, i)) - continue; - lgc_pkg = topology_phys_to_logical_pkg(i); - if (lgc_pkg < 0) { - ret = -EPERM; - goto err; + lgc_pkg = topology_gidnid_map(nid, gid); + if (lgc_pkg < 0) { + ret = -EPERM; + goto err; + } + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[lgc_pkg][idx]; + devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), + ubox->bus->number, + devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + goto err; } - for (idx = 0; idx < type->num_boxes; idx++) { - upi = &type->topology[lgc_pkg][idx]; - devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); - dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), - ubox->bus->number, - devfn); - if (dev) { - ret = upi_fill_topology(dev, upi, idx); - if (ret) - goto err; - } - } - break; } } err: From 59730241647287c0ab64bf0bc7449308392b7ea4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Nov 2023 14:36:07 +0100 Subject: [PATCH 0212/1562] thermal: trip: Drop a redundant check from thermal_zone_set_trip() After recent changes in the thermal framework, a trip points array is required for registering a thermal zone that is not tripless, so the tz->trips pointer in thermal_zone_set_trip() is never NULL and the check involving it is redundant. Drop that check. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Acked-by: Daniel Lezcano --- drivers/thermal/thermal_trip.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index e42456442c68..e3dd583234dd 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -153,9 +153,6 @@ int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip t; int ret; - if (!tz->ops->set_trip_temp && !tz->ops->set_trip_hyst && !tz->trips) - return -EINVAL; - ret = __thermal_zone_get_trip(tz, trip_id, &t); if (ret) return ret; From e1c0b9ef26e5e46fd5c2df9e7f9686e786723f53 Mon Sep 17 00:00:00 2001 From: angquan yu Date: Tue, 28 Nov 2023 21:57:26 -0600 Subject: [PATCH 0213/1562] selftests:breakpoints: Fix Format String Warning in breakpoint_test This commit resolves a compiler warning regardingthe use of non-literal format strings in breakpoint_test.c. The functions `ksft_test_result_pass` and `ksft_test_result_fail` were previously called with a variable `msg` directly, which could potentially lead to format string vulnerabilities. Changes made: - Modified the calls to `ksft_test_result_pass` and `ksft_test_result_fail` by adding a "%s" format specifier. This explicitly declares `msg` as a string argument, adhering to safer coding practices and resolving the compiler warning. This change does not affect the functional behavior of the code but ensures better code safety and compliance with recommended C programming standards. The previous warning is "breakpoint_test.c:287:17: warning: format not a string literal and no format arguments [-Wformat-security] 287 | ksft_test_result_pass(msg); | ^~~~~~~~~~~~~~~~~~~~~ breakpoint_test.c:289:17: warning: format not a string literal and no format arguments [-Wformat-security] 289 | ksft_test_result_fail(msg); | " Signed-off-by: angquan yu Signed-off-by: Shuah Khan --- tools/testing/selftests/breakpoints/breakpoint_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c index 3266cc9293fe..d46962a24724 100644 --- a/tools/testing/selftests/breakpoints/breakpoint_test.c +++ b/tools/testing/selftests/breakpoints/breakpoint_test.c @@ -284,9 +284,9 @@ static void check_success(const char *msg) nr_tests++; if (ret) - ksft_test_result_pass(msg); + ksft_test_result_pass("%s", msg); else - ksft_test_result_fail(msg); + ksft_test_result_fail("%s", msg); } static void launch_instruction_breakpoints(char *buf, int local, int global) From 5e551899788b0731761052f21febdfef668e511f Mon Sep 17 00:00:00 2001 From: angquan yu Date: Tue, 28 Nov 2023 15:48:54 -0600 Subject: [PATCH 0214/1562] selftests/breakpoints: Fix format specifier in ksft_print_msg in step_after_suspend_test.c In the function 'tools/testing/selftests/breakpoints/run_test' within step_after_suspend_test.c, the ksft_print_msg function call incorrectly used '$s' as a format specifier. This commit corrects this typo to use the proper '%s' format specifier, ensuring the error message from waitpid() is correctly displayed. The issue manifested as a compilation warning (too many arguments for format [-Wformat-extra-args]), potentially obscuring actual runtime errors and complicating debugging processes. This fix enhances the clarity of error messages during test failures and ensures compliance with standard C format string conventions. Signed-off-by: angquan yu Signed-off-by: Shuah Khan --- tools/testing/selftests/breakpoints/step_after_suspend_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/breakpoints/step_after_suspend_test.c b/tools/testing/selftests/breakpoints/step_after_suspend_test.c index 2cf6f10ab7c4..b8703c499d28 100644 --- a/tools/testing/selftests/breakpoints/step_after_suspend_test.c +++ b/tools/testing/selftests/breakpoints/step_after_suspend_test.c @@ -89,7 +89,7 @@ int run_test(int cpu) wpid = waitpid(pid, &status, __WALL); if (wpid != pid) { - ksft_print_msg("waitpid() failed: $s\n", strerror(errno)); + ksft_print_msg("waitpid() failed: %s\n", strerror(errno)); return KSFT_FAIL; } if (WIFEXITED(status)) { From 9686e7f59b142095ac6ad0763312ec68fc34c51a Mon Sep 17 00:00:00 2001 From: angquan yu Date: Tue, 28 Nov 2023 21:36:15 -0600 Subject: [PATCH 0215/1562] selftests:x86: Fix Format String Warnings in lam.c This commit addresses compiler warnings in lam.c related to the usage of non-literal format strings without format arguments in the 'run_test' function. Warnings fixed: - Resolved warnings indicating that 'ksft_test_result_skip' and 'ksft_test_result' were called with 't->msg' as a format string without accompanying format arguments. Changes made: - Modified the calls to 'ksft_test_result_skip' and 'ksft_test_result' to explicitly include a format specifier ("%s") for 't->msg'. - This ensures that the string is safely treated as a format argument, adhering to safer coding practices and resolving the compiler warnings. Signed-off-by: angquan yu Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/lam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 8f9b06d9ce03..215b8150b7cc 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -817,7 +817,7 @@ static void run_test(struct testcases *test, int count) /* return 3 is not support LA57, the case should be skipped */ if (ret == 3) { - ksft_test_result_skip(t->msg); + ksft_test_result_skip("%s", t->msg); continue; } @@ -826,7 +826,7 @@ static void run_test(struct testcases *test, int count) else ret = !(t->expected); - ksft_test_result(ret, t->msg); + ksft_test_result(ret, "%s", t->msg); } } From 60e76e7ac088c5146d647cc5cc3f345b54489915 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Nov 2023 10:45:47 +0000 Subject: [PATCH 0216/1562] kselftest/vDSO: Make test name reporting for vdso_abi_test tooling friendly The test results from vdso_abi_test are all formatted using a series of macros: #define VDSO_TEST_PASS_MSG() "\n%s(): PASS\n", __func__ #define VDSO_TEST_FAIL_MSG(x) "\n%s(): %s FAIL\n", __func__, x #define VDSO_TEST_SKIP_MSG(x) "\n%s(): SKIP: Could not find %s\n", __func__, x which don't play nicely with automated KTAP parsers since the actual KTAP lines are in the form ok 1 with no test name and we get an additional log line such as vdso_test_gettimeofday(): PASS with no preceeding # as KTAP requires. The lack of a test name means that many automation systems will have a hard time distinguishing between the different tests or correlating results between runs, the lack of # is less severe but could potentially cause confusion. Fix these issues by rewriting all the result reporting to include both the vDSO function name being tested and (where there is one) the name of the clock being tested in the main KTAP line. Since we have tests both with and without a specific clock we abandon the helper macros and just put the format strings used directly in the ksft_ API calls. When we fail to look up the relevant vDSO symbol we add a separate print statement explaining why the skip is being done. This gives output such as: ok 1 __vdso_gettimeofday # clock_id: CLOCK_REALTIME # The time is 1700673118.58091596 ok 2 __vdso_clock_gettime CLOCK_REALTIME which is much easier for test automation to work with. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_abi.c | 66 +++++++++++--------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 883ca85424bc..b304abae6e8f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -33,9 +33,20 @@ typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -#define VDSO_TEST_PASS_MSG() "\n%s(): PASS\n", __func__ -#define VDSO_TEST_FAIL_MSG(x) "\n%s(): %s FAIL\n", __func__, x -#define VDSO_TEST_SKIP_MSG(x) "\n%s(): SKIP: Could not find %s\n", __func__, x +const char *vdso_clock_name[12] = { + "CLOCK_REALTIME", + "CLOCK_MONOTONIC", + "CLOCK_PROCESS_CPUTIME_ID", + "CLOCK_THREAD_CPUTIME_ID", + "CLOCK_MONOTONIC_RAW", + "CLOCK_REALTIME_COARSE", + "CLOCK_MONOTONIC_COARSE", + "CLOCK_BOOTTIME", + "CLOCK_REALTIME_ALARM", + "CLOCK_BOOTTIME_ALARM", + "CLOCK_SGI_CYCLE", + "CLOCK_TAI", +}; static void vdso_test_gettimeofday(void) { @@ -44,7 +55,8 @@ static void vdso_test_gettimeofday(void) (vdso_gettimeofday_t)vdso_sym(version, name[0]); if (!vdso_gettimeofday) { - ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[0])); + ksft_print_msg("Couldn't find %s\n", name[0]); + ksft_test_result_skip("%s\n", name[0]); return; } @@ -54,9 +66,9 @@ static void vdso_test_gettimeofday(void) if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", (long long)tv.tv_sec, (long long)tv.tv_usec); - ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + ksft_test_result_pass("%s\n", name[0]); } else { - ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[0])); + ksft_test_result_fail("%s\n", name[0]); } } @@ -67,7 +79,9 @@ static void vdso_test_clock_gettime(clockid_t clk_id) (vdso_clock_gettime_t)vdso_sym(version, name[1]); if (!vdso_clock_gettime) { - ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[1])); + ksft_print_msg("Couldn't find %s\n", name[1]); + ksft_test_result_skip("%s %s\n", name[1], + vdso_clock_name[clk_id]); return; } @@ -77,9 +91,11 @@ static void vdso_test_clock_gettime(clockid_t clk_id) if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", (long long)ts.tv_sec, (long long)ts.tv_nsec); - ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + ksft_test_result_pass("%s %s\n", name[1], + vdso_clock_name[clk_id]); } else { - ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[1])); + ksft_test_result_fail("%s %s\n", name[1], + vdso_clock_name[clk_id]); } } @@ -90,7 +106,8 @@ static void vdso_test_time(void) (vdso_time_t)vdso_sym(version, name[2]); if (!vdso_time) { - ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[2])); + ksft_print_msg("Couldn't find %s\n", name[2]); + ksft_test_result_skip("%s\n", name[2]); return; } @@ -99,9 +116,9 @@ static void vdso_test_time(void) if (ret > 0) { ksft_print_msg("The time in hours since January 1, 1970 is %lld\n", (long long)(ret / 3600)); - ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + ksft_test_result_pass("%s\n", name[2]); } else { - ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[2])); + ksft_test_result_fail("%s\n", name[2]); } } @@ -114,7 +131,9 @@ static void vdso_test_clock_getres(clockid_t clk_id) (vdso_clock_getres_t)vdso_sym(version, name[3]); if (!vdso_clock_getres) { - ksft_test_result_skip(VDSO_TEST_SKIP_MSG(name[3])); + ksft_print_msg("Couldn't find %s\n", name[3]); + ksft_test_result_skip("%s %s\n", name[3], + vdso_clock_name[clk_id]); return; } @@ -137,27 +156,14 @@ static void vdso_test_clock_getres(clockid_t clk_id) clock_getres_fail++; if (clock_getres_fail > 0) { - ksft_test_result_fail(VDSO_TEST_FAIL_MSG(name[3])); + ksft_test_result_fail("%s %s\n", name[3], + vdso_clock_name[clk_id]); } else { - ksft_test_result_pass(VDSO_TEST_PASS_MSG()); + ksft_test_result_pass("%s %s\n", name[3], + vdso_clock_name[clk_id]); } } -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", -}; - /* * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. From e63e1354125f923f1f5a393dd63c074427382e7e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Nov 2023 10:45:48 +0000 Subject: [PATCH 0217/1562] kselftest/vDSO: Fix message formatting for clock_id logging When logging the ID of the currently tested clock vdso_test_clock() puts a spurious newline at the start of the format string resulting in output such as # clock_id: CLOCK_BOOTTIME which is a valid but empty KTAP informational message followed by a non conferment output line. Remove the initial newline to create a more KTAP friendly # clock_id: CLOCK_BOOTTIME Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_abi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index b304abae6e8f..d0e247cca58a 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -170,7 +170,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) */ static inline void vdso_test_clock(clockid_t clock_id) { - ksft_print_msg("\nclock_id: %s\n", vdso_clock_name[clock_id]); + ksft_print_msg("clock_id: %s\n", vdso_clock_name[clock_id]); vdso_test_clock_gettime(clock_id); From 25cfe960a858dd345176253088a8e56538007c22 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Nov 2023 10:45:49 +0000 Subject: [PATCH 0218/1562] kselftest/vDSO: Use ksft_print_msg() rather than printf in vdso_test_abi There are a couple of raw printf() calls in vdso_test_abi which result in non KTAP conforment output such as [vDSO kselftest] VDSO_VERSION: LINUX_2.6 Convert them to use ksft_print_msg() so that they don't cause confusion for parsers. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_abi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index d0e247cca58a..96d32fd65b42 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -187,14 +187,14 @@ int main(int argc, char **argv) ksft_set_plan(VDSO_TEST_PLAN); if (!sysinfo_ehdr) { - printf("AT_SYSINFO_EHDR is not present!\n"); + ksft_print_msg("AT_SYSINFO_EHDR is not present!\n"); return KSFT_SKIP; } version = versions[VDSO_VERSION]; name = (const char **)&names[VDSO_NAMES]; - printf("[vDSO kselftest] VDSO_VERSION: %s\n", version); + ksft_print_msg("[vDSO kselftest] VDSO_VERSION: %s\n", version); vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); From d837813ff42ef86dac8596dd491bf6869ac7a0e8 Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Sun, 20 Aug 2023 19:13:54 +0500 Subject: [PATCH 0219/1562] selftests: prctl: Add prctl test for PR_GET_NAME This patch covers the testing of PR_GET_NAME by reading it's value from proc/self/task/pid/comm and matching it with the value returned by PR_GET_NAME. If the values are matched then it's successful, otherwise it fails. changes since v1: - Handled fscanf,fopen error checking. - Defined MAX_PATH_LEN. Signed-off-by: Osama Muhammad Signed-off-by: Shuah Khan --- .../selftests/prctl/set-process-name.c | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/prctl/set-process-name.c b/tools/testing/selftests/prctl/set-process-name.c index 3bc5e0e09eb9..562f707ba771 100644 --- a/tools/testing/selftests/prctl/set-process-name.c +++ b/tools/testing/selftests/prctl/set-process-name.c @@ -12,6 +12,7 @@ #define CHANGE_NAME "changename" #define EMPTY_NAME "" #define TASK_COMM_LEN 16 +#define MAX_PATH_LEN 50 int set_name(char *name) { @@ -47,6 +48,35 @@ int check_null_pointer(char *check_name) return res; } +int check_name(void) +{ + + int pid; + + pid = getpid(); + FILE *fptr = NULL; + char path[MAX_PATH_LEN] = {}; + char name[TASK_COMM_LEN] = {}; + char output[TASK_COMM_LEN] = {}; + int j; + + j = snprintf(path, MAX_PATH_LEN, "/proc/self/task/%d/comm", pid); + fptr = fopen(path, "r"); + if (!fptr) + return -EIO; + + fscanf(fptr, "%s", output); + if (ferror(fptr)) + return -EIO; + + int res = prctl(PR_GET_NAME, name, NULL, NULL, NULL); + + if (res < 0) + return -errno; + + return !strcmp(output, name); +} + TEST(rename_process) { EXPECT_GE(set_name(CHANGE_NAME), 0); @@ -57,6 +87,8 @@ TEST(rename_process) { EXPECT_GE(set_name(CHANGE_NAME), 0); EXPECT_LT(check_null_pointer(CHANGE_NAME), 0); + + EXPECT_TRUE(check_name()); } TEST_HARNESS_MAIN From 49360d978411eeaeffb96938edb53ee75ba471bc Mon Sep 17 00:00:00 2001 From: Swarup Laxman Kotiaklapudi Date: Sat, 11 Nov 2023 23:08:06 +0530 Subject: [PATCH 0220/1562] selftests: capabilities: namespace create varies for root and normal user This patchset fixes TODO: "If we're already root, we could skip creating the userns." Change namespace creation for root and non-root user differently in create_and_enter_ns() function in this file: tools/testing/selftests/capabilities/test_execve.c Test result with root user: $sudo make TARGETS="capabilities" kselftest ... TAP version 13 1..1 timeout set to 45 selftests: capabilities: test_execve TAP version 13 1..12 [RUN] +++ Tests with uid == 0 +++ [NOTE] Using global UIDs for tests [RUN] Root => ep ... ok 12 Passed Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0 ================================================== TAP version 13 1..9 [RUN] +++ Tests with uid != 0 +++ [NOTE] Using global UIDs for tests [RUN] Non-root => no caps ... ok 9 Passed Totals: pass:9 fail:0 xfail:0 xpass:0 skip:0 error:0 Test result without root or normal user: $make TARGETS="capabilities" kselftest ... timeout set to 45 selftests: capabilities: test_execve TAP version 13 1..12 [RUN] +++ Tests with uid == 0 +++ [NOTE] Using a user namespace for tests [RUN] Root => ep validate_cap:: Capabilities after execve were correct ok 1 Passed Check cap_ambient manipulation rules ok 2 PR_CAP_AMBIENT_RAISE failed on non-inheritable cap ok 3 PR_CAP_AMBIENT_RAISE failed on non-permitted cap ok 4 PR_CAP_AMBIENT_RAISE worked ok 5 Basic manipulation appears to work [RUN] Root +i => eip validate_cap:: Capabilities after execve were correct ok 6 Passed [RUN] UID 0 +ia => eipa validate_cap:: Capabilities after execve were correct ok 7 Passed ok 8 # SKIP SUID/SGID tests (needs privilege) Planned tests != run tests (12 != 8) Totals: pass:7 fail:0 xfail:0 xpass:0 skip:1 error:0 ================================================== TAP version 13 1..9 [RUN] +++ Tests with uid != 0 +++ [NOTE] Using a user namespace for tests [RUN] Non-root => no caps validate_cap:: Capabilities after execve were correct ok 1 Passed Check cap_ambient manipulation rules ok 2 PR_CAP_AMBIENT_RAISE failed on non-inheritable cap ok 3 PR_CAP_AMBIENT_RAISE failed on non-permitted cap ok 4 PR_CAP_AMBIENT_RAISE worked ok 5 Basic manipulation appears to work [RUN] Non-root +i => i validate_cap:: Capabilities after execve were correct ok 6 Passed [RUN] UID 1 +ia => eipa validate_cap:: Capabilities after execve were correct ok 7 Passed ok 8 # SKIP SUID/SGID tests (needs privilege) Planned tests != run tests (9 != 8) Totals: pass:7 fail:0 xfail:0 xpass:0 skip:1 error:0 Signed-off-by: Swarup Laxman Kotiaklapudi Signed-off-by: Shuah Khan --- tools/testing/selftests/capabilities/test_execve.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c index e3a352b020a7..7cde07a5df78 100644 --- a/tools/testing/selftests/capabilities/test_execve.c +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -88,11 +88,7 @@ static bool create_and_enter_ns(uid_t inner_uid) outer_uid = getuid(); outer_gid = getgid(); - /* - * TODO: If we're already root, we could skip creating the userns. - */ - - if (unshare(CLONE_NEWNS) == 0) { + if (outer_uid == 0 && unshare(CLONE_NEWNS) == 0) { ksft_print_msg("[NOTE]\tUsing global UIDs for tests\n"); if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) ksft_exit_fail_msg("PR_SET_KEEPCAPS - %s\n", From 130a83879954a9fed35cf4474d223b4fcfd479fa Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Mon, 6 Nov 2023 23:40:05 +0530 Subject: [PATCH 0221/1562] selftests: sched: Remove initialization to 0 for a static variable Fixes following checkpatch.pl issue: ERROR: do not initialise statics to 0 Signed-off-by: Atul Kumar Pant Signed-off-by: Shuah Khan --- tools/testing/selftests/sched/cs_prctl_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c index 3e1619b6bf2d..7ba057154343 100644 --- a/tools/testing/selftests/sched/cs_prctl_test.c +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -72,7 +72,7 @@ struct child_args { static struct child_args procs[MAX_PROCESSES]; static int num_processes = 2; -static int need_cleanup = 0; +static int need_cleanup; static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) From 7232522e6cafdf466ed7649c14546fd07ccc1978 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:18 +0200 Subject: [PATCH 0222/1562] fanotify: store fsid in mark instead of in connector Some filesystems like fuse and nfs have zero or non-unique fsid. We would like to avoid reporting ambiguous fsid in events, so we need to avoid marking objects with same fsid and different sb. To make this easier to enforce, store the fsid in the marks of the group instead of in the shared conenctor. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-2-amir73il@gmail.com> --- fs/notify/fanotify/fanotify.c | 19 +++-------- fs/notify/fanotify/fanotify.h | 10 ++++++ fs/notify/fanotify/fanotify_user.c | 18 ++++++++--- fs/notify/mark.c | 52 +++++------------------------- include/linux/fsnotify_backend.h | 13 +++----- 5 files changed, 42 insertions(+), 70 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9dac7f6e72d2..aff1ab3c32aa 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -838,9 +838,8 @@ out: } /* - * Get cached fsid of the filesystem containing the object from any connector. - * All connectors are supposed to have the same fsid, but we do not verify that - * here. + * Get cached fsid of the filesystem containing the object from any mark. + * All marks are supposed to have the same fsid, but we do not verify that here. */ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) { @@ -849,17 +848,9 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - struct fsnotify_mark_connector *conn; - - conn = READ_ONCE(mark->connector); - /* Mark is just getting destroyed or created? */ - if (!conn) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) - continue; - /* Pairs with smp_wmb() in fsnotify_add_mark_list() */ - smp_rmb(); - fsid = conn->fsid; + fsid = FANOTIFY_MARK(mark)->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -1068,7 +1059,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { - kmem_cache_free(fanotify_mark_cache, fsn_mark); + kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark)); } const struct fsnotify_ops fanotify_fsnotify_ops = { diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 6936671e148d..f3b9ef60f0c0 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -489,6 +489,16 @@ static inline unsigned int fanotify_event_hash_bucket( return event->hash & FANOTIFY_HTABLE_MASK; } +struct fanotify_mark { + struct fsnotify_mark fsn_mark; + __kernel_fsid_t fsid; +}; + +static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) +{ + return container_of(mark, struct fanotify_mark, fsn_mark); +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4d765c72496f..e3d836d4d156 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1199,6 +1199,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; + struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; @@ -1211,17 +1212,26 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) { + fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fan_mark) { ret = -ENOMEM; goto out_dec_ucounts; } + mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); + /* Cache fsid of filesystem containing the marked object */ + if (fsid) { + fan_mark->fsid = *fsid; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + } else { + fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; + } + + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); if (ret) { fsnotify_put_mark(mark); goto out_dec_ucounts; @@ -1935,7 +1945,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c74ef947447d..d6944ff86ffa 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -537,8 +537,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type, - __kernel_fsid_t *fsid) + unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -550,14 +549,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->flags = 0; conn->type = obj_type; conn->obj = connp; - /* Cache fsid of filesystem containing the object */ - if (fsid) { - conn->fsid = *fsid; - conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID; - } else { - conn->fsid.val[0] = conn->fsid.val[1] = 0; - conn->flags = 0; - } + conn->flags = 0; fsnotify_get_sb_connectors(conn); /* @@ -608,8 +600,7 @@ out: */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -619,41 +610,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; - /* Backend is expected to check for zero fsid (e.g. tmpfs) */ - if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) - return -ENODEV; - restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type, - fsid); + err = fsnotify_attach_connector_to_object(connp, obj_type); if (err) return err; goto restart; - } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) { - conn->fsid = *fsid; - /* Pairs with smp_rmb() in fanotify_get_fsid() */ - smp_wmb(); - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID; - } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) && - (fsid->val[0] != conn->fsid.val[0] || - fsid->val[1] != conn->fsid.val[1])) { - /* - * Backend is expected to check for non uniform fsid - * (e.g. btrfs), but maybe we missed something? - * Only allow setting conn->fsid once to non zero fsid. - * inotify and non-fid fanotify groups do not set nor test - * conn->fsid. - */ - pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " - "%x.%x != %x.%x\n", __func__, conn->type, - fsid->val[0], fsid->val[1], - conn->fsid.val[0], conn->fsid.val[1]); - err = -EXDEV; - goto out_err; } /* is mark the first mark? */ @@ -703,7 +668,7 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -723,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); if (ret) goto err; @@ -742,14 +707,13 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c0892d75ce33..a80b525ca653 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ -#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ - __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; @@ -530,6 +528,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 +#define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 unsigned int flags; /* flags [mark->lock] */ }; @@ -763,11 +762,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid); + int add_flags); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid); + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, @@ -775,15 +773,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, - NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } /* given a group and a mark, flag mark to be freed when all references are dropped */ From 30ad1938326bf9303ca38090339d948975a626f5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:19 +0200 Subject: [PATCH 0223/1562] fanotify: allow "weak" fsid when watching a single filesystem So far, fanotify returns -ENODEV or -EXDEV when trying to set a mark on a filesystem with a "weak" fsid, namely, zero fsid (e.g. fuse), or non-uniform fsid (e.g. btrfs non-root subvol). When group is watching inodes all from the same filesystem (or subvol), allow adding inode marks with "weak" fsid, because there is no ambiguity regarding which filesystem reports the event. The first mark added to a group determines if this group is single or multi filesystem, depending on the fsid at the path of the added mark. If the first mark added has a "strong" fsid, marks with "weak" fsid cannot be added and vice versa. If the first mark added has a "weak" fsid, following marks must have the same "weak" fsid and the same sb as the first mark. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-3-amir73il@gmail.com> --- fs/notify/fanotify/fanotify.c | 15 +--- fs/notify/fanotify/fanotify.h | 6 ++ fs/notify/fanotify/fanotify_user.c | 112 +++++++++++++++++++++++------ include/linux/fsnotify_backend.h | 1 + 4 files changed, 101 insertions(+), 33 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index aff1ab3c32aa..1e4def21811e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path) hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); } -static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, - __kernel_fsid_t *fsid2) -{ - return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; -} - static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) { return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ @@ -851,7 +845,8 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; fsid = FANOTIFY_MARK(mark)->fsid; - if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) && + WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; } @@ -933,12 +928,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, return 0; } - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { + if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) fsid = fanotify_get_fsid(iter_info); - /* Racing with mark destruction or creation? */ - if (!fsid.val[0] && !fsid.val[1]) - return 0; - } event = fanotify_alloc_event(group, mask, data, data_type, dir, file_name, &fsid, match_mask); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index f3b9ef60f0c0..e5ab33cae6a7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,12 @@ static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) return container_of(mark, struct fanotify_mark, fsn_mark); } +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e3d836d4d156..f83e7cc5ccf2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -23,7 +23,7 @@ #include -#include "../../mount.h" +#include "../fsnotify.h" #include "../fdinfo.h" #include "fanotify.h" @@ -1192,11 +1192,68 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return recalc; } +struct fan_fsid { + struct super_block *sb; + __kernel_fsid_t id; + bool weak; +}; + +static int fanotify_set_mark_fsid(struct fsnotify_group *group, + struct fsnotify_mark *mark, + struct fan_fsid *fsid) +{ + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *old; + struct super_block *old_sb = NULL; + + FANOTIFY_MARK(mark)->fsid = fsid->id; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + if (fsid->weak) + mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; + + /* First mark added will determine if group is single or multi fsid */ + if (list_empty(&group->marks_list)) + return 0; + + /* Find sb of an existing mark */ + list_for_each_entry(old, &group->marks_list, g_list) { + conn = READ_ONCE(old->connector); + if (!conn) + continue; + old_sb = fsnotify_connector_sb(conn); + if (old_sb) + break; + } + + /* Only detached marks left? */ + if (!old_sb) + return 0; + + /* Do not allow mixing of marks with weak and strong fsid */ + if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) + return -EXDEV; + + /* Allow mixing of marks with strong fsid from different fs */ + if (!fsid->weak) + return 0; + + /* Do not allow mixing marks with weak fsid from different fs */ + if (old_sb != fsid->sb) + return -EXDEV; + + /* Do not allow mixing marks from different btrfs sub-volumes */ + if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, + &FANOTIFY_MARK(mark)->fsid)) + return -EXDEV; + + return 0; +} + static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; struct fanotify_mark *fan_mark; @@ -1225,20 +1282,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, /* Cache fsid of filesystem containing the marked object */ if (fsid) { - fan_mark->fsid = *fsid; - mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + ret = fanotify_set_mark_fsid(group, mark, fsid); + if (ret) + goto out_put_mark; } else { fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); - if (ret) { - fsnotify_put_mark(mark); - goto out_dec_ucounts; - } + if (ret) + goto out_put_mark; return mark; +out_put_mark: + fsnotify_put_mark(mark); out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); @@ -1289,7 +1347,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; @@ -1337,7 +1395,7 @@ out: static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); @@ -1345,7 +1403,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, static int fanotify_add_sb_mark(struct fsnotify_group *group, struct super_block *sb, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); @@ -1353,7 +1411,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group, static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -1564,20 +1622,25 @@ out_destroy_group: return fd; } -static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, + struct fan_fsid *fsid) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(dentry, fsid); + err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; - if (!fsid->val[0] && !fsid->val[1]) - return -ENODEV; + fsid->sb = dentry->d_sb; + if (!fsid->id.val[0] && !fsid->id.val[1]) { + err = -ENODEV; + goto weak; + } /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) @@ -1587,11 +1650,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) if (err) return err; - if (root_fsid.val[0] != fsid->val[0] || - root_fsid.val[1] != fsid->val[1]) - return -EXDEV; + if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { + err = -EXDEV; + goto weak; + } + fsid->weak = false; return 0; + +weak: + /* Allow weak fsid when marking inodes */ + fsid->weak = true; + return (mark_type == FAN_MARK_INODE) ? 0 : err; } /* Check if filesystem can encode a unique fid */ @@ -1675,7 +1745,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - __kernel_fsid_t __fsid, *fsid = NULL; + struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; @@ -1827,7 +1897,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fsid(path.dentry, &__fsid); + ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a80b525ca653..7f63be5ca0f1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -529,6 +529,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 +#define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; From 7b91eb6000104c450ebd7af5771ac305691b6fee Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 30 Nov 2023 16:52:03 -0800 Subject: [PATCH 0224/1562] cgroup: Fix documentation for cpu.idle Two problems: - cpu.idle cgroups show up with 0 weight, correct the documentation to indicate this. - cpu.idle has no entry describing it. Signed-off-by: Josh Don Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 30f6ff2eba47..09e65312d20c 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1093,7 +1093,11 @@ All time durations are in microseconds. A read-write single value file which exists on non-root cgroups. The default is "100". - The weight in the range [1, 10000]. + For non idle groups (cpu.idle = 0), the weight is in the + range [1, 10000]. + + If the cgroup has been configured to be SCHED_IDLE (cpu.idle = 1), + then the weight will show as a 0. cpu.weight.nice A read-write single value file which exists on non-root @@ -1157,6 +1161,16 @@ All time durations are in microseconds. values similar to the sched_setattr(2). This maximum utilization value is used to clamp the task specific maximum utilization clamp. + cpu.idle + A read-write single value file which exists on non-root cgroups. + The default is 0. + + This is the cgroup analog of the per-task SCHED_IDLE sched policy. + Setting this value to a 1 will make the scheduling policy of the + cgroup SCHED_IDLE. The threads inside the cgroup will retain their + own relative priorities, but the cgroup itself will be treated as + very low priority relative to its peers. + Memory From d499fd418fa15949d86d28bb5442ab88203fc513 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Nov 2023 15:43:26 -0500 Subject: [PATCH 0225/1562] cgroup/rstat: Optimize cgroup_rstat_updated_list() The current design of cgroup_rstat_cpu_pop_updated() is to traverse the updated tree in a way to pop out the leaf nodes first before their parents. This can cause traversal of multiple nodes before a leaf node can be found and popped out. IOW, a given node in the tree can be visited multiple times before the whole operation is done. So it is not very efficient and the code can be hard to read. With the introduction of cgroup_rstat_updated_list() to build a list of cgroups to be flushed first before any flushing operation is being done, we can optimize the way the updated tree nodes are being popped by pushing the parents first to the tail end of the list before their children. In this way, most updated tree nodes will be visited only once with the exception of the subtree root as we still need to go back to its parent and popped it out of its updated_children list. This also makes the code easier to read. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 161 +++++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 66 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1f300bf4dc40..4ec29e6b1d8d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -74,82 +74,81 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) } /** - * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree - * @pos: current position - * @root: root of the tree to traversal + * cgroup_rstat_push_children - push children cgroups into the given list + * @head: current head of the list (= subtree root) + * @child: first child of the root * @cpu: target cpu + * Return: A new singly linked list of cgroups to be flush * - * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_rstat_cpu_lock held. - * - * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. + * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * level and push all the parents first before their next level children + * into a singly linked list built from the tail backward like "pushing" + * cgroups into a stack. The root is pushed by the caller. */ -static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) +static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, + struct cgroup *child, int cpu) { - struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; + struct cgroup *chead = child; /* Head of child cgroup level */ + struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ + struct cgroup *parent, *grandchild; + struct cgroup_rstat_cpu *crstatc; - if (pos == root) - return NULL; + child->rstat_flush_next = NULL; - /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. - */ - if (!pos) { - pos = root; - /* return NULL if this subtree is not on-list */ - if (!cgroup_rstat_cpu(pos, cpu)->updated_next) - return NULL; - } else { - pos = cgroup_parent(pos); - } +next_level: + while (chead) { + child = chead; + chead = child->rstat_flush_next; + parent = cgroup_parent(child); - /* walk down to the first leaf */ - while (true) { - rstatc = cgroup_rstat_cpu(pos, cpu); - if (rstatc->updated_children == pos) - break; - pos = rstatc->updated_children; - } - - /* - * Unlink @pos from the tree. As the updated_children list is - * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. - */ - parent = cgroup_parent(pos); - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; - - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (*nextp != pos) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; + /* updated_next is parent cgroup terminated */ + while (child != parent) { + child->rstat_flush_next = head; + head = child; + crstatc = cgroup_rstat_cpu(child, cpu); + grandchild = crstatc->updated_children; + if (grandchild != child) { + /* Push the grand child to the next level */ + crstatc->updated_children = child; + grandchild->rstat_flush_next = ghead; + ghead = grandchild; + } + child = crstatc->updated_next; + crstatc->updated_next = NULL; } - *nextp = rstatc->updated_next; } - rstatc->updated_next = NULL; - return pos; + if (ghead) { + chead = ghead; + ghead = NULL; + goto next_level; + } + return head; } -/* Return a list of updated cgroups to be flushed */ +/** + * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed + * @root: root of the cgroup subtree to traverse + * @cpu: target cpu + * Return: A singly linked list of cgroups to be flushed + * + * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, + * each returned cgroup is unlinked from the updated tree. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, the child is before its parent in + * the list. + * + * Note that updated_children is self terminated and points to a list of + * child cgroups if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent cgroup. An exception + * here is the cgroup root whose updated_next can be self terminated. + */ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup *head, *tail, *next; + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); + struct cgroup *head = NULL, *parent, *child; unsigned long flags; /* @@ -161,12 +160,42 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) * that interrupts are always disabled and later restored. */ raw_spin_lock_irqsave(cpu_lock, flags); - head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu); - while (tail) { - next = cgroup_rstat_cpu_pop_updated(tail, root, cpu); - tail->rstat_flush_next = next; - tail = next; + + /* Return NULL if this subtree is not on-list */ + if (!rstatc->updated_next) + goto unlock_ret; + + /* + * Unlink @root from its parent. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + */ + parent = cgroup_parent(root); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; + + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != root) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } + + rstatc->updated_next = NULL; + + /* Push @root to the list first before pushing the children */ + head = root; + root->rstat_flush_next = NULL; + child = rstatc->updated_children; + rstatc->updated_children = root; + if (child != root) + head = cgroup_rstat_push_children(head, child, cpu); +unlock_ret: raw_spin_unlock_irqrestore(cpu_lock, flags); return head; } From 77070eeb882124614a40616f01bfe60947be5778 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Nov 2023 15:43:27 -0500 Subject: [PATCH 0226/1562] cgroup: Avoid false cacheline sharing of read mostly rstat_cpu The rstat_cpu and also rstat_css_list of the cgroup structure are read mostly variables. However, they may share the same cacheline as the subsequent rstat_flush_next and *bstat variables which can be updated frequently. That will slow down the cgroup_rstat_cpu() call which is called pretty frequently in the rstat code. Add a CACHELINE_PADDING() line in between them to avoid false cacheline sharing. A parallel kernel build on a 2-socket x86-64 server is used as the benchmarking tool for measuring the lock hold time. Below were the lock hold time frequency distribution before and after the patch: Run time Before patch After patch -------- ------------ ----------- 0-01 us 9,928,562 9,820,428 01-05 us 110,151 50,935 05-10 us 270 93 10-15 us 273 146 15-20 us 135 76 20-25 us 0 2 25-30 us 1 0 It can be seen that the patch further pushes the lock hold time towards the lower end. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 37518436cfe7..5a97ea95b564 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,13 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * Add padding to separate the read mostly rstat_cpu and + * rstat_css_list into a different cacheline from the following + * rstat_flush_next and *bstat fields which can have frequent updates. + */ + CACHELINE_PADDING(_pad_); + /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by From 6bb5153dfbaf88fa4b40ef50b706d2fb186ed92e Mon Sep 17 00:00:00 2001 From: SungHwan Jung Date: Fri, 24 Nov 2023 18:41:18 +0900 Subject: [PATCH 0227/1562] platform/x86: acer-wmi: Add platform profile and mode key support for Predator PHN16-71 The Acer Predator PHN16-71 has the mode key that is used to rotate thermal modes or toggle turbo mode with predator sense app (ver. 4) on windows. This patch includes platform profile and the mode key support for the device and also includes a small fix for "WMI_gaming_execute_u64" function. Signed-off-by: SungHwan Jung Link: https://lore.kernel.org/r/20231124094122.100707-2-onenowy@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 1 + drivers/platform/x86/acer-wmi.c | 271 +++++++++++++++++++++++++++++++- 2 files changed, 267 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 7e69fdaccdd5..dbde7e018dc1 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -181,6 +181,7 @@ config ACER_WMI select INPUT_SPARSEKMAP select LEDS_CLASS select NEW_LEDS + select ACPI_PLATFORM_PROFILE help This is a driver for newer Acer (and Wistron) laptops. It adds wireless radio and bluetooth control, and on some laptops, diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 0e472aa9bf41..55fedc2e656e 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -62,9 +63,12 @@ MODULE_LICENSE("GPL"); #define ACER_WMID_SET_GAMING_LED_METHODID 2 #define ACER_WMID_GET_GAMING_LED_METHODID 4 +#define ACER_WMID_GET_GAMING_SYS_INFO_METHODID 5 #define ACER_WMID_SET_GAMING_FAN_BEHAVIOR 14 #define ACER_WMID_SET_GAMING_MISC_SETTING_METHODID 22 +#define ACER_PREDATOR_V4_THERMAL_PROFILE_EC_OFFSET 0x54 + /* * Acer ACPI method GUIDs */ @@ -90,6 +94,10 @@ enum acer_wmi_event_ids { WMID_GAMING_TURBO_KEY_EVENT = 0x7, }; +enum acer_wmi_predator_v4_sys_info_command { + ACER_WMID_CMD_GET_PREDATOR_V4_BAT_STATUS = 0x02, +}; + static const struct key_entry acer_wmi_keymap[] __initconst = { {KE_KEY, 0x01, {KEY_WLAN} }, /* WiFi */ {KE_KEY, 0x03, {KEY_WLAN} }, /* WiFi */ @@ -229,9 +237,10 @@ struct hotkey_function_type_aa { #define ACER_CAP_THREEG BIT(4) #define ACER_CAP_SET_FUNCTION_MODE BIT(5) #define ACER_CAP_KBD_DOCK BIT(6) -#define ACER_CAP_TURBO_OC BIT(7) -#define ACER_CAP_TURBO_LED BIT(8) -#define ACER_CAP_TURBO_FAN BIT(9) +#define ACER_CAP_TURBO_OC BIT(7) +#define ACER_CAP_TURBO_LED BIT(8) +#define ACER_CAP_TURBO_FAN BIT(9) +#define ACER_CAP_PLATFORM_PROFILE BIT(10) /* * Interface type flags @@ -259,6 +268,7 @@ static bool ec_raw_mode; static bool has_type_aa; static u16 commun_func_bitmap; static u8 commun_fn_key_number; +static bool cycle_gaming_thermal_profile = true; module_param(mailled, int, 0444); module_param(brightness, int, 0444); @@ -266,12 +276,15 @@ module_param(threeg, int, 0444); module_param(force_series, int, 0444); module_param(force_caps, int, 0444); module_param(ec_raw_mode, bool, 0444); +module_param(cycle_gaming_thermal_profile, bool, 0644); MODULE_PARM_DESC(mailled, "Set initial state of Mail LED"); MODULE_PARM_DESC(brightness, "Set initial LCD backlight brightness"); MODULE_PARM_DESC(threeg, "Set initial state of 3G hardware"); MODULE_PARM_DESC(force_series, "Force a different laptop series"); MODULE_PARM_DESC(force_caps, "Force the capability bitmask to this value"); MODULE_PARM_DESC(ec_raw_mode, "Enable EC raw mode"); +MODULE_PARM_DESC(cycle_gaming_thermal_profile, + "Set thermal mode key in cycle mode. Disabling it sets the mode key in turbo toggle mode"); struct acer_data { int mailled; @@ -321,6 +334,7 @@ struct quirk_entry { u8 turbo; u8 cpu_fans; u8 gpu_fans; + u8 predator_v4; }; static struct quirk_entry *quirks; @@ -336,6 +350,9 @@ static void __init set_quirks(void) if (quirks->turbo) interface->capability |= ACER_CAP_TURBO_OC | ACER_CAP_TURBO_LED | ACER_CAP_TURBO_FAN; + + if (quirks->predator_v4) + interface->capability |= ACER_CAP_PLATFORM_PROFILE; } static int __init dmi_matched(const struct dmi_system_id *dmi) @@ -370,6 +387,10 @@ static struct quirk_entry quirk_acer_predator_ph315_53 = { .gpu_fans = 1, }; +static struct quirk_entry quirk_acer_predator_v4 = { + .predator_v4 = 1, +}; + /* This AMW0 laptop has no bluetooth */ static struct quirk_entry quirk_medion_md_98300 = { .wireless = 1, @@ -546,6 +567,15 @@ static const struct dmi_system_id acer_quirks[] __initconst = { }, .driver_data = &quirk_acer_predator_ph315_53, }, + { + .callback = dmi_matched, + .ident = "Acer Predator PHN16-71", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Predator PHN16-71"), + }, + .driver_data = &quirk_acer_predator_v4, + }, { .callback = set_force_caps, .ident = "Acer Aspire Switch 10E SW3-016", @@ -659,6 +689,31 @@ static const struct dmi_system_id non_acer_quirks[] __initconst = { {} }; +static struct platform_profile_handler platform_profile_handler; +static bool platform_profile_support; + +/* + * The profile used before turbo mode. This variable is needed for + * returning from turbo mode when the mode key is in toggle mode. + */ +static int last_non_turbo_profile; + +enum acer_predator_v4_thermal_profile_ec { + ACER_PREDATOR_V4_THERMAL_PROFILE_ECO = 0x04, + ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO = 0x03, + ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE = 0x02, + ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET = 0x01, + ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED = 0x00, +}; + +enum acer_predator_v4_thermal_profile_wmi { + ACER_PREDATOR_V4_THERMAL_PROFILE_ECO_WMI = 0x060B, + ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI = 0x050B, + ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE_WMI = 0x040B, + ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET_WMI = 0x0B, + ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI = 0x010B, +}; + /* Find which quirks are needed for a particular vendor/ model pair */ static void __init find_quirks(void) { @@ -1339,7 +1394,7 @@ WMI_gaming_execute_u64(u32 method_id, u64 in, u64 *out) struct acpi_buffer input = { (acpi_size) sizeof(u64), (void *)(&in) }; struct acpi_buffer result = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; - u32 tmp = 0; + u64 tmp = 0; acpi_status status; status = wmi_evaluate_method(WMID_GUID4, 0, method_id, &input, &result); @@ -1698,6 +1753,199 @@ static int acer_toggle_turbo(void) return turbo_led_state; } +static int +acer_predator_v4_platform_profile_get(struct platform_profile_handler *pprof, + enum platform_profile_option *profile) +{ + u8 tp; + int err; + + err = ec_read(ACER_PREDATOR_V4_THERMAL_PROFILE_EC_OFFSET, &tp); + + if (err < 0) + return err; + + switch (tp) { + case ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO: + *profile = PLATFORM_PROFILE_PERFORMANCE; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE: + *profile = PLATFORM_PROFILE_BALANCED_PERFORMANCE; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED: + *profile = PLATFORM_PROFILE_BALANCED; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET: + *profile = PLATFORM_PROFILE_QUIET; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_ECO: + *profile = PLATFORM_PROFILE_LOW_POWER; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int +acer_predator_v4_platform_profile_set(struct platform_profile_handler *pprof, + enum platform_profile_option profile) +{ + int tp; + acpi_status status; + + switch (profile) { + case PLATFORM_PROFILE_PERFORMANCE: + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI; + break; + case PLATFORM_PROFILE_BALANCED_PERFORMANCE: + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE_WMI; + break; + case PLATFORM_PROFILE_BALANCED: + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + break; + case PLATFORM_PROFILE_QUIET: + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET_WMI; + break; + case PLATFORM_PROFILE_LOW_POWER: + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_ECO_WMI; + break; + default: + return -EOPNOTSUPP; + } + + status = WMI_gaming_execute_u64( + ACER_WMID_SET_GAMING_MISC_SETTING_METHODID, tp, NULL); + + if (ACPI_FAILURE(status)) + return -EIO; + + if (tp != ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI) + last_non_turbo_profile = tp; + + return 0; +} + +static int acer_platform_profile_setup(void) +{ + if (quirks->predator_v4) { + int err; + + platform_profile_handler.profile_get = + acer_predator_v4_platform_profile_get; + platform_profile_handler.profile_set = + acer_predator_v4_platform_profile_set; + + set_bit(PLATFORM_PROFILE_PERFORMANCE, + platform_profile_handler.choices); + set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE, + platform_profile_handler.choices); + set_bit(PLATFORM_PROFILE_BALANCED, + platform_profile_handler.choices); + set_bit(PLATFORM_PROFILE_QUIET, + platform_profile_handler.choices); + set_bit(PLATFORM_PROFILE_LOW_POWER, + platform_profile_handler.choices); + + err = platform_profile_register(&platform_profile_handler); + if (err) + return err; + + platform_profile_support = true; + + /* Set default non-turbo profile */ + last_non_turbo_profile = + ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + } + return 0; +} + +static int acer_thermal_profile_change(void) +{ + /* + * This mode key can rotate each mode or toggle turbo mode. + * On battery, only ECO and BALANCED mode are available. + */ + if (quirks->predator_v4) { + u8 current_tp; + int tp, err; + u64 on_AC; + acpi_status status; + + err = ec_read(ACER_PREDATOR_V4_THERMAL_PROFILE_EC_OFFSET, + ¤t_tp); + + if (err < 0) + return err; + + /* Check power source */ + status = WMI_gaming_execute_u64( + ACER_WMID_GET_GAMING_SYS_INFO_METHODID, + ACER_WMID_CMD_GET_PREDATOR_V4_BAT_STATUS, &on_AC); + + if (ACPI_FAILURE(status)) + return -EIO; + + switch (current_tp) { + case ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO: + if (!on_AC) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + else if (cycle_gaming_thermal_profile) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_ECO_WMI; + else + tp = last_non_turbo_profile; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE: + if (!on_AC) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + else + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED: + if (!on_AC) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_ECO_WMI; + else if (cycle_gaming_thermal_profile) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE_WMI; + else + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET: + if (!on_AC) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + else if (cycle_gaming_thermal_profile) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + else + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI; + break; + case ACER_PREDATOR_V4_THERMAL_PROFILE_ECO: + if (!on_AC) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED_WMI; + else if (cycle_gaming_thermal_profile) + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET_WMI; + else + tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI; + break; + default: + return -EOPNOTSUPP; + } + + status = WMI_gaming_execute_u64( + ACER_WMID_SET_GAMING_MISC_SETTING_METHODID, tp, NULL); + + if (ACPI_FAILURE(status)) + return -EIO; + + /* Store non-turbo profile for turbo mode toggle*/ + if (tp != ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO_WMI) + last_non_turbo_profile = tp; + + platform_profile_notify(); + } + + return 0; +} + /* * Switch series keyboard dock status */ @@ -1997,6 +2245,8 @@ static void acer_wmi_notify(u32 value, void *context) case WMID_GAMING_TURBO_KEY_EVENT: if (return_value.key_num == 0x4) acer_toggle_turbo(); + if (return_value.key_num == 0x5 && has_cap(ACER_CAP_PLATFORM_PROFILE)) + acer_thermal_profile_change(); break; default: pr_warn("Unknown function number - %d - %d\n", @@ -2245,8 +2495,16 @@ static int acer_platform_probe(struct platform_device *device) if (err) goto error_rfkill; - return err; + if (has_cap(ACER_CAP_PLATFORM_PROFILE)) { + err = acer_platform_profile_setup(); + if (err) + goto error_platform_profile; + } + return 0; + +error_platform_profile: + acer_rfkill_exit(); error_rfkill: if (has_cap(ACER_CAP_BRIGHTNESS)) acer_backlight_exit(); @@ -2265,6 +2523,9 @@ static void acer_platform_remove(struct platform_device *device) acer_backlight_exit(); acer_rfkill_exit(); + + if (platform_profile_support) + platform_profile_remove(); } #ifdef CONFIG_PM_SLEEP From c0ff2c397e84795816816ae8a32fd1104156d0f6 Mon Sep 17 00:00:00 2001 From: SungHwan Jung Date: Fri, 24 Nov 2023 18:41:20 +0900 Subject: [PATCH 0228/1562] platform/x86: acer-wmi: Depend on ACPI_VIDEO instead of selecting it "select ACPI_VIDEO" cause recursive dependency when "depends on HWMON" is added: drivers/hwmon/Kconfig:6:error: recursive dependency detected! drivers/hwmon/Kconfig:6: symbol HWMON is selected by EEEPC_LAPTOP drivers/platform/x86/Kconfig:326: symbol EEEPC_LAPTOP depends on ACPI_VIDEO drivers/acpi/Kconfig:208: symbol ACPI_VIDEO is selected by ACER_WMI drivers/platform/x86/Kconfig:173: symbol ACER_WMI depends on HWMON Replace the select with depends on to avoid this problem when the next patch in this series adds "depends on HWMON". There is a stub defined for the used acpi_video_get_backlight_type() function when ACPI_VIDEO is not set, so use: depends on ACPI_VIDEO || ACPI_VIDEO = n Signed-off-by: SungHwan Jung Link: https://lore.kernel.org/r/20231124094122.100707-4-onenowy@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index dbde7e018dc1..14059335f19d 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -177,7 +177,7 @@ config ACER_WMI depends on INPUT depends on RFKILL || RFKILL = n depends on ACPI_WMI - select ACPI_VIDEO + depends on ACPI_VIDEO || ACPI_VIDEO = n select INPUT_SPARSEKMAP select LEDS_CLASS select NEW_LEDS From 446dd8efa94ca80a8b91fbe907364001ed1b3d85 Mon Sep 17 00:00:00 2001 From: SungHwan Jung Date: Fri, 24 Nov 2023 18:41:19 +0900 Subject: [PATCH 0229/1562] platform/x86: acer-wmi: add fan speed monitoring for Predator PHN16-71 Support CPU and GPU fan speed monitoring through WMI for Predator PHN16-71. Signed-off-by: SungHwan Jung Link: https://lore.kernel.org/r/20231124094122.100707-3-onenowy@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 1 + drivers/platform/x86/acer-wmi.c | 108 +++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 14059335f19d..7ddb8cc3d56f 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -178,6 +178,7 @@ config ACER_WMI depends on RFKILL || RFKILL = n depends on ACPI_WMI depends on ACPI_VIDEO || ACPI_VIDEO = n + depends on HWMON select INPUT_SPARSEKMAP select LEDS_CLASS select NEW_LEDS diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 55fedc2e656e..88b826e88ebd 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include MODULE_AUTHOR("Carlos Corbacho"); MODULE_DESCRIPTION("Acer Laptop WMI Extras Driver"); @@ -69,6 +71,8 @@ MODULE_LICENSE("GPL"); #define ACER_PREDATOR_V4_THERMAL_PROFILE_EC_OFFSET 0x54 +#define ACER_PREDATOR_V4_FAN_SPEED_READ_BIT_MASK GENMASK(20, 8) + /* * Acer ACPI method GUIDs */ @@ -96,6 +100,8 @@ enum acer_wmi_event_ids { enum acer_wmi_predator_v4_sys_info_command { ACER_WMID_CMD_GET_PREDATOR_V4_BAT_STATUS = 0x02, + ACER_WMID_CMD_GET_PREDATOR_V4_CPU_FAN_SPEED = 0x0201, + ACER_WMID_CMD_GET_PREDATOR_V4_GPU_FAN_SPEED = 0x0601, }; static const struct key_entry acer_wmi_keymap[] __initconst = { @@ -241,6 +247,7 @@ struct hotkey_function_type_aa { #define ACER_CAP_TURBO_LED BIT(8) #define ACER_CAP_TURBO_FAN BIT(9) #define ACER_CAP_PLATFORM_PROFILE BIT(10) +#define ACER_CAP_FAN_SPEED_READ BIT(11) /* * Interface type flags @@ -352,7 +359,8 @@ static void __init set_quirks(void) | ACER_CAP_TURBO_FAN; if (quirks->predator_v4) - interface->capability |= ACER_CAP_PLATFORM_PROFILE; + interface->capability |= ACER_CAP_PLATFORM_PROFILE | + ACER_CAP_FAN_SPEED_READ; } static int __init dmi_matched(const struct dmi_system_id *dmi) @@ -1718,6 +1726,26 @@ static int acer_gsensor_event(void) return 0; } +static int acer_get_fan_speed(int fan) +{ + if (quirks->predator_v4) { + acpi_status status; + u64 fanspeed; + + status = WMI_gaming_execute_u64( + ACER_WMID_GET_GAMING_SYS_INFO_METHODID, + fan == 0 ? ACER_WMID_CMD_GET_PREDATOR_V4_CPU_FAN_SPEED : + ACER_WMID_CMD_GET_PREDATOR_V4_GPU_FAN_SPEED, + &fanspeed); + + if (ACPI_FAILURE(status)) + return -EIO; + + return FIELD_GET(ACER_PREDATOR_V4_FAN_SPEED_READ_BIT_MASK, fanspeed); + } + return -EOPNOTSUPP; +} + /* * Predator series turbo button */ @@ -2472,6 +2500,8 @@ static u32 get_wmid_devices(void) return devices; } +static int acer_wmi_hwmon_init(void); + /* * Platform device */ @@ -2501,8 +2531,17 @@ static int acer_platform_probe(struct platform_device *device) goto error_platform_profile; } + if (has_cap(ACER_CAP_FAN_SPEED_READ)) { + err = acer_wmi_hwmon_init(); + if (err) + goto error_hwmon; + } + return 0; +error_hwmon: + if (platform_profile_support) + platform_profile_remove(); error_platform_profile: acer_rfkill_exit(); error_rfkill: @@ -2612,6 +2651,73 @@ static void __init create_debugfs(void) &interface->debug.wmid_devices); } +static umode_t acer_wmi_hwmon_is_visible(const void *data, + enum hwmon_sensor_types type, u32 attr, + int channel) +{ + switch (type) { + case hwmon_fan: + if (acer_get_fan_speed(channel) >= 0) + return 0444; + break; + default: + return 0; + } + + return 0; +} + +static int acer_wmi_hwmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + int ret; + + switch (type) { + case hwmon_fan: + ret = acer_get_fan_speed(channel); + if (ret < 0) + return ret; + *val = ret; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct hwmon_channel_info *const acer_wmi_hwmon_info[] = { + HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT, HWMON_F_INPUT), NULL +}; + +static const struct hwmon_ops acer_wmi_hwmon_ops = { + .read = acer_wmi_hwmon_read, + .is_visible = acer_wmi_hwmon_is_visible, +}; + +static const struct hwmon_chip_info acer_wmi_hwmon_chip_info = { + .ops = &acer_wmi_hwmon_ops, + .info = acer_wmi_hwmon_info, +}; + +static int acer_wmi_hwmon_init(void) +{ + struct device *dev = &acer_platform_device->dev; + struct device *hwmon; + + hwmon = devm_hwmon_device_register_with_info(dev, "acer", + &acer_platform_driver, + &acer_wmi_hwmon_chip_info, + NULL); + + if (IS_ERR(hwmon)) { + dev_err(dev, "Could not register acer hwmon device\n"); + return PTR_ERR(hwmon); + } + + return 0; +} + static int __init acer_wmi_init(void) { int err; From 3799b5d2323dffde5e2ba60b5bfbd8e6d4bea28e Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Mon, 16 Oct 2023 16:13:49 -0300 Subject: [PATCH 0230/1562] platform/x86: asus-laptop: remove redundant braces in if statements Adhere to Linux kernel coding style. Reported by checkpatch: WARNING: braces {} are not necessary for single statement blocks Signed-off-by: Edson Juliano Drosdeck Link: https://lore.kernel.org/r/20231016191349.3856-1-edson.drosdeck@gmail.com Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-laptop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c index 761029f39314..bf03ea1b1274 100644 --- a/drivers/platform/x86/asus-laptop.c +++ b/drivers/platform/x86/asus-laptop.c @@ -1816,9 +1816,8 @@ static void asus_dmi_check(void) return; /* On L1400B WLED control the sound card, don't mess with it ... */ - if (strncmp(model, "L1400B", 6) == 0) { + if (strncmp(model, "L1400B", 6) == 0) wlan_status = -1; - } } static bool asus_device_present; From 578dc962ff2000ba4bf52d50717aea0819615634 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 24 Nov 2023 17:24:35 -0800 Subject: [PATCH 0231/1562] mtd: rawnand: Add destructive operation Erase and program operations need the write protect (wp) pin to be de-asserted to take effect. Add the concept of destructive operation and pass the information to exec_op() so controllers know when they should de-assert this pin without having to decode the command opcode. Signed-off-by: Boris Brezillon Signed-off-by: David Regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-1-dregan@broadcom.com --- drivers/mtd/nand/raw/nand_base.c | 6 ++++-- include/linux/mtd/rawnand.h | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 9e24bedffd89..7dd9be5d58c4 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -1493,7 +1493,8 @@ static int nand_exec_prog_page_op(struct nand_chip *chip, unsigned int page, NAND_COMMON_TIMING_NS(conf, tWB_max)), NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tPROG_max), 0), }; - struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs); + struct nand_operation op = NAND_DESTRUCTIVE_OPERATION(chip->cur_cs, + instrs); int naddrs = nand_fill_column_cycles(chip, addrs, offset_in_page); if (naddrs < 0) @@ -1916,7 +1917,8 @@ int nand_erase_op(struct nand_chip *chip, unsigned int eraseblock) NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tBERS_max), 0), }; - struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs); + struct nand_operation op = NAND_DESTRUCTIVE_OPERATION(chip->cur_cs, + instrs); if (chip->options & NAND_ROW_ADDR_3) instrs[1].ctx.addr.naddrs++; diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index c29ace15a053..bd02aba5e6e3 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1003,6 +1003,8 @@ struct nand_op_parser { /** * struct nand_operation - NAND operation descriptor * @cs: the CS line to select for this NAND operation + * @deassert_wp: set to true when the operation requires the WP pin to be + * de-asserted (ERASE, PROG, ...) * @instrs: array of instructions to execute * @ninstrs: length of the @instrs array * @@ -1010,6 +1012,7 @@ struct nand_op_parser { */ struct nand_operation { unsigned int cs; + bool deassert_wp; const struct nand_op_instr *instrs; unsigned int ninstrs; }; @@ -1021,6 +1024,14 @@ struct nand_operation { .ninstrs = ARRAY_SIZE(_instrs), \ } +#define NAND_DESTRUCTIVE_OPERATION(_cs, _instrs) \ + { \ + .cs = _cs, \ + .deassert_wp = true, \ + .instrs = _instrs, \ + .ninstrs = ARRAY_SIZE(_instrs), \ + } + int nand_op_parser_exec_op(struct nand_chip *chip, const struct nand_op_parser *parser, const struct nand_operation *op, bool check_only); From 68cce21e3cc5fea8d955a62394454149270c98bc Mon Sep 17 00:00:00 2001 From: David Regan Date: Fri, 24 Nov 2023 17:24:36 -0800 Subject: [PATCH 0232/1562] mtd: rawnand: NAND controller write protect Allow NAND controller to be responsible for write protect pin handling during fast path and exec_op destructive operation when controller_wp flag is set. Signed-off-by: David Regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-2-dregan@broadcom.com --- drivers/mtd/nand/raw/nand_base.c | 4 ++++ include/linux/mtd/rawnand.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 7dd9be5d58c4..0f342cd691a3 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -366,6 +366,10 @@ static int nand_check_wp(struct nand_chip *chip) if (chip->options & NAND_BROKEN_XD) return 0; + /* controller responsible for NAND write protect */ + if (chip->controller->controller_wp) + return 0; + /* Check the WP bit */ ret = nand_status_op(chip, &status); if (ret) diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index bd02aba5e6e3..a17f795070d8 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1115,6 +1115,7 @@ struct nand_controller_ops { * the bus without restarting an entire read operation nor * changing the column. * @supported_op.cont_read: The controller supports sequential cache reads. + * @controller_wp: the controller is in charge of handling the WP pin. */ struct nand_controller { struct mutex lock; @@ -1123,6 +1124,7 @@ struct nand_controller { unsigned int data_only_read: 1; unsigned int cont_read: 1; } supported_op; + bool controller_wp; }; static inline void nand_controller_init(struct nand_controller *nfc) From c86b63b82fde4f96ee94dde827a5f28ff5adeb57 Mon Sep 17 00:00:00 2001 From: David Regan Date: Fri, 24 Nov 2023 17:24:37 -0800 Subject: [PATCH 0233/1562] mtd: rawnand: brcmnand: pass host struct to bcmnand_ctrl_poll_status Pass host struct to bcmnand_ctrl_poll_status instead of ctrl struct since real time status requires host, and ctrl is a member of host. Real time status is required for low level commands vs cached status since the NAND controller will not do an automatic status read at the end of a low level command as it would with a high level command. Signed-off-by: David Regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-3-dregan@broadcom.com --- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 30fc399f346e..4867b4dad5e4 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -1061,10 +1061,11 @@ enum { CS_SELECT_AUTO_DEVICE_ID_CFG = BIT(30), }; -static int bcmnand_ctrl_poll_status(struct brcmnand_controller *ctrl, +static int bcmnand_ctrl_poll_status(struct brcmnand_host *host, u32 mask, u32 expected_val, unsigned long timeout_ms) { + struct brcmnand_controller *ctrl = host->ctrl; unsigned long limit; u32 val; @@ -1379,7 +1380,7 @@ static void brcmnand_wp(struct mtd_info *mtd, int wp) * make sure ctrl/flash ready before and after * changing state of #WP pin */ - ret = bcmnand_ctrl_poll_status(ctrl, NAND_CTRL_RDY | + ret = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY | NAND_STATUS_READY, NAND_CTRL_RDY | NAND_STATUS_READY, 0); @@ -1389,7 +1390,7 @@ static void brcmnand_wp(struct mtd_info *mtd, int wp) brcmnand_set_wp(ctrl, wp); nand_status_op(chip, NULL); /* NAND_STATUS_WP 0x00 = protected, 0x80 = not protected */ - ret = bcmnand_ctrl_poll_status(ctrl, + ret = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY | NAND_STATUS_READY | NAND_STATUS_WP, @@ -1629,13 +1630,13 @@ static void brcmnand_send_cmd(struct brcmnand_host *host, int cmd) */ if (oops_in_progress) { if (ctrl->cmd_pending && - bcmnand_ctrl_poll_status(ctrl, NAND_CTRL_RDY, NAND_CTRL_RDY, 0)) + bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY, NAND_CTRL_RDY, 0)) return; } else BUG_ON(ctrl->cmd_pending != 0); ctrl->cmd_pending = cmd; - ret = bcmnand_ctrl_poll_status(ctrl, NAND_CTRL_RDY, NAND_CTRL_RDY, 0); + ret = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY, NAND_CTRL_RDY, 0); WARN_ON(ret); mb(); /* flush previous writes */ @@ -1664,7 +1665,7 @@ static bool brcmstb_nand_wait_for_completion(struct nand_chip *chip) if (mtd->oops_panic_write || ctrl->irq < 0) { /* switch to interrupt polling and PIO mode */ disable_ctrl_irqs(ctrl); - sts = bcmnand_ctrl_poll_status(ctrl, NAND_CTRL_RDY, + sts = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY, NAND_CTRL_RDY, 0); err = sts < 0; } else { From 3c8260ce76634291aed877032a41e373884d69e4 Mon Sep 17 00:00:00 2001 From: David Regan Date: Fri, 24 Nov 2023 17:24:38 -0800 Subject: [PATCH 0234/1562] mtd: rawnand: brcmnand: exec_op implementation exec_op implementation for Broadcom STB, Broadband and iProc SoC This adds exec_op and removes the legacy interface. Based on changes proposed by Boris Brezillon. Link: https://github.com/bbrezillon/linux/commit/4ec6f8d8d83f5aaca5d1877f02d48da96d41fcba Link: https://github.com/bbrezillon/linux/commit/11b4acffd761c4928652d7028d19fcd6f45e4696 Signed-off-by: David Regan [Miquel Raynal: Misc style fixes] Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-4-dregan@broadcom.com --- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 391 ++++++++++------------- 1 file changed, 168 insertions(+), 223 deletions(-) diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 4867b4dad5e4..8faca43ae1ff 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -625,6 +625,8 @@ enum { /* Only for v7.2 */ #define ACC_CONTROL_ECC_EXT_SHIFT 13 +static u8 brcmnand_status(struct brcmnand_host *host); + static inline bool brcmnand_non_mmio_ops(struct brcmnand_controller *ctrl) { #if IS_ENABLED(CONFIG_MTD_NAND_BRCMNAND_BCMA) @@ -1022,19 +1024,6 @@ static inline int brcmnand_sector_1k_shift(struct brcmnand_controller *ctrl) return -1; } -static int brcmnand_get_sector_size_1k(struct brcmnand_host *host) -{ - struct brcmnand_controller *ctrl = host->ctrl; - int shift = brcmnand_sector_1k_shift(ctrl); - u16 acc_control_offs = brcmnand_cs_offset(ctrl, host->cs, - BRCMNAND_CS_ACC_CONTROL); - - if (shift < 0) - return 0; - - return (nand_readreg(ctrl, acc_control_offs) >> shift) & 0x1; -} - static void brcmnand_set_sector_size_1k(struct brcmnand_host *host, int val) { struct brcmnand_controller *ctrl = host->ctrl; @@ -1074,6 +1063,9 @@ static int bcmnand_ctrl_poll_status(struct brcmnand_host *host, limit = jiffies + msecs_to_jiffies(timeout_ms); do { + if (mask & INTFC_FLASH_STATUS) + brcmnand_status(host); + val = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS); if ((val & mask) == expected_val) return 0; @@ -1085,6 +1077,9 @@ static int bcmnand_ctrl_poll_status(struct brcmnand_host *host, * do a final check after time out in case the CPU was busy and the driver * did not get enough time to perform the polling to avoid false alarms */ + if (mask & INTFC_FLASH_STATUS) + brcmnand_status(host); + val = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS); if ((val & mask) == expected_val) return 0; @@ -1388,7 +1383,8 @@ static void brcmnand_wp(struct mtd_info *mtd, int wp) return; brcmnand_set_wp(ctrl, wp); - nand_status_op(chip, NULL); + /* force controller operation to update internal copy of NAND chip status */ + brcmnand_status(host); /* NAND_STATUS_WP 0x00 = protected, 0x80 = not protected */ ret = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY | @@ -1644,16 +1640,6 @@ static void brcmnand_send_cmd(struct brcmnand_host *host, int cmd) cmd << brcmnand_cmd_shift(ctrl)); } -/*********************************************************************** - * NAND MTD API: read/program/erase - ***********************************************************************/ - -static void brcmnand_cmd_ctrl(struct nand_chip *chip, int dat, - unsigned int ctrl) -{ - /* intentionally left blank */ -} - static bool brcmstb_nand_wait_for_completion(struct nand_chip *chip) { struct brcmnand_host *host = nand_get_controller_data(chip); @@ -1704,6 +1690,26 @@ static int brcmnand_waitfunc(struct nand_chip *chip) INTFC_FLASH_STATUS; } +static u8 brcmnand_status(struct brcmnand_host *host) +{ + struct nand_chip *chip = &host->chip; + struct mtd_info *mtd = nand_to_mtd(chip); + + brcmnand_set_cmd_addr(mtd, 0); + brcmnand_send_cmd(host, CMD_STATUS_READ); + + return brcmnand_waitfunc(chip); +} + +static u8 brcmnand_reset(struct brcmnand_host *host) +{ + struct nand_chip *chip = &host->chip; + + brcmnand_send_cmd(host, CMD_FLASH_RESET); + + return brcmnand_waitfunc(chip); +} + enum { LLOP_RE = BIT(16), LLOP_WE = BIT(17), @@ -1753,190 +1759,6 @@ static int brcmnand_low_level_op(struct brcmnand_host *host, return brcmnand_waitfunc(chip); } -static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command, - int column, int page_addr) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct brcmnand_host *host = nand_get_controller_data(chip); - struct brcmnand_controller *ctrl = host->ctrl; - u64 addr = (u64)page_addr << chip->page_shift; - int native_cmd = 0; - - if (command == NAND_CMD_READID || command == NAND_CMD_PARAM || - command == NAND_CMD_RNDOUT) - addr = (u64)column; - /* Avoid propagating a negative, don't-care address */ - else if (page_addr < 0) - addr = 0; - - dev_dbg(ctrl->dev, "cmd 0x%x addr 0x%llx\n", command, - (unsigned long long)addr); - - host->last_cmd = command; - host->last_byte = 0; - host->last_addr = addr; - - switch (command) { - case NAND_CMD_RESET: - native_cmd = CMD_FLASH_RESET; - break; - case NAND_CMD_STATUS: - native_cmd = CMD_STATUS_READ; - break; - case NAND_CMD_READID: - native_cmd = CMD_DEVICE_ID_READ; - break; - case NAND_CMD_READOOB: - native_cmd = CMD_SPARE_AREA_READ; - break; - case NAND_CMD_ERASE1: - native_cmd = CMD_BLOCK_ERASE; - brcmnand_wp(mtd, 0); - break; - case NAND_CMD_PARAM: - native_cmd = CMD_PARAMETER_READ; - break; - case NAND_CMD_SET_FEATURES: - case NAND_CMD_GET_FEATURES: - brcmnand_low_level_op(host, LL_OP_CMD, command, false); - brcmnand_low_level_op(host, LL_OP_ADDR, column, false); - break; - case NAND_CMD_RNDOUT: - native_cmd = CMD_PARAMETER_CHANGE_COL; - addr &= ~((u64)(FC_BYTES - 1)); - /* - * HW quirk: PARAMETER_CHANGE_COL requires SECTOR_SIZE_1K=0 - * NB: hwcfg.sector_size_1k may not be initialized yet - */ - if (brcmnand_get_sector_size_1k(host)) { - host->hwcfg.sector_size_1k = - brcmnand_get_sector_size_1k(host); - brcmnand_set_sector_size_1k(host, 0); - } - break; - } - - if (!native_cmd) - return; - - brcmnand_set_cmd_addr(mtd, addr); - brcmnand_send_cmd(host, native_cmd); - brcmnand_waitfunc(chip); - - if (native_cmd == CMD_PARAMETER_READ || - native_cmd == CMD_PARAMETER_CHANGE_COL) { - /* Copy flash cache word-wise */ - u32 *flash_cache = (u32 *)ctrl->flash_cache; - int i; - - brcmnand_soc_data_bus_prepare(ctrl->soc, true); - - /* - * Must cache the FLASH_CACHE now, since changes in - * SECTOR_SIZE_1K may invalidate it - */ - for (i = 0; i < FC_WORDS; i++) - /* - * Flash cache is big endian for parameter pages, at - * least on STB SoCs - */ - flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i)); - - brcmnand_soc_data_bus_unprepare(ctrl->soc, true); - - /* Cleanup from HW quirk: restore SECTOR_SIZE_1K */ - if (host->hwcfg.sector_size_1k) - brcmnand_set_sector_size_1k(host, - host->hwcfg.sector_size_1k); - } - - /* Re-enable protection is necessary only after erase */ - if (command == NAND_CMD_ERASE1) - brcmnand_wp(mtd, 1); -} - -static uint8_t brcmnand_read_byte(struct nand_chip *chip) -{ - struct brcmnand_host *host = nand_get_controller_data(chip); - struct brcmnand_controller *ctrl = host->ctrl; - uint8_t ret = 0; - int addr, offs; - - switch (host->last_cmd) { - case NAND_CMD_READID: - if (host->last_byte < 4) - ret = brcmnand_read_reg(ctrl, BRCMNAND_ID) >> - (24 - (host->last_byte << 3)); - else if (host->last_byte < 8) - ret = brcmnand_read_reg(ctrl, BRCMNAND_ID_EXT) >> - (56 - (host->last_byte << 3)); - break; - - case NAND_CMD_READOOB: - ret = oob_reg_read(ctrl, host->last_byte); - break; - - case NAND_CMD_STATUS: - ret = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS) & - INTFC_FLASH_STATUS; - if (wp_on) /* hide WP status */ - ret |= NAND_STATUS_WP; - break; - - case NAND_CMD_PARAM: - case NAND_CMD_RNDOUT: - addr = host->last_addr + host->last_byte; - offs = addr & (FC_BYTES - 1); - - /* At FC_BYTES boundary, switch to next column */ - if (host->last_byte > 0 && offs == 0) - nand_change_read_column_op(chip, addr, NULL, 0, false); - - ret = ctrl->flash_cache[offs]; - break; - case NAND_CMD_GET_FEATURES: - if (host->last_byte >= ONFI_SUBFEATURE_PARAM_LEN) { - ret = 0; - } else { - bool last = host->last_byte == - ONFI_SUBFEATURE_PARAM_LEN - 1; - brcmnand_low_level_op(host, LL_OP_RD, 0, last); - ret = brcmnand_read_reg(ctrl, BRCMNAND_LL_RDATA) & 0xff; - } - } - - dev_dbg(ctrl->dev, "read byte = 0x%02x\n", ret); - host->last_byte++; - - return ret; -} - -static void brcmnand_read_buf(struct nand_chip *chip, uint8_t *buf, int len) -{ - int i; - - for (i = 0; i < len; i++, buf++) - *buf = brcmnand_read_byte(chip); -} - -static void brcmnand_write_buf(struct nand_chip *chip, const uint8_t *buf, - int len) -{ - int i; - struct brcmnand_host *host = nand_get_controller_data(chip); - - switch (host->last_cmd) { - case NAND_CMD_SET_FEATURES: - for (i = 0; i < len; i++) - brcmnand_low_level_op(host, LL_OP_WR, buf[i], - (i + 1) == len); - break; - default: - BUG(); - break; - } -} - /* * Kick EDU engine */ @@ -2346,8 +2168,9 @@ static int brcmnand_read_page(struct nand_chip *chip, uint8_t *buf, struct mtd_info *mtd = nand_to_mtd(chip); struct brcmnand_host *host = nand_get_controller_data(chip); u8 *oob = oob_required ? (u8 *)chip->oob_poi : NULL; + u64 addr = (u64)page << chip->page_shift; - nand_read_page_op(chip, page, 0, NULL, 0); + host->last_addr = addr; return brcmnand_read(mtd, chip, host->last_addr, mtd->writesize >> FC_SHIFT, (u32 *)buf, oob); @@ -2360,8 +2183,9 @@ static int brcmnand_read_page_raw(struct nand_chip *chip, uint8_t *buf, struct mtd_info *mtd = nand_to_mtd(chip); u8 *oob = oob_required ? (u8 *)chip->oob_poi : NULL; int ret; + u64 addr = (u64)page << chip->page_shift; - nand_read_page_op(chip, page, 0, NULL, 0); + host->last_addr = addr; brcmnand_set_ecc_enabled(host, 0); ret = brcmnand_read(mtd, chip, host->last_addr, @@ -2469,11 +2293,11 @@ static int brcmnand_write_page(struct nand_chip *chip, const uint8_t *buf, struct mtd_info *mtd = nand_to_mtd(chip); struct brcmnand_host *host = nand_get_controller_data(chip); void *oob = oob_required ? chip->oob_poi : NULL; + u64 addr = (u64)page << chip->page_shift; - nand_prog_page_begin_op(chip, page, 0, NULL, 0); - brcmnand_write(mtd, chip, host->last_addr, (const u32 *)buf, oob); + host->last_addr = addr; - return nand_prog_page_end_op(chip); + return brcmnand_write(mtd, chip, host->last_addr, (const u32 *)buf, oob); } static int brcmnand_write_page_raw(struct nand_chip *chip, const uint8_t *buf, @@ -2482,13 +2306,15 @@ static int brcmnand_write_page_raw(struct nand_chip *chip, const uint8_t *buf, struct mtd_info *mtd = nand_to_mtd(chip); struct brcmnand_host *host = nand_get_controller_data(chip); void *oob = oob_required ? chip->oob_poi : NULL; + u64 addr = (u64)page << chip->page_shift; + int ret = 0; - nand_prog_page_begin_op(chip, page, 0, NULL, 0); + host->last_addr = addr; brcmnand_set_ecc_enabled(host, 0); - brcmnand_write(mtd, chip, host->last_addr, (const u32 *)buf, oob); + ret = brcmnand_write(mtd, chip, host->last_addr, (const u32 *)buf, oob); brcmnand_set_ecc_enabled(host, 1); - return nand_prog_page_end_op(chip); + return ret; } static int brcmnand_write_oob(struct nand_chip *chip, int page) @@ -2512,6 +2338,130 @@ static int brcmnand_write_oob_raw(struct nand_chip *chip, int page) return ret; } +static int brcmnand_exec_instr(struct brcmnand_host *host, int i, + const struct nand_operation *op) +{ + const struct nand_op_instr *instr = &op->instrs[i]; + struct brcmnand_controller *ctrl = host->ctrl; + const u8 *out; + bool last_op; + int ret = 0; + u8 *in; + + /* + * The controller needs to be aware of the last command in the operation + * (WAITRDY excepted). + */ + last_op = ((i == (op->ninstrs - 1)) && (instr->type != NAND_OP_WAITRDY_INSTR)) || + ((i == (op->ninstrs - 2)) && (op->instrs[i+1].type == NAND_OP_WAITRDY_INSTR)); + + switch (instr->type) { + case NAND_OP_CMD_INSTR: + brcmnand_low_level_op(host, LL_OP_CMD, instr->ctx.cmd.opcode, last_op); + break; + + case NAND_OP_ADDR_INSTR: + for (i = 0; i < instr->ctx.addr.naddrs; i++) + brcmnand_low_level_op(host, LL_OP_ADDR, instr->ctx.addr.addrs[i], + last_op && (i == (instr->ctx.addr.naddrs - 1))); + break; + + case NAND_OP_DATA_IN_INSTR: + in = instr->ctx.data.buf.in; + for (i = 0; i < instr->ctx.data.len; i++) { + brcmnand_low_level_op(host, LL_OP_RD, 0, + last_op && (i == (instr->ctx.data.len - 1))); + in[i] = brcmnand_read_reg(host->ctrl, BRCMNAND_LL_RDATA); + } + break; + + case NAND_OP_DATA_OUT_INSTR: + out = instr->ctx.data.buf.out; + for (i = 0; i < instr->ctx.data.len; i++) + brcmnand_low_level_op(host, LL_OP_WR, out[i], + last_op && (i == (instr->ctx.data.len - 1))); + break; + + case NAND_OP_WAITRDY_INSTR: + ret = bcmnand_ctrl_poll_status(host, NAND_CTRL_RDY, NAND_CTRL_RDY, 0); + break; + + default: + dev_err(ctrl->dev, "unsupported instruction type: %d\n", + instr->type); + ret = -EINVAL; + break; + } + + return ret; +} + +static int brcmnand_op_is_status(const struct nand_operation *op) +{ + if ((op->ninstrs == 2) && + (op->instrs[0].type == NAND_OP_CMD_INSTR) && + (op->instrs[0].ctx.cmd.opcode == NAND_CMD_STATUS) && + (op->instrs[1].type == NAND_OP_DATA_IN_INSTR)) + return 1; + + return 0; +} + +static int brcmnand_op_is_reset(const struct nand_operation *op) +{ + if ((op->ninstrs == 2) && + (op->instrs[0].type == NAND_OP_CMD_INSTR) && + (op->instrs[0].ctx.cmd.opcode == NAND_CMD_RESET) && + (op->instrs[1].type == NAND_OP_WAITRDY_INSTR)) + return 1; + + return 0; +} + +static int brcmnand_exec_op(struct nand_chip *chip, + const struct nand_operation *op, + bool check_only) +{ + struct brcmnand_host *host = nand_get_controller_data(chip); + struct mtd_info *mtd = nand_to_mtd(chip); + u8 *status; + unsigned int i; + int ret = 0; + + if (check_only) + return 0; + + if (brcmnand_op_is_status(op)) { + status = op->instrs[1].ctx.data.buf.in; + *status = brcmnand_status(host); + + return 0; + } + else if (brcmnand_op_is_reset(op)) { + ret = brcmnand_reset(host); + if (ret < 0) + return ret; + + brcmnand_wp(mtd, 1); + + return 0; + } + + if (op->deassert_wp) + brcmnand_wp(mtd, 0); + + for (i = 0; i < op->ninstrs; i++) { + ret = brcmnand_exec_instr(host, i, op); + if (ret) + break; + } + + if (op->deassert_wp) + brcmnand_wp(mtd, 1); + + return ret; +} + /*********************************************************************** * Per-CS setup (1 NAND device) ***********************************************************************/ @@ -2822,6 +2772,7 @@ static int brcmnand_attach_chip(struct nand_chip *chip) static const struct nand_controller_ops brcmnand_controller_ops = { .attach_chip = brcmnand_attach_chip, + .exec_op = brcmnand_exec_op, }; static int brcmnand_init_cs(struct brcmnand_host *host, @@ -2846,13 +2797,6 @@ static int brcmnand_init_cs(struct brcmnand_host *host, mtd->owner = THIS_MODULE; mtd->dev.parent = dev; - chip->legacy.cmd_ctrl = brcmnand_cmd_ctrl; - chip->legacy.cmdfunc = brcmnand_cmdfunc; - chip->legacy.waitfunc = brcmnand_waitfunc; - chip->legacy.read_byte = brcmnand_read_byte; - chip->legacy.read_buf = brcmnand_read_buf; - chip->legacy.write_buf = brcmnand_write_buf; - chip->ecc.engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST; chip->ecc.read_page = brcmnand_read_page; chip->ecc.write_page = brcmnand_write_page; @@ -2864,6 +2808,7 @@ static int brcmnand_init_cs(struct brcmnand_host *host, chip->ecc.write_oob = brcmnand_write_oob; chip->controller = &ctrl->controller; + ctrl->controller.controller_wp = 1; /* * The bootloader might have configured 16bit mode but From 5cb475174cce1bfedf1025b6e235e2c43d81144f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 29 Nov 2023 16:11:47 +0800 Subject: [PATCH 0235/1562] spi: cadence-quadspi: add missing clk_disable_unprepare() in cqspi_probe() cqspi_jh7110_clk_init() is called after clk_prepare_enable(cqspi->clk), if it fails, it should goto label 'probe_reset_failed' to disable cqspi->clk. In the error path after calling cqspi_jh7110_clk_init(), cqspi_jh7110_disable_clk() need be called. Fixes: 33f1ef6d4eb6 ("spi: cadence-quadspi: Add clock configuration for StarFive JH7110 QSPI") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20231129081147.628004-1-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 3d7bf62da11c..f94e0d370d46 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1840,7 +1840,7 @@ static int cqspi_probe(struct platform_device *pdev) if (ddata->jh7110_clk_init) { ret = cqspi_jh7110_clk_init(pdev, cqspi); if (ret) - goto probe_clk_failed; + goto probe_reset_failed; } if (of_device_is_compatible(pdev->dev.of_node, @@ -1901,6 +1901,8 @@ static int cqspi_probe(struct platform_device *pdev) probe_setup_failed: cqspi_controller_enable(cqspi, 0); probe_reset_failed: + if (cqspi->is_jh7110) + cqspi_jh7110_disable_clk(pdev, cqspi); clk_disable_unprepare(cqspi->clk); probe_clk_failed: return ret; From d9cd21d441c8c7c22ef448d23f1d6f5fa698b7f0 Mon Sep 17 00:00:00 2001 From: Henry Shi Date: Fri, 24 Nov 2023 15:03:34 -0500 Subject: [PATCH 0236/1562] platform/x86: Add Silicom Platform Driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Silicom platform (silicom-platform) Linux driver for Swisscom Business Box (Swisscom BB) as well as Cordoba family products. This platform driver provides support for various functions via the Linux LED framework, GPIO framework, Hardware Monitoring (HWMON) and device attributes. Signed-off-by: Henry Shi Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231124200334.5318-1-henryshi2018@gmail.com Signed-off-by: Hans de Goede --- .../ABI/testing/sysfs-platform-silicom | 29 + drivers/platform/x86/Kconfig | 15 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/silicom-platform.c | 1004 +++++++++++++++++ 4 files changed, 1051 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-platform-silicom create mode 100644 drivers/platform/x86/silicom-platform.c diff --git a/Documentation/ABI/testing/sysfs-platform-silicom b/Documentation/ABI/testing/sysfs-platform-silicom new file mode 100644 index 000000000000..2288b3665d16 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-silicom @@ -0,0 +1,29 @@ +What: /sys/devices/platform/silicom-platform/uc_version +Date: November 2023 +KernelVersion: 6.7 +Contact: Henry Shi +Description: + This file allows to read microcontroller firmware + version of current platform. + +What: /sys/devices/platform/silicom-platform/power_cycle +Date: November 2023 +KernelVersion: 6.7 +Contact: Henry Shi + This file allow user to power cycle the platform. + Default value is 0; when set to 1, it powers down + the platform, waits 5 seconds, then powers on the + device. It returns to default value after power cycle. + + 0 - default value. + +What: /sys/devices/platform/silicom-platform/efuse_status +Date: November 2023 +KernelVersion: 6.7 +Contact: Henry Shi +Description: + This file is read only. It returns the current + OTP status: + + 0 - not programmed. + 1 - programmed. diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 7ddb8cc3d56f..5c6dbbf0235c 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1089,6 +1089,21 @@ config INTEL_SCU_IPC_UTIL source "drivers/platform/x86/siemens/Kconfig" +config SILICOM_PLATFORM + tristate "Silicom Edge Networking device support" + depends on HWMON + depends on GPIOLIB + depends on LEDS_CLASS_MULTICOLOR + help + This option enables support for the LEDs/GPIO/etc downstream of the + embedded controller on Silicom "Cordoba" hardware and derivatives. + + This platform driver provides support for various functions via + the Linux LED framework, GPIO framework, Hardware Monitoring (HWMON) + and device attributes. + + If you have a Silicom network appliance, say Y or M here. + config WINMATE_FM07_KEYS tristate "Winmate FM07/FM07P front-panel keys driver" depends on INPUT diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index c7a18e95ad8c..1de432e8861e 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -136,6 +136,9 @@ obj-$(CONFIG_X86_INTEL_LPSS) += pmc_atom.o # Siemens Simatic Industrial PCs obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += siemens/ +# Silicom +obj-$(CONFIG_SILICOM_PLATFORM) += silicom-platform.o + # Winmate obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o diff --git a/drivers/platform/x86/silicom-platform.c b/drivers/platform/x86/silicom-platform.c new file mode 100644 index 000000000000..84b92b3f9f4b --- /dev/null +++ b/drivers/platform/x86/silicom-platform.c @@ -0,0 +1,1004 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// silicom-platform.c - Silicom MEC170x platform driver +// +// Copyright (C) 2023 Henry Shi +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MEC_POWER_CYCLE_ADDR 0x24 +#define MEC_EFUSE_LSB_ADDR 0x28 +#define MEC_GPIO_IN_POS 0x08 +#define MEC_IO_BASE 0x0800 +#define MEC_IO_LEN 0x8 +#define IO_REG_BANK 0x0 +#define DEFAULT_CHAN_LO 0 +#define DEFAULT_CHAN_HI 0 +#define DEFAULT_CHAN_LO_T 0xc +#define MEC_ADDR (MEC_IO_BASE + 0x02) +#define EC_ADDR_LSB MEC_ADDR +#define SILICOM_MEC_MAGIC 0x5a + +#define MEC_PORT_CHANNEL_MASK GENMASK(2, 0) +#define MEC_PORT_DWORD_OFFSET GENMASK(31, 3) +#define MEC_DATA_OFFSET_MASK GENMASK(1, 0) +#define MEC_PORT_OFFSET_MASK GENMASK(7, 2) + +#define MEC_TEMP_LOC GENMASK(31, 16) +#define MEC_VERSION_LOC GENMASK(15, 8) +#define MEC_VERSION_MAJOR GENMASK(15, 14) +#define MEC_VERSION_MINOR GENMASK(13, 8) + +#define EC_ADDR_MSB (MEC_IO_BASE + 0x3) +#define MEC_DATA_OFFSET(offset) (MEC_IO_BASE + 0x04 + (offset)) + +#define OFFSET_BIT_TO_CHANNEL(off, bit) ((((off) + 0x014) << 3) | (bit)) +#define CHANNEL_TO_OFFSET(chan) (((chan) >> 3) - 0x14) + +static DEFINE_MUTEX(mec_io_mutex); +static unsigned int efuse_status; +static unsigned int mec_uc_version; +static unsigned int power_cycle; + +static const struct hwmon_channel_info *silicom_fan_control_info[] = { + HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL), + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_LABEL), + NULL +}; + +struct silicom_platform_info { + int io_base; + int io_len; + struct led_classdev_mc *led_info; + struct gpio_chip *gpiochip; + u8 *gpio_channels; + u16 ngpio; +}; + +static const char * const plat_0222_gpio_names[] = { + "AUTOM0_SFP_TX_FAULT", + "SLOT2_LED_OUT", + "SIM_M2_SLOT2_B_DET", + "SIM_M2_SLOT2_A_DET", + "SLOT1_LED_OUT", + "SIM_M2_SLOT1_B_DET", + "SIM_M2_SLOT1_A_DET", + "SLOT0_LED_OUT", + "WAN_SFP0_RX_LOS", + "WAN_SFP0_PRSNT_N", + "WAN_SFP0_TX_FAULT", + "AUTOM1_SFP_RX_LOS", + "AUTOM1_SFP_PRSNT_N", + "AUTOM1_SFP_TX_FAULT", + "AUTOM0_SFP_RX_LOS", + "AUTOM0_SFP_PRSNT_N", + "WAN_SFP1_RX_LOS", + "WAN_SFP1_PRSNT_N", + "WAN_SFP1_TX_FAULT", + "SIM_M2_SLOT1_MUX_SEL", + "W_DISABLE_M2_SLOT1_N", + "W_DISABLE_MPCIE_SLOT0_N", + "W_DISABLE_M2_SLOT0_N", + "BT_COMMAND_MODE", + "WAN_SFP1_TX_DISABLE", + "WAN_SFP0_TX_DISABLE", + "AUTOM1_SFP_TX_DISABLE", + "AUTOM0_SFP_TX_DISABLE", + "SIM_M2_SLOT2_MUX_SEL", + "W_DISABLE_M2_SLOT2_N", + "RST_CTL_M2_SLOT_1_N", + "RST_CTL_M2_SLOT_2_N", + "PM_USB_PWR_EN_BOT", + "PM_USB_PWR_EN_TOP", +}; + +static u8 plat_0222_gpio_channels[] = { + OFFSET_BIT_TO_CHANNEL(0x00, 0), + OFFSET_BIT_TO_CHANNEL(0x00, 1), + OFFSET_BIT_TO_CHANNEL(0x00, 2), + OFFSET_BIT_TO_CHANNEL(0x00, 3), + OFFSET_BIT_TO_CHANNEL(0x00, 4), + OFFSET_BIT_TO_CHANNEL(0x00, 5), + OFFSET_BIT_TO_CHANNEL(0x00, 6), + OFFSET_BIT_TO_CHANNEL(0x00, 7), + OFFSET_BIT_TO_CHANNEL(0x01, 0), + OFFSET_BIT_TO_CHANNEL(0x01, 1), + OFFSET_BIT_TO_CHANNEL(0x01, 2), + OFFSET_BIT_TO_CHANNEL(0x01, 3), + OFFSET_BIT_TO_CHANNEL(0x01, 4), + OFFSET_BIT_TO_CHANNEL(0x01, 5), + OFFSET_BIT_TO_CHANNEL(0x01, 6), + OFFSET_BIT_TO_CHANNEL(0x01, 7), + OFFSET_BIT_TO_CHANNEL(0x02, 0), + OFFSET_BIT_TO_CHANNEL(0x02, 1), + OFFSET_BIT_TO_CHANNEL(0x02, 2), + OFFSET_BIT_TO_CHANNEL(0x09, 0), + OFFSET_BIT_TO_CHANNEL(0x09, 1), + OFFSET_BIT_TO_CHANNEL(0x09, 2), + OFFSET_BIT_TO_CHANNEL(0x09, 3), + OFFSET_BIT_TO_CHANNEL(0x0a, 0), + OFFSET_BIT_TO_CHANNEL(0x0a, 1), + OFFSET_BIT_TO_CHANNEL(0x0a, 2), + OFFSET_BIT_TO_CHANNEL(0x0a, 3), + OFFSET_BIT_TO_CHANNEL(0x0a, 4), + OFFSET_BIT_TO_CHANNEL(0x0a, 5), + OFFSET_BIT_TO_CHANNEL(0x0a, 6), + OFFSET_BIT_TO_CHANNEL(0x0b, 0), + OFFSET_BIT_TO_CHANNEL(0x0b, 1), + OFFSET_BIT_TO_CHANNEL(0x0b, 2), + OFFSET_BIT_TO_CHANNEL(0x0b, 3), +}; + +static struct platform_device *silicom_platform_dev; +static struct led_classdev_mc *silicom_led_info __initdata; +static struct gpio_chip *silicom_gpiochip __initdata; +static u8 *silicom_gpio_channels __initdata; + +static int silicom_mec_port_get(unsigned int offset) +{ + unsigned short mec_data_addr; + unsigned short mec_port_addr; + u8 reg; + + mec_data_addr = FIELD_GET(MEC_PORT_DWORD_OFFSET, offset) & MEC_DATA_OFFSET_MASK; + mec_port_addr = FIELD_GET(MEC_PORT_DWORD_OFFSET, offset) & MEC_PORT_OFFSET_MASK; + + mutex_lock(&mec_io_mutex); + outb(mec_port_addr, MEC_ADDR); + reg = inb(MEC_DATA_OFFSET(mec_data_addr)); + mutex_unlock(&mec_io_mutex); + + return (reg >> (offset & MEC_PORT_CHANNEL_MASK)) & 0x01; +} + +static enum led_brightness silicom_mec_led_get(int channel) +{ + /* Outputs are active low */ + return silicom_mec_port_get(channel) ? LED_OFF : LED_ON; +} + +static void silicom_mec_port_set(int channel, int on) +{ + + unsigned short mec_data_addr; + unsigned short mec_port_addr; + u8 reg; + + mec_data_addr = FIELD_GET(MEC_PORT_DWORD_OFFSET, channel) & MEC_DATA_OFFSET_MASK; + mec_port_addr = FIELD_GET(MEC_PORT_DWORD_OFFSET, channel) & MEC_PORT_OFFSET_MASK; + + mutex_lock(&mec_io_mutex); + outb(mec_port_addr, MEC_ADDR); + reg = inb(MEC_DATA_OFFSET(mec_data_addr)); + /* Outputs are active low, so clear the bit for on, or set it for off */ + if (on) + reg &= ~(1 << (channel & MEC_PORT_CHANNEL_MASK)); + else + reg |= 1 << (channel & MEC_PORT_CHANNEL_MASK); + outb(reg, MEC_DATA_OFFSET(mec_data_addr)); + mutex_unlock(&mec_io_mutex); +} + +static enum led_brightness silicom_mec_led_mc_brightness_get(struct led_classdev *led_cdev) +{ + struct led_classdev_mc *mc_cdev = lcdev_to_mccdev(led_cdev); + enum led_brightness brightness = LED_OFF; + int i; + + for (i = 0; i < mc_cdev->num_colors; i++) { + mc_cdev->subled_info[i].brightness = + silicom_mec_led_get(mc_cdev->subled_info[i].channel); + /* Mark the overall brightness as LED_ON if any of the subleds are on */ + if (mc_cdev->subled_info[i].brightness != LED_OFF) + brightness = LED_ON; + } + + return brightness; +} + +static void silicom_mec_led_mc_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct led_classdev_mc *mc_cdev = lcdev_to_mccdev(led_cdev); + int i; + + led_mc_calc_color_components(mc_cdev, brightness); + for (i = 0; i < mc_cdev->num_colors; i++) { + silicom_mec_port_set(mc_cdev->subled_info[i].channel, + mc_cdev->subled_info[i].brightness); + } +} + +static int silicom_gpio_get_direction(struct gpio_chip *gc, + unsigned int offset) +{ + u8 *channels = gpiochip_get_data(gc); + + /* Input registers have offsets between [0x00, 0x07] */ + if (CHANNEL_TO_OFFSET(channels[offset]) < MEC_GPIO_IN_POS) + return GPIO_LINE_DIRECTION_IN; + + return GPIO_LINE_DIRECTION_OUT; +} + +static int silicom_gpio_direction_input(struct gpio_chip *gc, + unsigned int offset) +{ + int direction = silicom_gpio_get_direction(gc, offset); + + return direction == GPIO_LINE_DIRECTION_IN ? 0 : -EINVAL; +} + +static void silicom_gpio_set(struct gpio_chip *gc, + unsigned int offset, + int value) +{ + int direction = silicom_gpio_get_direction(gc, offset); + u8 *channels = gpiochip_get_data(gc); + int channel = channels[offset]; + + if (direction == GPIO_LINE_DIRECTION_IN) + return; + + if (value) + silicom_mec_port_set(channel, 0); + else if (value == 0) + silicom_mec_port_set(channel, 1); + else + pr_err("Wrong argument value: %d\n", value); +} + +static int silicom_gpio_direction_output(struct gpio_chip *gc, + unsigned int offset, + int value) +{ + int direction = silicom_gpio_get_direction(gc, offset); + + if (direction == GPIO_LINE_DIRECTION_IN) + return -EINVAL; + + silicom_gpio_set(gc, offset, value); + + return 0; +} + +static int silicom_gpio_get(struct gpio_chip *gc, unsigned int offset) +{ + u8 *channels = gpiochip_get_data(gc); + int channel = channels[offset]; + + return silicom_mec_port_get(channel); +} + +static struct mc_subled plat_0222_wan_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_WHITE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 7), + }, + { + .color_index = LED_COLOR_ID_YELLOW, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 6), + }, + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 5), + }, +}; + +static struct mc_subled plat_0222_sys_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_WHITE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 4), + }, + { + .color_index = LED_COLOR_ID_AMBER, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 3), + }, + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 2), + }, +}; + +static struct mc_subled plat_0222_stat1_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 1), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0c, 0), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 7), + }, + { + .color_index = LED_COLOR_ID_YELLOW, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 6), + }, +}; + +static struct mc_subled plat_0222_stat2_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 5), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 4), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 3), + }, + { + .color_index = LED_COLOR_ID_YELLOW, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 2), + }, +}; + +static struct mc_subled plat_0222_stat3_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 1), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0d, 0), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0e, 1), + }, + { + .color_index = LED_COLOR_ID_YELLOW, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x0e, 0), + }, +}; + +static struct led_classdev_mc plat_0222_mc_led_info[] __initdata = { + { + .led_cdev = { + .name = "platled::wan", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(plat_0222_wan_mc_subled_info), + .subled_info = plat_0222_wan_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::sys", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(plat_0222_sys_mc_subled_info), + .subled_info = plat_0222_sys_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::stat1", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(plat_0222_stat1_mc_subled_info), + .subled_info = plat_0222_stat1_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::stat2", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(plat_0222_stat2_mc_subled_info), + .subled_info = plat_0222_stat2_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::stat3", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(plat_0222_stat3_mc_subled_info), + .subled_info = plat_0222_stat3_mc_subled_info, + }, + { }, +}; + +static struct gpio_chip silicom_gpio_chip = { + .label = "silicom-gpio", + .get_direction = silicom_gpio_get_direction, + .direction_input = silicom_gpio_direction_input, + .direction_output = silicom_gpio_direction_output, + .get = silicom_gpio_get, + .set = silicom_gpio_set, + .base = -1, + .ngpio = ARRAY_SIZE(plat_0222_gpio_channels), + .names = plat_0222_gpio_names, + /* + * We're using a mutex to protect the indirect access, so we can sleep + * if the lock blocks + */ + .can_sleep = true, +}; + +static struct silicom_platform_info silicom_plat_0222_cordoba_info __initdata = { + .io_base = MEC_IO_BASE, + .io_len = MEC_IO_LEN, + .led_info = plat_0222_mc_led_info, + .gpiochip = &silicom_gpio_chip, + .gpio_channels = plat_0222_gpio_channels, + /* + * The original generic cordoba does not have the last 4 outputs of the + * plat_0222 variant, the rest are the same, so use the same longer list, + * but ignore the last entries here + */ + .ngpio = ARRAY_SIZE(plat_0222_gpio_channels), + +}; + +static struct mc_subled cordoba_fp_left_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 6), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 5), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x09, 7), + }, + { + .color_index = LED_COLOR_ID_AMBER, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x09, 4), + }, +}; + +static struct mc_subled cordoba_fp_center_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 7), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 4), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 3), + }, + { + .color_index = LED_COLOR_ID_AMBER, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x09, 6), + }, +}; + +static struct mc_subled cordoba_fp_right_mc_subled_info[] __initdata = { + { + .color_index = LED_COLOR_ID_RED, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 2), + }, + { + .color_index = LED_COLOR_ID_GREEN, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 1), + }, + { + .color_index = LED_COLOR_ID_BLUE, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x08, 0), + }, + { + .color_index = LED_COLOR_ID_AMBER, + .brightness = 1, + .intensity = 0, + .channel = OFFSET_BIT_TO_CHANNEL(0x09, 5), + }, +}; + +static struct led_classdev_mc cordoba_mc_led_info[] __initdata = { + { + .led_cdev = { + .name = "platled::fp_left", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(cordoba_fp_left_mc_subled_info), + .subled_info = cordoba_fp_left_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::fp_center", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(cordoba_fp_center_mc_subled_info), + .subled_info = cordoba_fp_center_mc_subled_info, + }, + { + .led_cdev = { + .name = "platled::fp_right", + .brightness = 0, + .max_brightness = 1, + .brightness_set = silicom_mec_led_mc_brightness_set, + .brightness_get = silicom_mec_led_mc_brightness_get, + }, + .num_colors = ARRAY_SIZE(cordoba_fp_right_mc_subled_info), + .subled_info = cordoba_fp_right_mc_subled_info, + }, + { }, +}; + +static struct silicom_platform_info silicom_generic_cordoba_info __initdata = { + .io_base = MEC_IO_BASE, + .io_len = MEC_IO_LEN, + .led_info = cordoba_mc_led_info, + .gpiochip = &silicom_gpio_chip, + .gpio_channels = plat_0222_gpio_channels, + .ngpio = ARRAY_SIZE(plat_0222_gpio_channels), +}; + +/* + * sysfs interface + */ +static ssize_t efuse_status_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u32 reg; + + mutex_lock(&mec_io_mutex); + /* Select memory region */ + outb(IO_REG_BANK, EC_ADDR_MSB); + outb(MEC_EFUSE_LSB_ADDR, EC_ADDR_LSB); + + /* Get current data from the address */ + reg = inl(MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); + mutex_unlock(&mec_io_mutex); + + efuse_status = reg & 0x1; + + return sysfs_emit(buf, "%u\n", efuse_status); +} +static DEVICE_ATTR_RO(efuse_status); + +static ssize_t uc_version_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int uc_version; + u32 reg; + + mutex_lock(&mec_io_mutex); + outb(IO_REG_BANK, EC_ADDR_MSB); + outb(DEFAULT_CHAN_LO, EC_ADDR_LSB); + + reg = inl(MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); + mutex_unlock(&mec_io_mutex); + uc_version = FIELD_GET(MEC_VERSION_LOC, reg); + if (uc_version >= 192) + return -EINVAL; + + uc_version = FIELD_GET(MEC_VERSION_MAJOR, reg) * 100 + + FIELD_GET(MEC_VERSION_MINOR, reg); + + mec_uc_version = uc_version; + + return sysfs_emit(buf, "%u\n", mec_uc_version); +} +static DEVICE_ATTR_RO(uc_version); + +static ssize_t power_cycle_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", power_cycle); +} + +static void powercycle_uc(void) +{ + /* Select memory region */ + outb(IO_REG_BANK, EC_ADDR_MSB); + outb(MEC_POWER_CYCLE_ADDR, EC_ADDR_LSB); + + /* Set to 1 for current data from the address */ + outb(1, MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); +} + +static ssize_t power_cycle_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int rc; + unsigned int power_cycle_cmd; + + rc = kstrtou32(buf, 0, &power_cycle_cmd); + if (rc) + return -EINVAL; + + if (power_cycle_cmd > 0) { + mutex_lock(&mec_io_mutex); + power_cycle = power_cycle_cmd; + powercycle_uc(); + mutex_unlock(&mec_io_mutex); + } + + return count; +} +static DEVICE_ATTR_RW(power_cycle); + +static struct attribute *silicom_attrs[] = { + &dev_attr_efuse_status.attr, + &dev_attr_uc_version.attr, + &dev_attr_power_cycle.attr, + NULL, +}; +ATTRIBUTE_GROUPS(silicom); + +static struct platform_driver silicom_platform_driver = { + .driver = { + .name = "silicom-platform", + .dev_groups = silicom_groups, + }, +}; + +static int __init silicom_mc_leds_register(struct device *dev, + const struct led_classdev_mc *mc_leds) +{ + int size = sizeof(struct mc_subled); + struct led_classdev_mc *led; + int i, err; + + for (i = 0; mc_leds[i].led_cdev.name; i++) { + + led = devm_kzalloc(dev, sizeof(*led), GFP_KERNEL); + if (!led) + return -ENOMEM; + memcpy(led, &mc_leds[i], sizeof(*led)); + + led->subled_info = devm_kzalloc(dev, led->num_colors * size, GFP_KERNEL); + if (!led->subled_info) + return -ENOMEM; + memcpy(led->subled_info, mc_leds[i].subled_info, led->num_colors * size); + + err = devm_led_classdev_multicolor_register(dev, led); + if (err) + return err; + } + + return 0; +} + +static u32 rpm_get(void) +{ + u32 reg; + + mutex_lock(&mec_io_mutex); + /* Select memory region */ + outb(IO_REG_BANK, EC_ADDR_MSB); + outb(DEFAULT_CHAN_LO_T, EC_ADDR_LSB); + reg = inw(MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); + mutex_unlock(&mec_io_mutex); + + return reg; +} + +static u32 temp_get(void) +{ + u32 reg; + + mutex_lock(&mec_io_mutex); + /* Select memory region */ + outb(IO_REG_BANK, EC_ADDR_MSB); + outb(DEFAULT_CHAN_LO_T, EC_ADDR_LSB); + reg = inl(MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); + mutex_unlock(&mec_io_mutex); + + return FIELD_GET(MEC_TEMP_LOC, reg) * 100; +} + +static umode_t silicom_fan_control_fan_is_visible(const u32 attr) +{ + switch (attr) { + case hwmon_fan_input: + case hwmon_fan_label: + return 0444; + default: + return 0; + } +} + +static umode_t silicom_fan_control_temp_is_visible(const u32 attr) +{ + switch (attr) { + case hwmon_temp_input: + case hwmon_temp_label: + return 0444; + default: + return 0; + } +} + +static int silicom_fan_control_read_fan(struct device *dev, u32 attr, long *val) +{ + switch (attr) { + case hwmon_fan_input: + *val = rpm_get(); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int silicom_fan_control_read_temp(struct device *dev, u32 attr, long *val) +{ + switch (attr) { + case hwmon_temp_input: + *val = temp_get(); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static umode_t silicom_fan_control_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_fan: + return silicom_fan_control_fan_is_visible(attr); + case hwmon_temp: + return silicom_fan_control_temp_is_visible(attr); + default: + return 0; + } +} + +static int silicom_fan_control_read(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, + long *val) +{ + switch (type) { + case hwmon_fan: + return silicom_fan_control_read_fan(dev, attr, val); + case hwmon_temp: + return silicom_fan_control_read_temp(dev, attr, val); + default: + return -EOPNOTSUPP; + } +} + +static int silicom_fan_control_read_labels(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, + const char **str) +{ + switch (type) { + case hwmon_fan: + *str = "Silicom_platfomr: Fan Speed"; + return 0; + case hwmon_temp: + *str = "Silicom_platform: Thermostat Sensor"; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static const struct hwmon_ops silicom_fan_control_hwmon_ops = { + .is_visible = silicom_fan_control_is_visible, + .read = silicom_fan_control_read, + .read_string = silicom_fan_control_read_labels, +}; + +static const struct hwmon_chip_info silicom_chip_info = { + .ops = &silicom_fan_control_hwmon_ops, + .info = silicom_fan_control_info, +}; + +static int __init silicom_platform_probe(struct platform_device *device) +{ + struct device *hwmon_dev; + u8 magic, ver; + int err; + + if (!devm_request_region(&device->dev, MEC_IO_BASE, MEC_IO_LEN, "mec")) { + dev_err(&device->dev, "couldn't reserve MEC io ports\n"); + return -EBUSY; + } + + /* Sanity check magic number read for EC */ + outb(IO_REG_BANK, MEC_ADDR); + magic = inb(MEC_DATA_OFFSET(DEFAULT_CHAN_LO)); + ver = inb(MEC_DATA_OFFSET(DEFAULT_CHAN_HI)); + dev_dbg(&device->dev, "EC magic 0x%02x, version 0x%02x\n", magic, ver); + + if (magic != SILICOM_MEC_MAGIC) { + dev_err(&device->dev, "Bad EC magic 0x%02x!\n", magic); + return -ENODEV; + } + + err = silicom_mc_leds_register(&device->dev, silicom_led_info); + if (err) { + dev_err(&device->dev, "Failed to register LEDs\n"); + return err; + } + + err = devm_gpiochip_add_data(&device->dev, silicom_gpiochip, + silicom_gpio_channels); + if (err) { + dev_err(&device->dev, "Failed to register gpiochip: %d\n", err); + return err; + } + + hwmon_dev = devm_hwmon_device_register_with_info(&device->dev, "silicom_fan", NULL, + &silicom_chip_info, NULL); + err = PTR_ERR_OR_ZERO(hwmon_dev); + if (err) { + dev_err(&device->dev, "Failed to register hwmon_dev: %d\n", err); + return err; + } + + return err; +} + +static int __init silicom_platform_info_init(const struct dmi_system_id *id) +{ + struct silicom_platform_info *info = id->driver_data; + + silicom_led_info = info->led_info; + silicom_gpio_channels = info->gpio_channels; + silicom_gpiochip = info->gpiochip; + silicom_gpiochip->ngpio = info->ngpio; + + return 1; +} + +static const struct dmi_system_id silicom_dmi_ids[] __initconst = { + { + .callback = silicom_platform_info_init, + .ident = "Silicom Cordoba (Generic)", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Silicom"), + DMI_MATCH(DMI_BOARD_NAME, "80300-0214-G"), + }, + .driver_data = &silicom_generic_cordoba_info, + }, + { + .callback = silicom_platform_info_init, + .ident = "Silicom Cordoba (Generic)", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Silicom"), + DMI_MATCH(DMI_BOARD_NAME, "80500-0214-G"), + }, + .driver_data = &silicom_generic_cordoba_info, + }, + { + .callback = silicom_platform_info_init, + .ident = "Silicom Cordoba (plat_0222)", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Silicom"), + DMI_MATCH(DMI_BOARD_NAME, "80300-0222-G"), + }, + .driver_data = &silicom_plat_0222_cordoba_info, + }, + { }, +}; +MODULE_DEVICE_TABLE(dmi, silicom_dmi_ids); + +static int __init silicom_platform_init(void) +{ + if (!dmi_check_system(silicom_dmi_ids)) { + pr_err("No DMI match for this platform\n"); + return -ENODEV; + } + silicom_platform_dev = platform_create_bundle(&silicom_platform_driver, + silicom_platform_probe, + NULL, 0, NULL, 0); + + return PTR_ERR_OR_ZERO(silicom_platform_dev); +} + +static void __exit silicom_platform_exit(void) +{ + platform_device_unregister(silicom_platform_dev); + platform_driver_unregister(&silicom_platform_driver); +} + +module_init(silicom_platform_init); +module_exit(silicom_platform_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Henry Shi "); +MODULE_DESCRIPTION("Platform driver for Silicom network appliances"); From 8cbcc1dbf8a62c730fadd60de761e0658547a589 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:13 -0800 Subject: [PATCH 0237/1562] platform/x86/intel/vsec: Fix xa_alloc memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 936874b77dd0 ("platform/x86/intel/vsec: Add PCI error recovery support to Intel PMT") added an xarray to track the list of vsec devices to be recovered after a PCI error. But it did not provide cleanup for the list leading to a memory leak that was caught by kmemleak. Do xa_alloc() before devm_add_action_or_reset() so that the list may be cleaned up with xa_erase() in the release function. Fixes: 936874b77dd0 ("platform/x86/intel/vsec: Add PCI error recovery support to Intel PMT") Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-2-david.e.box@linux.intel.com [hdegoede@redhat.com: Add missing xa_erase() on error-exit Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 25 +++++++++++++++---------- drivers/platform/x86/intel/vsec.h | 1 + 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index c1f9e4471b28..343ab6a82c01 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -120,6 +120,8 @@ static void intel_vsec_dev_release(struct device *dev) { struct intel_vsec_device *intel_vsec_dev = dev_to_ivdev(dev); + xa_erase(&auxdev_array, intel_vsec_dev->id); + mutex_lock(&vsec_ida_lock); ida_free(intel_vsec_dev->ida, intel_vsec_dev->auxdev.id); mutex_unlock(&vsec_ida_lock); @@ -135,19 +137,28 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, struct auxiliary_device *auxdev = &intel_vsec_dev->auxdev; int ret, id; - mutex_lock(&vsec_ida_lock); - ret = ida_alloc(intel_vsec_dev->ida, GFP_KERNEL); - mutex_unlock(&vsec_ida_lock); + ret = xa_alloc(&auxdev_array, &intel_vsec_dev->id, intel_vsec_dev, + PMT_XA_LIMIT, GFP_KERNEL); if (ret < 0) { kfree(intel_vsec_dev->resource); kfree(intel_vsec_dev); return ret; } + mutex_lock(&vsec_ida_lock); + id = ida_alloc(intel_vsec_dev->ida, GFP_KERNEL); + mutex_unlock(&vsec_ida_lock); + if (id < 0) { + xa_erase(&auxdev_array, intel_vsec_dev->id); + kfree(intel_vsec_dev->resource); + kfree(intel_vsec_dev); + return id; + } + if (!parent) parent = &pdev->dev; - auxdev->id = ret; + auxdev->id = id; auxdev->name = name; auxdev->dev.parent = parent; auxdev->dev.release = intel_vsec_dev_release; @@ -169,12 +180,6 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, if (ret < 0) return ret; - /* Add auxdev to list */ - ret = xa_alloc(&auxdev_array, &id, intel_vsec_dev, PMT_XA_LIMIT, - GFP_KERNEL); - if (ret) - return ret; - return 0; } EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, INTEL_VSEC); diff --git a/drivers/platform/x86/intel/vsec.h b/drivers/platform/x86/intel/vsec.h index 0fd042c171ba..0a6201b4a0e9 100644 --- a/drivers/platform/x86/intel/vsec.h +++ b/drivers/platform/x86/intel/vsec.h @@ -45,6 +45,7 @@ struct intel_vsec_device { struct ida *ida; struct intel_vsec_platform_info *info; int num_resources; + int id; /* xa */ void *priv_data; size_t priv_data_size; }; From ace7b6f00870cea56460df335606e35ace3c07ac Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:14 -0800 Subject: [PATCH 0238/1562] platform/x86/intel/vsec: Remove unnecessary return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In intel_vsec_add_aux(), just return from the last call to devm_add_action_or_reset() instead of checking its return value. Suggested-by: Ilpo Järvinen Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-3-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 343ab6a82c01..25017227a0a6 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -175,12 +175,8 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, return ret; } - ret = devm_add_action_or_reset(parent, intel_vsec_remove_aux, + return devm_add_action_or_reset(parent, intel_vsec_remove_aux, auxdev); - if (ret < 0) - return ret; - - return 0; } EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, INTEL_VSEC); From dbc01b0c86a7b23ffd06e14a84591500b04591ed Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:15 -0800 Subject: [PATCH 0239/1562] platform/x86/intel/vsec: Move structures to header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for exporting an API to register Intel Vendor Specific Extended Capabilities (VSEC) from other drivers, move needed structures to the header file. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-4-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 35 ------------------------------ drivers/platform/x86/intel/vsec.h | 36 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 25017227a0a6..4dc490fd4a5b 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -24,13 +24,6 @@ #include "vsec.h" -/* Intel DVSEC offsets */ -#define INTEL_DVSEC_ENTRIES 0xA -#define INTEL_DVSEC_SIZE 0xB -#define INTEL_DVSEC_TABLE 0xC -#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) -#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) -#define TABLE_OFFSET_SHIFT 3 #define PMT_XA_START 0 #define PMT_XA_MAX INT_MAX #define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) @@ -39,34 +32,6 @@ static DEFINE_IDA(intel_vsec_ida); static DEFINE_IDA(intel_vsec_sdsi_ida); static DEFINE_XARRAY_ALLOC(auxdev_array); -/** - * struct intel_vsec_header - Common fields of Intel VSEC and DVSEC registers. - * @rev: Revision ID of the VSEC/DVSEC register space - * @length: Length of the VSEC/DVSEC register space - * @id: ID of the feature - * @num_entries: Number of instances of the feature - * @entry_size: Size of the discovery table for each feature - * @tbir: BAR containing the discovery tables - * @offset: BAR offset of start of the first discovery table - */ -struct intel_vsec_header { - u8 rev; - u16 length; - u16 id; - u8 num_entries; - u8 entry_size; - u8 tbir; - u32 offset; -}; - -enum intel_vsec_id { - VSEC_ID_TELEMETRY = 2, - VSEC_ID_WATCHER = 3, - VSEC_ID_CRASHLOG = 4, - VSEC_ID_SDSI = 65, - VSEC_ID_TPMI = 66, -}; - static const char *intel_vsec_name(enum intel_vsec_id id) { switch (id) { diff --git a/drivers/platform/x86/intel/vsec.h b/drivers/platform/x86/intel/vsec.h index 0a6201b4a0e9..c242c07ea69c 100644 --- a/drivers/platform/x86/intel/vsec.h +++ b/drivers/platform/x86/intel/vsec.h @@ -11,9 +11,45 @@ #define VSEC_CAP_SDSI BIT(3) #define VSEC_CAP_TPMI BIT(4) +/* Intel DVSEC offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define TABLE_OFFSET_SHIFT 3 + struct pci_dev; struct resource; +enum intel_vsec_id { + VSEC_ID_TELEMETRY = 2, + VSEC_ID_WATCHER = 3, + VSEC_ID_CRASHLOG = 4, + VSEC_ID_SDSI = 65, + VSEC_ID_TPMI = 66, +}; + +/** + * struct intel_vsec_header - Common fields of Intel VSEC and DVSEC registers. + * @rev: Revision ID of the VSEC/DVSEC register space + * @length: Length of the VSEC/DVSEC register space + * @id: ID of the feature + * @num_entries: Number of instances of the feature + * @entry_size: Size of the discovery table for each feature + * @tbir: BAR containing the discovery tables + * @offset: BAR offset of start of the first discovery table + */ +struct intel_vsec_header { + u8 rev; + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + enum intel_vsec_quirks { /* Watcher feature not supported */ VSEC_QUIRK_NO_WATCHER = BIT(0), From 0a0a52abaa65b844afde3d7229c209a8cddc5a07 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:16 -0800 Subject: [PATCH 0240/1562] platform/x86/intel/vsec: remove platform_info from vsec device structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for exporting an API to register Intel Vendor Specific Extended Capabilities (VSEC) from other drivers, remove the pointer to platform_info from intel_vsec_device. This prevents a potential page fault when auxiliary drivers probe and attempt to dereference this pointer to access the needed quirks field. Instead, just add the quirks to intel_vsec_device. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-5-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmt/class.c | 2 +- drivers/platform/x86/intel/vsec.c | 2 +- drivers/platform/x86/intel/vsec.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index f32a233470de..2ad91d2fd954 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -31,7 +31,7 @@ bool intel_pmt_is_early_client_hw(struct device *dev) * differences from the server platforms (which use the Out Of Band * Management Services Module OOBMSM). */ - return !!(ivdev->info->quirks & VSEC_QUIRK_EARLY_HW); + return !!(ivdev->quirks & VSEC_QUIRK_EARLY_HW); } EXPORT_SYMBOL_NS_GPL(intel_pmt_is_early_client_hw, INTEL_PMT); diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 4dc490fd4a5b..bcdc727c4cc3 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -194,7 +194,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he intel_vsec_dev->pcidev = pdev; intel_vsec_dev->resource = res; intel_vsec_dev->num_resources = header->num_entries; - intel_vsec_dev->info = info; + intel_vsec_dev->quirks = info->quirks; if (header->id == VSEC_ID_SDSI) intel_vsec_dev->ida = &intel_vsec_sdsi_ida; diff --git a/drivers/platform/x86/intel/vsec.h b/drivers/platform/x86/intel/vsec.h index c242c07ea69c..8b9fad170503 100644 --- a/drivers/platform/x86/intel/vsec.h +++ b/drivers/platform/x86/intel/vsec.h @@ -79,11 +79,11 @@ struct intel_vsec_device { struct pci_dev *pcidev; struct resource *resource; struct ida *ida; - struct intel_vsec_platform_info *info; int num_resources; int id; /* xa */ void *priv_data; size_t priv_data_size; + unsigned long quirks; }; int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, From 1d1b4770d4b661ecdf899c314ce406b9840c0c22 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:17 -0800 Subject: [PATCH 0241/1562] platform/x86/intel/vsec: Use cleanup.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use cleanup.h helpers to handle cleanup of resources in intel_vsec_add_dev() after failures. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-6-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index bcdc727c4cc3..6b0e7363397a 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -148,8 +149,9 @@ EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, INTEL_VSEC); static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header, struct intel_vsec_platform_info *info) { - struct intel_vsec_device *intel_vsec_dev; - struct resource *res, *tmp; + struct intel_vsec_device __free(kfree) *intel_vsec_dev = NULL; + struct resource __free(kfree) *res = NULL; + struct resource *tmp; unsigned long quirks = info->quirks; int i; @@ -171,10 +173,8 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he return -ENOMEM; res = kcalloc(header->num_entries, sizeof(*res), GFP_KERNEL); - if (!res) { - kfree(intel_vsec_dev); + if (!res) return -ENOMEM; - } if (quirks & VSEC_QUIRK_TABLE_SHIFT) header->offset >>= TABLE_OFFSET_SHIFT; @@ -192,7 +192,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he } intel_vsec_dev->pcidev = pdev; - intel_vsec_dev->resource = res; + intel_vsec_dev->resource = no_free_ptr(res); intel_vsec_dev->num_resources = header->num_entries; intel_vsec_dev->quirks = info->quirks; @@ -201,7 +201,11 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he else intel_vsec_dev->ida = &intel_vsec_ida; - return intel_vsec_add_aux(pdev, NULL, intel_vsec_dev, + /* + * Pass the ownership of intel_vsec_dev and resource within it to + * intel_vsec_add_aux() + */ + return intel_vsec_add_aux(pdev, NULL, no_free_ptr(intel_vsec_dev), intel_vsec_name(header->id)); } From 6dfc2514acee37e30ce59f1f25b1f8f6aa7c1b08 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:18 -0800 Subject: [PATCH 0242/1562] platform/x86/intel/vsec: Assign auxdev parent by argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of checking for a NULL parent argument in intel_vsec_add_aux() and then assigning it to the probed device, remove this check and just pass the device in the call. Since this function is exported, return -EINVAL if the parent is not specified. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-7-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 6b0e7363397a..bcfb5d480ebd 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -103,6 +103,9 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, struct auxiliary_device *auxdev = &intel_vsec_dev->auxdev; int ret, id; + if (!parent) + return -EINVAL; + ret = xa_alloc(&auxdev_array, &intel_vsec_dev->id, intel_vsec_dev, PMT_XA_LIMIT, GFP_KERNEL); if (ret < 0) { @@ -121,9 +124,6 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, return id; } - if (!parent) - parent = &pdev->dev; - auxdev->id = id; auxdev->name = name; auxdev->dev.parent = parent; @@ -205,7 +205,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he * Pass the ownership of intel_vsec_dev and resource within it to * intel_vsec_add_aux() */ - return intel_vsec_add_aux(pdev, NULL, no_free_ptr(intel_vsec_dev), + return intel_vsec_add_aux(pdev, &pdev->dev, no_free_ptr(intel_vsec_dev), intel_vsec_name(header->id)); } From 4edbd117ba3f7beacfb439aad60e8a5de77114b4 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Wed, 29 Nov 2023 14:21:19 -0800 Subject: [PATCH 0243/1562] platform/x86/intel/vsec: Add intel_vsec_register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add and export intel_vsec_register() to allow the registration of Intel extended capabilities from other drivers. Add check to look for memory conflicts before registering a new capability. Since the vsec provider may not be a PCI device, add a parent field to intel_vsec_platform_info() to allow specifying the parent device for device managed cleanup. Signed-off-by: Gayatri Kammela Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-8-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 24 +++++++++++++++++++++++- drivers/platform/x86/intel/vsec.h | 4 ++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index bcfb5d480ebd..5568d6236bd6 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -152,9 +152,15 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he struct intel_vsec_device __free(kfree) *intel_vsec_dev = NULL; struct resource __free(kfree) *res = NULL; struct resource *tmp; + struct device *parent; unsigned long quirks = info->quirks; int i; + if (info->parent) + parent = info->parent; + else + parent = &pdev->dev; + if (!intel_vsec_supported(header->id, info->caps)) return -EINVAL; @@ -189,6 +195,12 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he header->offset + i * (header->entry_size * sizeof(u32)); tmp->end = tmp->start + (header->entry_size * sizeof(u32)) - 1; tmp->flags = IORESOURCE_MEM; + + /* Check resource is not in use */ + if (!request_mem_region(tmp->start, resource_size(tmp), "")) + return -EBUSY; + + release_mem_region(tmp->start, resource_size(tmp)); } intel_vsec_dev->pcidev = pdev; @@ -205,7 +217,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he * Pass the ownership of intel_vsec_dev and resource within it to * intel_vsec_add_aux() */ - return intel_vsec_add_aux(pdev, &pdev->dev, no_free_ptr(intel_vsec_dev), + return intel_vsec_add_aux(pdev, parent, no_free_ptr(intel_vsec_dev), intel_vsec_name(header->id)); } @@ -323,6 +335,16 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, return have_devices; } +void intel_vsec_register(struct pci_dev *pdev, + struct intel_vsec_platform_info *info) +{ + if (!pdev || !info) + return; + + intel_vsec_walk_header(pdev, info); +} +EXPORT_SYMBOL_NS_GPL(intel_vsec_register, INTEL_VSEC); + static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_vsec_platform_info *info; diff --git a/drivers/platform/x86/intel/vsec.h b/drivers/platform/x86/intel/vsec.h index 8b9fad170503..bb8b6452df70 100644 --- a/drivers/platform/x86/intel/vsec.h +++ b/drivers/platform/x86/intel/vsec.h @@ -69,6 +69,7 @@ enum intel_vsec_quirks { /* Platform specific data */ struct intel_vsec_platform_info { + struct device *parent; struct intel_vsec_header **headers; unsigned long caps; unsigned long quirks; @@ -99,4 +100,7 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device { return container_of(auxdev, struct intel_vsec_device, auxdev); } + +void intel_vsec_register(struct pci_dev *pdev, + struct intel_vsec_platform_info *info); #endif From e97ec7f621fbfdce07bf1b98a26883ee19281747 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:20 -0800 Subject: [PATCH 0244/1562] platform/x86/intel/vsec: Add base address field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices may emulate PCI VSEC capabilities in MMIO. In such cases the BAR is not readable from a config space. Provide a field for drivers to indicate the base address to be used. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-9-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmt/class.c | 14 +++++++++++--- drivers/platform/x86/intel/vsec.c | 10 ++++++++-- drivers/platform/x86/intel/vsec.h | 2 ++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index 2ad91d2fd954..32608baaa56c 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -160,10 +160,11 @@ static struct class intel_pmt_class = { static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, struct intel_pmt_header *header, - struct device *dev, + struct intel_vsec_device *ivdev, struct resource *disc_res) { - struct pci_dev *pci_dev = to_pci_dev(dev->parent); + struct pci_dev *pci_dev = ivdev->pcidev; + struct device *dev = &ivdev->auxdev.dev; u8 bir; /* @@ -215,6 +216,13 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, break; case ACCESS_BARID: + /* Use the provided base address if it exists */ + if (ivdev->base_addr) { + entry->base_addr = ivdev->base_addr + + GET_ADDRESS(header->base_offset); + break; + } + /* * If another BAR was specified then the base offset * represents the offset within that BAR. SO retrieve the @@ -319,7 +327,7 @@ int intel_pmt_dev_create(struct intel_pmt_entry *entry, struct intel_pmt_namespa if (ret) return ret; - ret = intel_pmt_populate_entry(entry, &header, dev, disc_res); + ret = intel_pmt_populate_entry(entry, &header, intel_vsec_dev, disc_res); if (ret) return ret; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 5568d6236bd6..b68586731e45 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -154,6 +154,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he struct resource *tmp; struct device *parent; unsigned long quirks = info->quirks; + u64 base_addr; int i; if (info->parent) @@ -185,14 +186,18 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he if (quirks & VSEC_QUIRK_TABLE_SHIFT) header->offset >>= TABLE_OFFSET_SHIFT; + if (info->base_addr) + base_addr = info->base_addr; + else + base_addr = pdev->resource[header->tbir].start; + /* * The DVSEC/VSEC contains the starting offset and count for a block of * discovery tables. Create a resource array of these tables to the * auxiliary device driver. */ for (i = 0, tmp = res; i < header->num_entries; i++, tmp++) { - tmp->start = pdev->resource[header->tbir].start + - header->offset + i * (header->entry_size * sizeof(u32)); + tmp->start = base_addr + header->offset + i * (header->entry_size * sizeof(u32)); tmp->end = tmp->start + (header->entry_size * sizeof(u32)) - 1; tmp->flags = IORESOURCE_MEM; @@ -207,6 +212,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he intel_vsec_dev->resource = no_free_ptr(res); intel_vsec_dev->num_resources = header->num_entries; intel_vsec_dev->quirks = info->quirks; + intel_vsec_dev->base_addr = info->base_addr; if (header->id == VSEC_ID_SDSI) intel_vsec_dev->ida = &intel_vsec_sdsi_ida; diff --git a/drivers/platform/x86/intel/vsec.h b/drivers/platform/x86/intel/vsec.h index bb8b6452df70..e23e76129691 100644 --- a/drivers/platform/x86/intel/vsec.h +++ b/drivers/platform/x86/intel/vsec.h @@ -73,6 +73,7 @@ struct intel_vsec_platform_info { struct intel_vsec_header **headers; unsigned long caps; unsigned long quirks; + u64 base_addr; }; struct intel_vsec_device { @@ -85,6 +86,7 @@ struct intel_vsec_device { void *priv_data; size_t priv_data_size; unsigned long quirks; + u64 base_addr; }; int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, From 4d1b7efee3fc703c64bacc37c4824888c5f26e8b Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:21 -0800 Subject: [PATCH 0245/1562] platform/x86/intel/pmt: Add header to struct intel_pmt_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PMT header is passed to several functions. Instead, store the header in struct intel_pmt_entry which is also passed to these functions and shorten the argument list. This simplifies the calls in preparation for later changes. While here also perform a newline cleanup. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-10-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmt/class.c | 8 +++----- drivers/platform/x86/intel/pmt/class.h | 16 ++++++++-------- drivers/platform/x86/intel/pmt/crashlog.c | 2 +- drivers/platform/x86/intel/pmt/telemetry.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index 32608baaa56c..142a24e3727d 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -159,12 +159,12 @@ static struct class intel_pmt_class = { }; static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, - struct intel_pmt_header *header, struct intel_vsec_device *ivdev, struct resource *disc_res) { struct pci_dev *pci_dev = ivdev->pcidev; struct device *dev = &ivdev->auxdev.dev; + struct intel_pmt_header *header = &entry->header; u8 bir; /* @@ -313,7 +313,6 @@ int intel_pmt_dev_create(struct intel_pmt_entry *entry, struct intel_pmt_namespa struct intel_vsec_device *intel_vsec_dev, int idx) { struct device *dev = &intel_vsec_dev->auxdev.dev; - struct intel_pmt_header header; struct resource *disc_res; int ret; @@ -323,16 +322,15 @@ int intel_pmt_dev_create(struct intel_pmt_entry *entry, struct intel_pmt_namespa if (IS_ERR(entry->disc_table)) return PTR_ERR(entry->disc_table); - ret = ns->pmt_header_decode(entry, &header, dev); + ret = ns->pmt_header_decode(entry, dev); if (ret) return ret; - ret = intel_pmt_populate_entry(entry, &header, intel_vsec_dev, disc_res); + ret = intel_pmt_populate_entry(entry, intel_vsec_dev, disc_res); if (ret) return ret; return intel_pmt_dev_register(entry, ns, dev); - } EXPORT_SYMBOL_NS_GPL(intel_pmt_dev_create, INTEL_PMT); diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h index db11d58867ce..e477a19f6700 100644 --- a/drivers/platform/x86/intel/pmt/class.h +++ b/drivers/platform/x86/intel/pmt/class.h @@ -18,7 +18,15 @@ #define GET_BIR(v) ((v) & GENMASK(2, 0)) #define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) +struct intel_pmt_header { + u32 base_offset; + u32 size; + u32 guid; + u8 access_type; +}; + struct intel_pmt_entry { + struct intel_pmt_header header; struct bin_attribute pmt_bin_attr; struct kobject *kobj; void __iomem *disc_table; @@ -29,19 +37,11 @@ struct intel_pmt_entry { int devid; }; -struct intel_pmt_header { - u32 base_offset; - u32 size; - u32 guid; - u8 access_type; -}; - struct intel_pmt_namespace { const char *name; struct xarray *xa; const struct attribute_group *attr_grp; int (*pmt_header_decode)(struct intel_pmt_entry *entry, - struct intel_pmt_header *header, struct device *dev); }; diff --git a/drivers/platform/x86/intel/pmt/crashlog.c b/drivers/platform/x86/intel/pmt/crashlog.c index bbb3d61d09f4..4014c02cafdb 100644 --- a/drivers/platform/x86/intel/pmt/crashlog.c +++ b/drivers/platform/x86/intel/pmt/crashlog.c @@ -223,10 +223,10 @@ static const struct attribute_group pmt_crashlog_group = { }; static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry, - struct intel_pmt_header *header, struct device *dev) { void __iomem *disc_table = entry->disc_table; + struct intel_pmt_header *header = &entry->header; struct crashlog_entry *crashlog; if (!pmt_crashlog_supported(entry)) diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c index 39cbc87cc28a..f86080e8bebd 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.c +++ b/drivers/platform/x86/intel/pmt/telemetry.c @@ -58,10 +58,10 @@ static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, } static int pmt_telem_header_decode(struct intel_pmt_entry *entry, - struct intel_pmt_header *header, struct device *dev) { void __iomem *disc_table = entry->disc_table; + struct intel_pmt_header *header = &entry->header; if (pmt_telem_region_overlaps(entry, dev)) return 1; From 416eeb2e1fc7b60ab0c7ced26ab966dd7733357d Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:22 -0800 Subject: [PATCH 0246/1562] platform/x86/intel/pmt: telemetry: Export API to read telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export symbols to allow access to Intel PMT Telemetry data on available devices. Provides APIs to search, register, and read telemetry using a kref managed pointer that serves as a handle to a telemetry endpoint. To simplify searching for present devices, have the IDA start at 1 instead of 0 so that 0 can be used to indicate end of search. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-11-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmt/class.c | 21 ++- drivers/platform/x86/intel/pmt/class.h | 14 ++ drivers/platform/x86/intel/pmt/telemetry.c | 191 ++++++++++++++++++++- drivers/platform/x86/intel/pmt/telemetry.h | 126 ++++++++++++++ 4 files changed, 344 insertions(+), 8 deletions(-) create mode 100644 drivers/platform/x86/intel/pmt/telemetry.h diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index 142a24e3727d..4b53940a64e2 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -17,7 +17,7 @@ #include "../vsec.h" #include "class.h" -#define PMT_XA_START 0 +#define PMT_XA_START 1 #define PMT_XA_MAX INT_MAX #define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) #define GUID_SPR_PUNIT 0x9956f43f @@ -247,6 +247,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, struct intel_pmt_namespace *ns, struct device *parent) { + struct intel_vsec_device *ivdev = dev_to_ivdev(parent); struct resource res = {0}; struct device *dev; int ret; @@ -270,7 +271,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, if (ns->attr_grp) { ret = sysfs_create_group(entry->kobj, ns->attr_grp); if (ret) - goto fail_sysfs; + goto fail_sysfs_create_group; } /* if size is 0 assume no data buffer, so no file needed */ @@ -295,13 +296,23 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry, entry->pmt_bin_attr.size = entry->size; ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr); - if (!ret) - return 0; + if (ret) + goto fail_ioremap; + if (ns->pmt_add_endpoint) { + ret = ns->pmt_add_endpoint(entry, ivdev->pcidev); + if (ret) + goto fail_add_endpoint; + } + + return 0; + +fail_add_endpoint: + sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr); fail_ioremap: if (ns->attr_grp) sysfs_remove_group(entry->kobj, ns->attr_grp); -fail_sysfs: +fail_sysfs_create_group: device_unregister(dev); fail_dev_create: xa_erase(ns->xa, entry->devid); diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h index e477a19f6700..d23c63b73ab7 100644 --- a/drivers/platform/x86/intel/pmt/class.h +++ b/drivers/platform/x86/intel/pmt/class.h @@ -9,6 +9,7 @@ #include #include "../vsec.h" +#include "telemetry.h" /* PMT access types */ #define ACCESS_BARID 2 @@ -18,6 +19,16 @@ #define GET_BIR(v) ((v) & GENMASK(2, 0)) #define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) +struct pci_dev; + +struct telem_endpoint { + struct pci_dev *pcidev; + struct telem_header header; + void __iomem *base; + bool present; + struct kref kref; +}; + struct intel_pmt_header { u32 base_offset; u32 size; @@ -26,6 +37,7 @@ struct intel_pmt_header { }; struct intel_pmt_entry { + struct telem_endpoint *ep; struct intel_pmt_header header; struct bin_attribute pmt_bin_attr; struct kobject *kobj; @@ -43,6 +55,8 @@ struct intel_pmt_namespace { const struct attribute_group *attr_grp; int (*pmt_header_decode)(struct intel_pmt_entry *entry, struct device *dev); + int (*pmt_add_endpoint)(struct intel_pmt_entry *entry, + struct pci_dev *pdev); }; bool intel_pmt_is_early_client_hw(struct device *dev); diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c index f86080e8bebd..09258564dfc4 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.c +++ b/drivers/platform/x86/intel/pmt/telemetry.c @@ -30,6 +30,15 @@ /* Used by client hardware to identify a fixed telemetry entry*/ #define TELEM_CLIENT_FIXED_BLOCK_GUID 0x10000000 +#define NUM_BYTES_QWORD(v) ((v) << 3) +#define SAMPLE_ID_OFFSET(v) ((v) << 3) + +#define NUM_BYTES_DWORD(v) ((v) << 2) +#define SAMPLE_ID_OFFSET32(v) ((v) << 2) + +/* Protects access to the xarray of telemetry endpoint handles */ +static DEFINE_MUTEX(ep_lock); + enum telem_type { TELEM_TYPE_PUNIT = 0, TELEM_TYPE_CRASHLOG, @@ -84,21 +93,195 @@ static int pmt_telem_header_decode(struct intel_pmt_entry *entry, return 0; } +static int pmt_telem_add_endpoint(struct intel_pmt_entry *entry, + struct pci_dev *pdev) +{ + struct telem_endpoint *ep; + + /* Endpoint lifetimes are managed by kref, not devres */ + entry->ep = kzalloc(sizeof(*(entry->ep)), GFP_KERNEL); + if (!entry->ep) + return -ENOMEM; + + ep = entry->ep; + ep->pcidev = pdev; + ep->header.access_type = entry->header.access_type; + ep->header.guid = entry->header.guid; + ep->header.base_offset = entry->header.base_offset; + ep->header.size = entry->header.size; + ep->base = entry->base; + ep->present = true; + + kref_init(&ep->kref); + + return 0; +} + static DEFINE_XARRAY_ALLOC(telem_array); static struct intel_pmt_namespace pmt_telem_ns = { .name = "telem", .xa = &telem_array, .pmt_header_decode = pmt_telem_header_decode, + .pmt_add_endpoint = pmt_telem_add_endpoint, }; +/* Called when all users unregister and the device is removed */ +static void pmt_telem_ep_release(struct kref *kref) +{ + struct telem_endpoint *ep; + + ep = container_of(kref, struct telem_endpoint, kref); + kfree(ep); +} + +unsigned long pmt_telem_get_next_endpoint(unsigned long start) +{ + struct intel_pmt_entry *entry; + unsigned long found_idx; + + mutex_lock(&ep_lock); + xa_for_each_start(&telem_array, found_idx, entry, start) { + /* + * Return first found index after start. + * 0 is not valid id. + */ + if (found_idx > start) + break; + } + mutex_unlock(&ep_lock); + + return found_idx == start ? 0 : found_idx; +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_get_next_endpoint, INTEL_PMT_TELEMETRY); + +struct telem_endpoint *pmt_telem_register_endpoint(int devid) +{ + struct intel_pmt_entry *entry; + unsigned long index = devid; + + mutex_lock(&ep_lock); + entry = xa_find(&telem_array, &index, index, XA_PRESENT); + if (!entry) { + mutex_unlock(&ep_lock); + return ERR_PTR(-ENXIO); + } + + kref_get(&entry->ep->kref); + mutex_unlock(&ep_lock); + + return entry->ep; +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_register_endpoint, INTEL_PMT_TELEMETRY); + +void pmt_telem_unregister_endpoint(struct telem_endpoint *ep) +{ + kref_put(&ep->kref, pmt_telem_ep_release); +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_unregister_endpoint, INTEL_PMT_TELEMETRY); + +int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info) +{ + struct intel_pmt_entry *entry; + unsigned long index = devid; + int err = 0; + + if (!info) + return -EINVAL; + + mutex_lock(&ep_lock); + entry = xa_find(&telem_array, &index, index, XA_PRESENT); + if (!entry) { + err = -ENXIO; + goto unlock; + } + + info->pdev = entry->ep->pcidev; + info->header = entry->ep->header; + +unlock: + mutex_unlock(&ep_lock); + return err; + +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_get_endpoint_info, INTEL_PMT_TELEMETRY); + +int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count) +{ + u32 offset, size; + + if (!ep->present) + return -ENODEV; + + offset = SAMPLE_ID_OFFSET(id); + size = ep->header.size; + + if (offset + NUM_BYTES_QWORD(count) > size) + return -EINVAL; + + memcpy_fromio(data, ep->base + offset, NUM_BYTES_QWORD(count)); + + return ep->present ? 0 : -EPIPE; +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_read, INTEL_PMT_TELEMETRY); + +int pmt_telem_read32(struct telem_endpoint *ep, u32 id, u32 *data, u32 count) +{ + u32 offset, size; + + if (!ep->present) + return -ENODEV; + + offset = SAMPLE_ID_OFFSET32(id); + size = ep->header.size; + + if (offset + NUM_BYTES_DWORD(count) > size) + return -EINVAL; + + memcpy_fromio(data, ep->base + offset, NUM_BYTES_DWORD(count)); + + return ep->present ? 0 : -EPIPE; +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_read32, INTEL_PMT_TELEMETRY); + +struct telem_endpoint * +pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, u32 guid, u16 pos) +{ + int devid = 0; + int inst = 0; + int err = 0; + + while ((devid = pmt_telem_get_next_endpoint(devid))) { + struct telem_endpoint_info ep_info; + + err = pmt_telem_get_endpoint_info(devid, &ep_info); + if (err) + return ERR_PTR(err); + + if (ep_info.header.guid == guid && ep_info.pdev == pcidev) { + if (inst == pos) + return pmt_telem_register_endpoint(devid); + ++inst; + } + } + + return ERR_PTR(-ENXIO); +} +EXPORT_SYMBOL_NS_GPL(pmt_telem_find_and_register_endpoint, INTEL_PMT_TELEMETRY); + static void pmt_telem_remove(struct auxiliary_device *auxdev) { struct pmt_telem_priv *priv = auxiliary_get_drvdata(auxdev); int i; - for (i = 0; i < priv->num_entries; i++) - intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns); -} + mutex_lock(&ep_lock); + for (i = 0; i < priv->num_entries; i++) { + struct intel_pmt_entry *entry = &priv->entry[i]; + + kref_put(&entry->ep->kref, pmt_telem_ep_release); + intel_pmt_dev_destroy(entry, &pmt_telem_ns); + } + mutex_unlock(&ep_lock); +}; static int pmt_telem_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id) { @@ -117,7 +300,9 @@ static int pmt_telem_probe(struct auxiliary_device *auxdev, const struct auxilia for (i = 0; i < intel_vsec_dev->num_resources; i++) { struct intel_pmt_entry *entry = &priv->entry[priv->num_entries]; + mutex_lock(&ep_lock); ret = intel_pmt_dev_create(entry, &pmt_telem_ns, intel_vsec_dev, i); + mutex_unlock(&ep_lock); if (ret < 0) goto abort_probe; if (ret) diff --git a/drivers/platform/x86/intel/pmt/telemetry.h b/drivers/platform/x86/intel/pmt/telemetry.h new file mode 100644 index 000000000000..d45af5512b4e --- /dev/null +++ b/drivers/platform/x86/intel/pmt/telemetry.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TELEMETRY_H +#define _TELEMETRY_H + +/* Telemetry types */ +#define PMT_TELEM_TELEMETRY 0 +#define PMT_TELEM_CRASHLOG 1 + +struct telem_endpoint; +struct pci_dev; + +struct telem_header { + u8 access_type; + u16 size; + u32 guid; + u32 base_offset; +}; + +struct telem_endpoint_info { + struct pci_dev *pdev; + struct telem_header header; +}; + +/** + * pmt_telem_get_next_endpoint() - Get next device id for a telemetry endpoint + * @start: starting devid to look from + * + * This functions can be used in a while loop predicate to retrieve the devid + * of all available telemetry endpoints. Functions pmt_telem_get_next_endpoint() + * and pmt_telem_register_endpoint() can be used inside of the loop to examine + * endpoint info and register to receive a pointer to the endpoint. The pointer + * is then usable in the telemetry read calls to access the telemetry data. + * + * Return: + * * devid - devid of the next present endpoint from start + * * 0 - when no more endpoints are present after start + */ +unsigned long pmt_telem_get_next_endpoint(unsigned long start); + +/** + * pmt_telem_register_endpoint() - Register a telemetry endpoint + * @devid: device id/handle of the telemetry endpoint + * + * Increments the kref usage counter for the endpoint. + * + * Return: + * * endpoint - On success returns pointer to the telemetry endpoint + * * -ENXIO - telemetry endpoint not found + */ +struct telem_endpoint *pmt_telem_register_endpoint(int devid); + +/** + * pmt_telem_unregister_endpoint() - Unregister a telemetry endpoint + * @ep: ep structure to populate. + * + * Decrements the kref usage counter for the endpoint. + */ +void pmt_telem_unregister_endpoint(struct telem_endpoint *ep); + +/** + * pmt_telem_get_endpoint_info() - Get info for an endpoint from its devid + * @devid: device id/handle of the telemetry endpoint + * @info: Endpoint info structure to be populated + * + * Return: + * * 0 - Success + * * -ENXIO - telemetry endpoint not found for the devid + * * -EINVAL - @info is NULL + */ +int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info); + +/** + * pmt_telem_find_and_register_endpoint() - Get a telemetry endpoint from + * pci_dev device, guid and pos + * @pdev: PCI device inside the Intel vsec + * @guid: GUID of the telemetry space + * @pos: Instance of the guid + * + * Return: + * * endpoint - On success returns pointer to the telemetry endpoint + * * -ENXIO - telemetry endpoint not found + */ +struct telem_endpoint *pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, + u32 guid, u16 pos); + +/** + * pmt_telem_read() - Read qwords from counter sram using sample id + * @ep: Telemetry endpoint to be read + * @id: The beginning sample id of the metric(s) to be read + * @data: Allocated qword buffer + * @count: Number of qwords requested + * + * Callers must ensure reads are aligned. When the call returns -ENODEV, + * the device has been removed and callers should unregister the telemetry + * endpoint. + * + * Return: + * * 0 - Success + * * -ENODEV - The device is not present. + * * -EINVAL - The offset is out bounds + * * -EPIPE - The device was removed during the read. Data written + * but should be considered invalid. + */ +int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count); + +/** + * pmt_telem_read32() - Read qwords from counter sram using sample id + * @ep: Telemetry endpoint to be read + * @id: The beginning sample id of the metric(s) to be read + * @data: Allocated dword buffer + * @count: Number of dwords requested + * + * Callers must ensure reads are aligned. When the call returns -ENODEV, + * the device has been removed and callers should unregister the telemetry + * endpoint. + * + * Return: + * * 0 - Success + * * -ENODEV - The device is not present. + * * -EINVAL - The offset is out bounds + * * -EPIPE - The device was removed during the read. Data written + * but should be considered invalid. + */ +int pmt_telem_read32(struct telem_endpoint *ep, u32 id, u32 *data, u32 count); + +#endif From 2e35e3aa9f10ea430468207c3dd9dc33ba1afc33 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Wed, 29 Nov 2023 14:21:23 -0800 Subject: [PATCH 0247/1562] platform/x86:intel/pmc: Call pmc_get_low_power_modes from platform init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to setup a table of low power mode requirements for Meteor Lake, pmc_core_get_low_power_modes() will need to be run from platform init code so that the enabled modes are known, allowing the use of the pmc_for_each_mode helper. Make the function global and call it from the platform init code. Signed-off-by: Xi Pardee Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-12-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/adl.c | 2 ++ drivers/platform/x86/intel/pmc/cnp.c | 2 ++ drivers/platform/x86/intel/pmc/core.c | 7 +++---- drivers/platform/x86/intel/pmc/core.h | 1 + drivers/platform/x86/intel/pmc/icl.c | 10 +++++++++- drivers/platform/x86/intel/pmc/mtl.c | 4 +++- drivers/platform/x86/intel/pmc/spt.c | 10 +++++++++- drivers/platform/x86/intel/pmc/tgl.c | 1 + 8 files changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/adl.c b/drivers/platform/x86/intel/pmc/adl.c index 5006008e01be..64c492391ede 100644 --- a/drivers/platform/x86/intel/pmc/adl.c +++ b/drivers/platform/x86/intel/pmc/adl.c @@ -319,6 +319,8 @@ int adl_core_init(struct pmc_dev *pmcdev) if (ret) return ret; + pmc_core_get_low_power_modes(pmcdev); + /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. */ diff --git a/drivers/platform/x86/intel/pmc/cnp.c b/drivers/platform/x86/intel/pmc/cnp.c index 420aaa1d7c76..59298f184d0e 100644 --- a/drivers/platform/x86/intel/pmc/cnp.c +++ b/drivers/platform/x86/intel/pmc/cnp.c @@ -214,6 +214,8 @@ int cnp_core_init(struct pmc_dev *pmcdev) if (ret) return ret; + pmc_core_get_low_power_modes(pmcdev); + /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. */ diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 84c175b9721a..3894119d61b0 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -966,9 +966,8 @@ static bool pmc_core_pri_verify(u32 lpm_pri, u8 *mode_order) return true; } -static void pmc_core_get_low_power_modes(struct platform_device *pdev) +void pmc_core_get_low_power_modes(struct pmc_dev *pmcdev) { - struct pmc_dev *pmcdev = platform_get_drvdata(pdev); struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN]; u8 pri_order[LPM_MAX_NUM_MODES] = LPM_DEFAULT_PRI; u8 mode_order[LPM_MAX_NUM_MODES]; @@ -1000,7 +999,8 @@ static void pmc_core_get_low_power_modes(struct platform_device *pdev) for (mode = 0; mode < LPM_MAX_NUM_MODES; mode++) pri_order[mode_order[mode]] = mode; else - dev_warn(&pdev->dev, "Assuming a default substate order for this platform\n"); + dev_warn(&pmcdev->pdev->dev, + "Assuming a default substate order for this platform\n"); /* * Loop through all modes from lowest to highest priority, @@ -1250,7 +1250,6 @@ static int pmc_core_probe(struct platform_device *pdev) } pmcdev->pmc_xram_read_bit = pmc_core_check_read_lock_bit(primary_pmc); - pmc_core_get_low_power_modes(pdev); pmc_core_do_dmi_quirks(primary_pmc); pmc_core_dbgfs_register(pmcdev); diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index 0729f593c6a7..ccf24e0f5e50 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -490,6 +490,7 @@ extern int pmc_core_send_ltr_ignore(struct pmc_dev *pmcdev, u32 value); int pmc_core_resume_common(struct pmc_dev *pmcdev); int get_primary_reg_base(struct pmc *pmc); +extern void pmc_core_get_low_power_modes(struct pmc_dev *pmcdev); extern void pmc_core_ssram_init(struct pmc_dev *pmcdev); diff --git a/drivers/platform/x86/intel/pmc/icl.c b/drivers/platform/x86/intel/pmc/icl.c index d08e3174230d..71b0fd6cb7d8 100644 --- a/drivers/platform/x86/intel/pmc/icl.c +++ b/drivers/platform/x86/intel/pmc/icl.c @@ -53,7 +53,15 @@ const struct pmc_reg_map icl_reg_map = { int icl_core_init(struct pmc_dev *pmcdev) { struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN]; + int ret; pmc->map = &icl_reg_map; - return get_primary_reg_base(pmc); + + ret = get_primary_reg_base(pmc); + if (ret) + return ret; + + pmc_core_get_low_power_modes(pmcdev); + + return ret; } diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index 2204bc666980..c3b5f4fe01d1 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -985,7 +985,7 @@ static int mtl_resume(struct pmc_dev *pmcdev) int mtl_core_init(struct pmc_dev *pmcdev) { struct pmc *pmc = pmcdev->pmcs[PMC_IDX_SOC]; - int ret = 0; + int ret; mtl_d3_fixup(); @@ -1002,6 +1002,8 @@ int mtl_core_init(struct pmc_dev *pmcdev) return ret; } + pmc_core_get_low_power_modes(pmcdev); + /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. */ diff --git a/drivers/platform/x86/intel/pmc/spt.c b/drivers/platform/x86/intel/pmc/spt.c index 4b6f5cbda16c..ab993a69e33e 100644 --- a/drivers/platform/x86/intel/pmc/spt.c +++ b/drivers/platform/x86/intel/pmc/spt.c @@ -137,7 +137,15 @@ const struct pmc_reg_map spt_reg_map = { int spt_core_init(struct pmc_dev *pmcdev) { struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN]; + int ret; pmc->map = &spt_reg_map; - return get_primary_reg_base(pmc); + + ret = get_primary_reg_base(pmc); + if (ret) + return ret; + + pmc_core_get_low_power_modes(pmcdev); + + return ret; } diff --git a/drivers/platform/x86/intel/pmc/tgl.c b/drivers/platform/x86/intel/pmc/tgl.c index 2449940102db..d5f1d2223c5a 100644 --- a/drivers/platform/x86/intel/pmc/tgl.c +++ b/drivers/platform/x86/intel/pmc/tgl.c @@ -263,6 +263,7 @@ int tgl_core_init(struct pmc_dev *pmcdev) if (ret) return ret; + pmc_core_get_low_power_modes(pmcdev); pmc_core_get_tgl_lpm_reqs(pmcdev->pdev); /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. From 9512920a6be57af191ab2849b3ec393b8e92530a Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:24 -0800 Subject: [PATCH 0248/1562] platform/x86/intel/pmc: Allow pmc_core_ssram_init to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if the PMC SSRAM initialization fails, no error is returned and the only indication is that a PMC device has not been created. Instead, allow an error to be returned and handled directly by the caller. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-13-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.h | 2 +- drivers/platform/x86/intel/pmc/core_ssram.c | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index ccf24e0f5e50..edaa70067e41 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -492,7 +492,7 @@ int pmc_core_resume_common(struct pmc_dev *pmcdev); int get_primary_reg_base(struct pmc *pmc); extern void pmc_core_get_low_power_modes(struct pmc_dev *pmcdev); -extern void pmc_core_ssram_init(struct pmc_dev *pmcdev); +extern int pmc_core_ssram_init(struct pmc_dev *pmcdev); int spt_core_init(struct pmc_dev *pmcdev); int cnp_core_init(struct pmc_dev *pmcdev); diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 13fa16f0d52e..815950713e25 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -35,20 +35,20 @@ static inline u64 get_base(void __iomem *addr, u32 offset) return lo_hi_readq(addr + offset) & GENMASK_ULL(63, 3); } -static void +static int pmc_core_pmc_add(struct pmc_dev *pmcdev, u64 pwrm_base, const struct pmc_reg_map *reg_map, int pmc_index) { struct pmc *pmc = pmcdev->pmcs[pmc_index]; if (!pwrm_base) - return; + return -ENODEV; /* Memory for primary PMC has been allocated in core.c */ if (!pmc) { pmc = devm_kzalloc(&pmcdev->pdev->dev, sizeof(*pmc), GFP_KERNEL); if (!pmc) - return; + return -ENOMEM; } pmc->map = reg_map; @@ -57,10 +57,12 @@ pmc_core_pmc_add(struct pmc_dev *pmcdev, u64 pwrm_base, if (!pmc->regbase) { devm_kfree(&pmcdev->pdev->dev, pmc); - return; + return -ENOMEM; } pmcdev->pmcs[pmc_index] = pmc; + + return 0; } static void @@ -96,7 +98,7 @@ pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, void __iomem *ssram, u32 offset, iounmap(ssram); } -void pmc_core_ssram_init(struct pmc_dev *pmcdev) +int pmc_core_ssram_init(struct pmc_dev *pmcdev) { void __iomem *ssram; struct pci_dev *pcidev; @@ -105,7 +107,7 @@ void pmc_core_ssram_init(struct pmc_dev *pmcdev) pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(20, 2)); if (!pcidev) - goto out; + return -ENODEV; ret = pcim_enable_device(pcidev); if (ret) @@ -123,11 +125,14 @@ void pmc_core_ssram_init(struct pmc_dev *pmcdev) pmc_core_ssram_get_pmc(pmcdev, ssram, SSRAM_PCH_OFFSET, PMC_IDX_PCH); iounmap(ssram); -out: - return; + + return 0; disable_dev: + pmcdev->ssram_pcidev = NULL; pci_disable_device(pcidev); release_dev: pci_dev_put(pcidev); + + return ret; } From a01486dc4bb17de976c6d0a4b1ad5f8106525dfb Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:25 -0800 Subject: [PATCH 0249/1562] platform/x86/intel/pmc: Cleanup SSRAM discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up the code handling SSRAM discovery. Handle all resource allocation and cleanup in pmc_core_ssram_get_pmc(). Return the error status from this function but only fail the init if we fail to discover the primary PMC. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-14-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core_ssram.c | 62 +++++++++++---------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 815950713e25..c1b984255571 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -8,6 +8,7 @@ * */ +#include #include #include @@ -21,6 +22,8 @@ #define SSRAM_IOE_OFFSET 0x68 #define SSRAM_DEVID_OFFSET 0x70 +DEFINE_FREE(pmc_core_iounmap, void __iomem *, iounmap(_T)); + static const struct pmc_reg_map *pmc_core_find_regmap(struct pmc_info *list, u16 devid) { for (; list->map; ++list) @@ -65,44 +68,49 @@ pmc_core_pmc_add(struct pmc_dev *pmcdev, u64 pwrm_base, return 0; } -static void -pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, void __iomem *ssram, u32 offset, - int pmc_idx) +static int +pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, int pmc_idx, u32 offset) { - u64 pwrm_base; + struct pci_dev *ssram_pcidev = pmcdev->ssram_pcidev; + void __iomem __free(pmc_core_iounmap) *tmp_ssram = NULL; + void __iomem __free(pmc_core_iounmap) *ssram = NULL; + const struct pmc_reg_map *map; + u64 ssram_base, pwrm_base; u16 devid; - if (pmc_idx != PMC_IDX_SOC) { - u64 ssram_base = get_base(ssram, offset); + if (!pmcdev->regmap_list) + return -ENOENT; - if (!ssram_base) - return; + ssram_base = ssram_pcidev->resource[0].start; + tmp_ssram = ioremap(ssram_base, SSRAM_HDR_SIZE); + if (pmc_idx != PMC_IDX_MAIN) { + /* + * The secondary PMC BARS (which are behind hidden PCI devices) + * are read from fixed offsets in MMIO of the primary PMC BAR. + */ + ssram_base = get_base(tmp_ssram, offset); ssram = ioremap(ssram_base, SSRAM_HDR_SIZE); if (!ssram) - return; + return -ENOMEM; + + } else { + ssram = no_free_ptr(tmp_ssram); } pwrm_base = get_base(ssram, SSRAM_PWRM_OFFSET); devid = readw(ssram + SSRAM_DEVID_OFFSET); - if (pmcdev->regmap_list) { - const struct pmc_reg_map *map; + map = pmc_core_find_regmap(pmcdev->regmap_list, devid); + if (!map) + return -ENODEV; - map = pmc_core_find_regmap(pmcdev->regmap_list, devid); - if (map) - pmc_core_pmc_add(pmcdev, pwrm_base, map, pmc_idx); - } - - if (pmc_idx != PMC_IDX_SOC) - iounmap(ssram); + return pmc_core_pmc_add(pmcdev, pwrm_base, map, PMC_IDX_MAIN); } int pmc_core_ssram_init(struct pmc_dev *pmcdev) { - void __iomem *ssram; struct pci_dev *pcidev; - u64 ssram_base; int ret; pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(20, 2)); @@ -113,18 +121,14 @@ int pmc_core_ssram_init(struct pmc_dev *pmcdev) if (ret) goto release_dev; - ssram_base = pcidev->resource[0].start; - ssram = ioremap(ssram_base, SSRAM_HDR_SIZE); - if (!ssram) - goto disable_dev; - pmcdev->ssram_pcidev = pcidev; - pmc_core_ssram_get_pmc(pmcdev, ssram, 0, PMC_IDX_SOC); - pmc_core_ssram_get_pmc(pmcdev, ssram, SSRAM_IOE_OFFSET, PMC_IDX_IOE); - pmc_core_ssram_get_pmc(pmcdev, ssram, SSRAM_PCH_OFFSET, PMC_IDX_PCH); + ret = pmc_core_ssram_get_pmc(pmcdev, PMC_IDX_MAIN, 0); + if (ret) + goto disable_dev; - iounmap(ssram); + pmc_core_ssram_get_pmc(pmcdev, PMC_IDX_IOE, SSRAM_IOE_OFFSET); + pmc_core_ssram_get_pmc(pmcdev, PMC_IDX_PCH, SSRAM_PCH_OFFSET); return 0; From 642dd26f58d91f4bb2e2fcaaf178bbc35369b73a Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:26 -0800 Subject: [PATCH 0250/1562] platform/x86/intel/pmc/mtl: Use return value from pmc_core_ssram_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of checking for a NULL regbase, use the return value from pmc_core_ssram_init() to check if PMC discovery was successful. If not, use the legacy enumeration method (which only works for the primary PMC). Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-15-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/mtl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index c3b5f4fe01d1..d1d3d33fb4b8 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -990,12 +990,16 @@ int mtl_core_init(struct pmc_dev *pmcdev) mtl_d3_fixup(); pmcdev->resume = mtl_resume; - pmcdev->regmap_list = mtl_pmc_info_list; - pmc_core_ssram_init(pmcdev); - /* If regbase not assigned, set map and discover using legacy method */ - if (!pmc->regbase) { + /* + * If ssram init fails use legacy method to at least get the + * primary PMC + */ + ret = pmc_core_ssram_init(pmcdev); + if (ret) { + dev_warn(&pmcdev->pdev->dev, + "ssram init failed, %d, using legacy init\n", ret); pmc->map = &mtl_socm_reg_map; ret = get_primary_reg_base(pmc); if (ret) From 104f74943f4830f3a65fb96565b89014c882db85 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:27 -0800 Subject: [PATCH 0251/1562] platform/x86/intel/pmc: Find and register PMC telemetry entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PMC SSRAM device contains counters that are structured in Intel Platform Monitoring Technology (PMT) telemetry regions. Look for and register these telemetry regions from the driver so that they may be read using the Intel PMT ABI. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-16-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/Kconfig | 1 + drivers/platform/x86/intel/pmc/core_ssram.c | 49 +++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/Kconfig b/drivers/platform/x86/intel/pmc/Kconfig index b526597e4deb..d2f651fbec2c 100644 --- a/drivers/platform/x86/intel/pmc/Kconfig +++ b/drivers/platform/x86/intel/pmc/Kconfig @@ -7,6 +7,7 @@ config INTEL_PMC_CORE tristate "Intel PMC Core driver" depends on PCI depends on ACPI + depends on INTEL_PMT_TELEMETRY help The Intel Platform Controller Hub for Intel Core SoCs provides access to Power Management Controller registers via various interfaces. This diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index c1b984255571..9ca720f9cbb2 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -13,6 +13,8 @@ #include #include "core.h" +#include "../vsec.h" +#include "../pmt/telemetry.h" #define SSRAM_HDR_SIZE 0x100 #define SSRAM_PWRM_OFFSET 0x14 @@ -24,6 +26,49 @@ DEFINE_FREE(pmc_core_iounmap, void __iomem *, iounmap(_T)); +static void +pmc_add_pmt(struct pmc_dev *pmcdev, u64 ssram_base, void __iomem *ssram) +{ + struct pci_dev *pcidev = pmcdev->ssram_pcidev; + struct intel_vsec_platform_info info = {}; + struct intel_vsec_header *headers[2] = {}; + struct intel_vsec_header header; + void __iomem *dvsec; + u32 dvsec_offset; + u32 table, hdr; + + ssram = ioremap(ssram_base, SSRAM_HDR_SIZE); + if (!ssram) + return; + + dvsec_offset = readl(ssram + SSRAM_DVSEC_OFFSET); + iounmap(ssram); + + dvsec = ioremap(ssram_base + dvsec_offset, SSRAM_DVSEC_SIZE); + if (!dvsec) + return; + + hdr = readl(dvsec + PCI_DVSEC_HEADER1); + header.id = readw(dvsec + PCI_DVSEC_HEADER2); + header.rev = PCI_DVSEC_HEADER1_REV(hdr); + header.length = PCI_DVSEC_HEADER1_LEN(hdr); + header.num_entries = readb(dvsec + INTEL_DVSEC_ENTRIES); + header.entry_size = readb(dvsec + INTEL_DVSEC_SIZE); + + table = readl(dvsec + INTEL_DVSEC_TABLE); + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + iounmap(dvsec); + + headers[0] = &header; + info.caps = VSEC_CAP_TELEMETRY; + info.headers = headers; + info.base_addr = ssram_base; + info.parent = &pmcdev->pdev->dev; + + intel_vsec_register(pcidev, &info); +} + static const struct pmc_reg_map *pmc_core_find_regmap(struct pmc_info *list, u16 devid) { for (; list->map; ++list) @@ -101,6 +146,9 @@ pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, int pmc_idx, u32 offset) pwrm_base = get_base(ssram, SSRAM_PWRM_OFFSET); devid = readw(ssram + SSRAM_DEVID_OFFSET); + /* Find and register and PMC telemetry entries */ + pmc_add_pmt(pmcdev, ssram_base, ssram); + map = pmc_core_find_regmap(pmcdev->regmap_list, devid); if (!map) return -ENODEV; @@ -140,3 +188,4 @@ release_dev: return ret; } +MODULE_IMPORT_NS(INTEL_VSEC); From 0f601dec1856d675a1251e25e858d8f1cb0b8026 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Wed, 29 Nov 2023 14:21:28 -0800 Subject: [PATCH 0252/1562] platform/x86/intel/pmc: Display LPM requirements for multiple PMCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the substate_requirements attribute to display the requirements for all the PMCs on a package. Signed-off-by: Rajvi Jingar Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-17-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.c | 113 ++++++++++++++------------ 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 3894119d61b0..9d3a1b6ef622 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -728,7 +728,7 @@ static int pmc_core_substate_l_sts_regs_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_l_sts_regs); -static void pmc_core_substate_req_header_show(struct seq_file *s) +static void pmc_core_substate_req_header_show(struct seq_file *s, int pmc_index) { struct pmc_dev *pmcdev = s->private; int i, mode; @@ -743,68 +743,81 @@ static void pmc_core_substate_req_header_show(struct seq_file *s) static int pmc_core_substate_req_regs_show(struct seq_file *s, void *unused) { struct pmc_dev *pmcdev = s->private; - struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN]; - const struct pmc_bit_map **maps = pmc->map->lpm_sts; - const struct pmc_bit_map *map; - const int num_maps = pmc->map->lpm_num_maps; - u32 sts_offset = pmc->map->lpm_status_offset; - u32 *lpm_req_regs = pmc->lpm_req_regs; - int mp; + u32 sts_offset; + u32 *lpm_req_regs; + int num_maps, mp, pmc_index; - /* Display the header */ - pmc_core_substate_req_header_show(s); + for (pmc_index = 0; pmc_index < ARRAY_SIZE(pmcdev->pmcs); ++pmc_index) { + struct pmc *pmc = pmcdev->pmcs[pmc_index]; + const struct pmc_bit_map **maps; - /* Loop over maps */ - for (mp = 0; mp < num_maps; mp++) { - u32 req_mask = 0; - u32 lpm_status; - int mode, idx, i, len = 32; + if (!pmc) + continue; + + maps = pmc->map->lpm_sts; + num_maps = pmc->map->lpm_num_maps; + sts_offset = pmc->map->lpm_status_offset; + lpm_req_regs = pmc->lpm_req_regs; /* - * Capture the requirements and create a mask so that we only - * show an element if it's required for at least one of the - * enabled low power modes + * When there are multiple PMCs, though the PMC may exist, the + * requirement register discovery could have failed so check + * before accessing. */ - pmc_for_each_mode(idx, mode, pmcdev) - req_mask |= lpm_req_regs[mp + (mode * num_maps)]; + if (!lpm_req_regs) + continue; - /* Get the last latched status for this map */ - lpm_status = pmc_core_reg_read(pmc, sts_offset + (mp * 4)); + /* Display the header */ + pmc_core_substate_req_header_show(s, pmc_index); - /* Loop over elements in this map */ - map = maps[mp]; - for (i = 0; map[i].name && i < len; i++) { - u32 bit_mask = map[i].bit_mask; + /* Loop over maps */ + for (mp = 0; mp < num_maps; mp++) { + u32 req_mask = 0; + u32 lpm_status; + const struct pmc_bit_map *map; + int mode, idx, i, len = 32; - if (!(bit_mask & req_mask)) - /* - * Not required for any enabled states - * so don't display - */ - continue; + /* + * Capture the requirements and create a mask so that we only + * show an element if it's required for at least one of the + * enabled low power modes + */ + pmc_for_each_mode(idx, mode, pmcdev) + req_mask |= lpm_req_regs[mp + (mode * num_maps)]; - /* Display the element name in the first column */ - seq_printf(s, "%30s |", map[i].name); + /* Get the last latched status for this map */ + lpm_status = pmc_core_reg_read(pmc, sts_offset + (mp * 4)); - /* Loop over the enabled states and display if required */ - pmc_for_each_mode(idx, mode, pmcdev) { - if (lpm_req_regs[mp + (mode * num_maps)] & bit_mask) - seq_printf(s, " %9s |", - "Required"); - else - seq_printf(s, " %9s |", " "); + /* Loop over elements in this map */ + map = maps[mp]; + for (i = 0; map[i].name && i < len; i++) { + u32 bit_mask = map[i].bit_mask; + + if (!(bit_mask & req_mask)) { + /* + * Not required for any enabled states + * so don't display + */ + continue; + } + + /* Display the element name in the first column */ + seq_printf(s, "pmc%d: %26s |", pmc_index, map[i].name); + + /* Loop over the enabled states and display if required */ + pmc_for_each_mode(idx, mode, pmcdev) { + bool required = lpm_req_regs[mp + (mode * num_maps)] & + bit_mask; + seq_printf(s, " %9s |", required ? "Required" : " "); + } + + /* In Status column, show the last captured state of this agent */ + seq_printf(s, " %9s |", lpm_status & bit_mask ? "Yes" : " "); + + seq_puts(s, "\n"); } - - /* In Status column, show the last captured state of this agent */ - if (lpm_status & bit_mask) - seq_printf(s, " %9s |", "Yes"); - else - seq_printf(s, " %9s |", " "); - - seq_puts(s, "\n"); } } - return 0; } DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_req_regs); From 4d621c3f02ba71cb8ed48b7c32ecb0910000cc28 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Wed, 29 Nov 2023 14:21:29 -0800 Subject: [PATCH 0253/1562] platform/x86/intel/pmc: Retrieve LPM information using Intel PMT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On supported platforms, the low power mode (LPM) requirements for entering each idle substate are described in Platform Monitoring Technology (PMT) telemetry entries. Provide a function for platform code to attempt to find and read the requirements from the telemetry entries. Signed-off-by: Xi Pardee Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-18-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.h | 3 + drivers/platform/x86/intel/pmc/core_ssram.c | 135 ++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index edaa70067e41..85b6f6ae4995 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -320,6 +320,7 @@ struct pmc_reg_map { const u32 lpm_status_offset; const u32 lpm_live_status_offset; const u32 etr3_offset; + const u8 *lpm_reg_index; }; /** @@ -329,6 +330,7 @@ struct pmc_reg_map { * specific attributes */ struct pmc_info { + u32 guid; u16 devid; const struct pmc_reg_map *map; }; @@ -486,6 +488,7 @@ extern const struct pmc_bit_map *mtl_ioem_lpm_maps[]; extern const struct pmc_reg_map mtl_ioem_reg_map; extern void pmc_core_get_tgl_lpm_reqs(struct platform_device *pdev); +extern int pmc_core_ssram_get_lpm_reqs(struct pmc_dev *pmcdev); extern int pmc_core_send_ltr_ignore(struct pmc_dev *pmcdev, u32 value); int pmc_core_resume_common(struct pmc_dev *pmcdev); diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 9ca720f9cbb2..3501c7bd6b33 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -24,8 +24,142 @@ #define SSRAM_IOE_OFFSET 0x68 #define SSRAM_DEVID_OFFSET 0x70 +/* PCH query */ +#define LPM_HEADER_OFFSET 1 +#define LPM_REG_COUNT 28 +#define LPM_MODE_OFFSET 1 + DEFINE_FREE(pmc_core_iounmap, void __iomem *, iounmap(_T)); +static u32 pmc_core_find_guid(struct pmc_info *list, const struct pmc_reg_map *map) +{ + for (; list->map; ++list) + if (list->map == map) + return list->guid; + + return 0; +} + +static int pmc_core_get_lpm_req(struct pmc_dev *pmcdev, struct pmc *pmc) +{ + struct telem_endpoint *ep; + const u8 *lpm_indices; + int num_maps, mode_offset = 0; + int ret, mode, i; + int lpm_size; + u32 guid; + + lpm_indices = pmc->map->lpm_reg_index; + num_maps = pmc->map->lpm_num_maps; + lpm_size = LPM_MAX_NUM_MODES * num_maps; + + guid = pmc_core_find_guid(pmcdev->regmap_list, pmc->map); + if (!guid) + return -ENXIO; + + ep = pmt_telem_find_and_register_endpoint(pmcdev->ssram_pcidev, guid, 0); + if (IS_ERR(ep)) { + dev_dbg(&pmcdev->pdev->dev, "couldn't get telem endpoint %ld", + PTR_ERR(ep)); + return -EPROBE_DEFER; + } + + pmc->lpm_req_regs = devm_kzalloc(&pmcdev->pdev->dev, + lpm_size * sizeof(u32), + GFP_KERNEL); + if (!pmc->lpm_req_regs) { + ret = -ENOMEM; + goto unregister_ep; + } + + /* + * PMC Low Power Mode (LPM) table + * + * In telemetry space, the LPM table contains a 4 byte header followed + * by 8 consecutive mode blocks (one for each LPM mode). Each block + * has a 4 byte header followed by a set of registers that describe the + * IP state requirements for the given mode. The IP mapping is platform + * specific but the same for each block, making for easy analysis. + * Platforms only use a subset of the space to track the requirements + * for their IPs. Callers provide the requirement registers they use as + * a list of indices. Each requirement register is associated with an + * IP map that's maintained by the caller. + * + * Header + * +----+----------------------------+----------------------------+ + * | 0 | REVISION | ENABLED MODES | + * +----+--------------+-------------+-------------+--------------+ + * + * Low Power Mode 0 Block + * +----+--------------+-------------+-------------+--------------+ + * | 1 | SUB ID | SIZE | MAJOR | MINOR | + * +----+--------------+-------------+-------------+--------------+ + * | 2 | LPM0 Requirements 0 | + * +----+---------------------------------------------------------+ + * | | ... | + * +----+---------------------------------------------------------+ + * | 29 | LPM0 Requirements 27 | + * +----+---------------------------------------------------------+ + * + * ... + * + * Low Power Mode 7 Block + * +----+--------------+-------------+-------------+--------------+ + * | | SUB ID | SIZE | MAJOR | MINOR | + * +----+--------------+-------------+-------------+--------------+ + * | 60 | LPM7 Requirements 0 | + * +----+---------------------------------------------------------+ + * | | ... | + * +----+---------------------------------------------------------+ + * | 87 | LPM7 Requirements 27 | + * +----+---------------------------------------------------------+ + * + */ + mode_offset = LPM_HEADER_OFFSET + LPM_MODE_OFFSET; + pmc_for_each_mode(i, mode, pmcdev) { + u32 *req_offset = pmc->lpm_req_regs + (mode * num_maps); + int m; + + for (m = 0; m < num_maps; m++) { + u8 sample_id = lpm_indices[m] + mode_offset; + + ret = pmt_telem_read32(ep, sample_id, req_offset, 1); + if (ret) { + dev_err(&pmcdev->pdev->dev, + "couldn't read Low Power Mode requirements: %d\n", ret); + devm_kfree(&pmcdev->pdev->dev, pmc->lpm_req_regs); + goto unregister_ep; + } + ++req_offset; + } + mode_offset += LPM_REG_COUNT + LPM_MODE_OFFSET; + } + +unregister_ep: + pmt_telem_unregister_endpoint(ep); + + return ret; +} + +int pmc_core_ssram_get_lpm_reqs(struct pmc_dev *pmcdev) +{ + int ret, i; + + if (!pmcdev->ssram_pcidev) + return -ENODEV; + + for (i = 0; i < ARRAY_SIZE(pmcdev->pmcs); ++i) { + if (!pmcdev->pmcs[i]) + continue; + + ret = pmc_core_get_lpm_req(pmcdev, pmcdev->pmcs[i]); + if (ret) + return ret; + } + + return 0; +} + static void pmc_add_pmt(struct pmc_dev *pmcdev, u64 ssram_base, void __iomem *ssram) { @@ -189,3 +323,4 @@ release_dev: return ret; } MODULE_IMPORT_NS(INTEL_VSEC); +MODULE_IMPORT_NS(INTEL_PMT_TELEMETRY); From 935b8211a31a52c82150b2b83c8428859393860d Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Wed, 29 Nov 2023 14:21:30 -0800 Subject: [PATCH 0254/1562] platform/x86/intel/pmc: Read low power mode requirements for MTL-M and MTL-P MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to read the low power mode requirements for Meteor Lake M and Meteor Lake P. Signed-off-by: Xi Pardee Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-19-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/mtl.c | 39 +++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index d1d3d33fb4b8..7ceeae507f4c 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -11,6 +11,13 @@ #include #include "core.h" +/* PMC SSRAM PMT Telemetry GUIDS */ +#define SOCP_LPM_REQ_GUID 0x2625030 +#define IOEM_LPM_REQ_GUID 0x4357464 +#define IOEP_LPM_REQ_GUID 0x5077612 + +static const u8 MTL_LPM_REG_INDEX[] = {0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20}; + /* * Die Mapping to Product. * Product SOCDie IOEDie PCHDie @@ -465,6 +472,7 @@ const struct pmc_reg_map mtl_socm_reg_map = { .lpm_sts = mtl_socm_lpm_maps, .lpm_status_offset = MTL_LPM_STATUS_OFFSET, .lpm_live_status_offset = MTL_LPM_LIVE_STATUS_OFFSET, + .lpm_reg_index = MTL_LPM_REG_INDEX, }; const struct pmc_bit_map mtl_ioep_pfear_map[] = { @@ -782,6 +790,13 @@ const struct pmc_reg_map mtl_ioep_reg_map = { .ltr_show_sts = mtl_ioep_ltr_show_map, .ltr_ignore_offset = CNP_PMC_LTR_IGNORE_OFFSET, .ltr_ignore_max = ADL_NUM_IP_IGN_ALLOWED, + .lpm_num_maps = ADL_LPM_NUM_MAPS, + .lpm_res_counter_step_x2 = TGL_PMC_LPM_RES_COUNTER_STEP_X2, + .lpm_residency_offset = MTL_LPM_RESIDENCY_OFFSET, + .lpm_priority_offset = MTL_LPM_PRI_OFFSET, + .lpm_en_offset = MTL_LPM_EN_OFFSET, + .lpm_sts_latch_en_offset = MTL_LPM_STATUS_LATCH_EN_OFFSET, + .lpm_reg_index = MTL_LPM_REG_INDEX, }; const struct pmc_bit_map mtl_ioem_pfear_map[] = { @@ -922,6 +937,13 @@ const struct pmc_reg_map mtl_ioem_reg_map = { .ltr_show_sts = mtl_ioep_ltr_show_map, .ltr_ignore_offset = CNP_PMC_LTR_IGNORE_OFFSET, .ltr_ignore_max = ADL_NUM_IP_IGN_ALLOWED, + .lpm_sts_latch_en_offset = MTL_LPM_STATUS_LATCH_EN_OFFSET, + .lpm_num_maps = ADL_LPM_NUM_MAPS, + .lpm_priority_offset = MTL_LPM_PRI_OFFSET, + .lpm_en_offset = MTL_LPM_EN_OFFSET, + .lpm_res_counter_step_x2 = TGL_PMC_LPM_RES_COUNTER_STEP_X2, + .lpm_residency_offset = MTL_LPM_RESIDENCY_OFFSET, + .lpm_reg_index = MTL_LPM_REG_INDEX, }; #define PMC_DEVID_SOCM 0x7e7f @@ -929,16 +951,19 @@ const struct pmc_reg_map mtl_ioem_reg_map = { #define PMC_DEVID_IOEM 0x7ebf static struct pmc_info mtl_pmc_info_list[] = { { - .devid = PMC_DEVID_SOCM, - .map = &mtl_socm_reg_map, + .guid = SOCP_LPM_REQ_GUID, + .devid = PMC_DEVID_SOCM, + .map = &mtl_socm_reg_map, }, { - .devid = PMC_DEVID_IOEP, - .map = &mtl_ioep_reg_map, + .guid = IOEP_LPM_REQ_GUID, + .devid = PMC_DEVID_IOEP, + .map = &mtl_ioep_reg_map, }, { - .devid = PMC_DEVID_IOEM, - .map = &mtl_ioem_reg_map + .guid = IOEM_LPM_REQ_GUID, + .devid = PMC_DEVID_IOEM, + .map = &mtl_ioem_reg_map }, {} }; @@ -1014,5 +1039,5 @@ int mtl_core_init(struct pmc_dev *pmcdev) dev_dbg(&pmcdev->pdev->dev, "ignoring GBE LTR\n"); pmc_core_send_ltr_ignore(pmcdev, 3); - return 0; + return pmc_core_ssram_get_lpm_reqs(pmcdev); } From 3621df43b07d9a32e18309de569f43a8b6453966 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:31 -0800 Subject: [PATCH 0255/1562] platform/x86/intel/pmc: Add debug attribute for Die C6 counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a "die_c6_us_show" debugfs attribute. Reads the counter value using Intel Platform Monitoring Technology (PMT) driver API. This counter is useful for determining the idle residency of CPUs in the compute tile. Also adds a missing forward declaration for punit_ep which was declared in an earlier upstream commit but only used for the first time in this one. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-20-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.c | 55 +++++++++++++++++++++++++++ drivers/platform/x86/intel/pmc/core.h | 4 ++ 2 files changed, 59 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 9d3a1b6ef622..983e3a8f4910 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include "core.h" +#include "../pmt/telemetry.h" /* Maximum number of modes supported by platfoms that has low power mode capability */ const char *pmc_lpm_modes[] = { @@ -822,6 +824,47 @@ static int pmc_core_substate_req_regs_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_req_regs); +static unsigned int pmc_core_get_crystal_freq(void) +{ + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + return ecx_hz; +} + +static int pmc_core_die_c6_us_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmcdev = s->private; + u64 die_c6_res, count; + int ret; + + if (!pmcdev->crystal_freq) { + dev_warn_once(&pmcdev->pdev->dev, "Crystal frequency unavailable\n"); + return -ENXIO; + } + + ret = pmt_telem_read(pmcdev->punit_ep, pmcdev->die_c6_offset, + &count, 1); + if (ret) + return ret; + + die_c6_res = div64_u64(count * HZ_PER_MHZ, pmcdev->crystal_freq); + seq_printf(s, "%llu\n", die_c6_res); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(pmc_core_die_c6_us); + static int pmc_core_lpm_latch_mode_show(struct seq_file *s, void *unused) { struct pmc_dev *pmcdev = s->private; @@ -1118,6 +1161,12 @@ static void pmc_core_dbgfs_register(struct pmc_dev *pmcdev) pmcdev->dbgfs_dir, pmcdev, &pmc_core_substate_req_regs_fops); } + + if (pmcdev->has_die_c6) { + debugfs_create_file("die_c6_us_show", 0444, + pmcdev->dbgfs_dir, pmcdev, + &pmc_core_die_c6_us_fops); + } } static const struct x86_cpu_id intel_pmc_core_ids[] = { @@ -1212,6 +1261,10 @@ static void pmc_core_clean_structure(struct platform_device *pdev) pci_dev_put(pmcdev->ssram_pcidev); pci_disable_device(pmcdev->ssram_pcidev); } + + if (pmcdev->punit_ep) + pmt_telem_unregister_endpoint(pmcdev->punit_ep); + platform_set_drvdata(pdev, NULL); mutex_destroy(&pmcdev->lock); } @@ -1232,6 +1285,8 @@ static int pmc_core_probe(struct platform_device *pdev) if (!pmcdev) return -ENOMEM; + pmcdev->crystal_freq = pmc_core_get_crystal_freq(); + platform_set_drvdata(pdev, pmcdev); pmcdev->pdev = pdev; diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index 85b6f6ae4995..6d7673145f90 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -16,6 +16,8 @@ #include #include +struct telem_endpoint; + #define SLP_S0_RES_COUNTER_MASK GENMASK(31, 0) #define PMC_BASE_ADDR_DEFAULT 0xFE000000 @@ -357,6 +359,7 @@ struct pmc { * @devs: pointer to an array of pmc pointers * @pdev: pointer to platform_device struct * @ssram_pcidev: pointer to pci device struct for the PMC SSRAM + * @crystal_freq: crystal frequency from cpuid * @dbgfs_dir: path to debugfs interface * @pmc_xram_read_bit: flag to indicate whether PMC XRAM shadow registers * used to read MPHY PG and PLL status are available @@ -374,6 +377,7 @@ struct pmc_dev { struct dentry *dbgfs_dir; struct platform_device *pdev; struct pci_dev *ssram_pcidev; + unsigned int crystal_freq; int pmc_xram_read_bit; struct mutex lock; /* generic mutex lock for PMC Core */ From 6e79648553818bb21021ccf72ae27f4508844818 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 29 Nov 2023 14:21:32 -0800 Subject: [PATCH 0256/1562] platform/x86/intel/pmc: Show Die C6 counter on Meteor Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the Die C6 counter on Meteor Lake. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231129222132.2331261-21-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/mtl.c | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index 7ceeae507f4c..38c2f946ec23 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -10,12 +10,17 @@ #include #include "core.h" +#include "../pmt/telemetry.h" /* PMC SSRAM PMT Telemetry GUIDS */ #define SOCP_LPM_REQ_GUID 0x2625030 #define IOEM_LPM_REQ_GUID 0x4357464 #define IOEP_LPM_REQ_GUID 0x5077612 +/* Die C6 from PUNIT telemetry */ +#define MTL_PMT_DMU_DIE_C6_OFFSET 15 +#define MTL_PMT_DMU_GUID 0x1A067102 + static const u8 MTL_LPM_REG_INDEX[] = {0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20}; /* @@ -968,6 +973,32 @@ static struct pmc_info mtl_pmc_info_list[] = { {} }; +static void mtl_punit_pmt_init(struct pmc_dev *pmcdev) +{ + struct telem_endpoint *ep; + struct pci_dev *pcidev; + + pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(10, 0)); + if (!pcidev) { + dev_err(&pmcdev->pdev->dev, "PUNIT PMT device not found.\n"); + return; + } + + ep = pmt_telem_find_and_register_endpoint(pcidev, MTL_PMT_DMU_GUID, 0); + if (IS_ERR(ep)) { + dev_err(&pmcdev->pdev->dev, + "pmc_core: couldn't get DMU telem endpoint, %ld\n", + PTR_ERR(ep)); + return; + } + + pci_dev_put(pcidev); + pmcdev->punit_ep = ep; + + pmcdev->has_die_c6 = true; + pmcdev->die_c6_offset = MTL_PMT_DMU_DIE_C6_OFFSET; +} + #define MTL_GNA_PCI_DEV 0x7e4c #define MTL_IPU_PCI_DEV 0x7d19 #define MTL_VPU_PCI_DEV 0x7d1d @@ -1032,6 +1063,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) } pmc_core_get_low_power_modes(pmcdev); + mtl_punit_pmt_init(pmcdev); /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. From 35ddd61cf023b5deb2b7e9f1627abef053281c0a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Dec 2023 15:29:29 +0300 Subject: [PATCH 0257/1562] platform/x86: x86-android-tablets: Fix an IS_ERR() vs NULL check in probe The spi_new_device() function returns NULL on error, it doesn't return error pointers. Fixes: 70505ea6de24 ("platform/x86: x86-android-tablets: Add support for SPI device instantiation") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/4b1b2395-c7c5-44a4-b0b0-6d091c7f46a2@moroto.mountain Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/x86-android-tablets/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/x86-android-tablets/core.c b/drivers/platform/x86/x86-android-tablets/core.c index 6a5975ac3286..f8221a15575b 100644 --- a/drivers/platform/x86/x86-android-tablets/core.c +++ b/drivers/platform/x86/x86-android-tablets/core.c @@ -220,8 +220,8 @@ static __init int x86_instantiate_spi_dev(const struct x86_dev_info *dev_info, i spi_devs[idx] = spi_new_device(controller, &board_info); put_device(&controller->dev); - if (IS_ERR(spi_devs[idx])) - return dev_err_probe(&controller->dev, PTR_ERR(spi_devs[idx]), + if (!spi_devs[idx]) + return dev_err_probe(&controller->dev, -ENOMEM, "creating SPI-device %d\n", idx); return 0; From 422e7d54375889484b66962d1dcbc392a6bd9e7a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:25 +0000 Subject: [PATCH 0258/1562] slub: Prepare __slab_free() for unfrozen partial slab out of node partial list Now the partially empty slub will be frozen when taken out of node partial list, so the __slab_free() will know from "was_frozen" that the partially empty slab is not on node partial list and is a cpu or cpu partial slab of some cpu. But we will change this, make partial slabs leave the node partial list with unfrozen state, so we need to change __slab_free() to use the new slab_test_node_partial() we just introduced. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 6efcbf79fd2d..18f18fbbd97e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3631,6 +3631,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; + bool on_node_partial; stat(s, FREE_SLOWPATH); @@ -3678,6 +3679,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, */ spin_lock_irqsave(&n->list_lock, flags); + on_node_partial = slab_test_node_partial(slab); } } @@ -3706,6 +3708,15 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * This slab was partially empty but not on the per-node partial list, + * in which case we shouldn't manipulate its list, just return. + */ + if (prior && !on_node_partial) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; From 213094b5d1af7e6ab294a6d8f3b50cafb72642ae Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:26 +0000 Subject: [PATCH 0259/1562] slub: Introduce freeze_slab() We will have unfrozen slabs out of the node partial list later, so we need a freeze_slab() function to freeze the partial slab and get its freelist. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 18f18fbbd97e..253626ef9f37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3098,6 +3098,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) return freelist; } +/* + * Freeze the partial slab and return the pointer to the freelist. + */ +static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +{ + struct slab new; + unsigned long counters; + void *freelist; + + do { + freelist = slab->freelist; + counters = slab->counters; + + new.counters = counters; + VM_BUG_ON(new.frozen); + + new.inuse = slab->objects; + new.frozen = 1; + + } while (!slab_update_freelist(s, slab, + freelist, counters, + NULL, new.counters, + "freeze_slab")); + + return freelist; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. From 8cd3fa428b56352beaa38df756c1d3f1556f5514 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:27 +0000 Subject: [PATCH 0260/1562] slub: Delay freezing of partial slabs Now we will freeze slabs when moving them out of node partial list to cpu partial list, this method needs two cmpxchg_double operations: 1. freeze slab (acquire_slab()) under the node list_lock 2. get_freelist() when pick used in ___slab_alloc() Actually we don't need to freeze when moving slabs out of node partial list, we can delay freezing to when use slab freelist in ___slab_alloc(), so we can save one cmpxchg_double(). And there are other good points: - The moving of slabs between node partial list and cpu partial list becomes simpler, since we don't need to freeze or unfreeze at all. - The node list_lock contention would be less, since we don't need to freeze any slab under the node list_lock. We can achieve this because there is no concurrent path would manipulate the partial slab list except the __slab_free() path, which is now serialized by slab_test_node_partial() under the list_lock. Since the slab returned by get_partial() interfaces is not frozen anymore and no freelist is returned in the partial_context, so we need to use the introduced freeze_slab() to freeze it and get its freelist. Similarly, the slabs on the CPU partial list are not frozen anymore, we need to freeze_slab() on it before use. We can now delete acquire_slab() as it became unused. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 115 ++++++++++++------------------------------------------ 1 file changed, 24 insertions(+), 91 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 253626ef9f37..5a5102a4c273 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2166,7 +2166,7 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a * slab from the n->partial list. Remove only a single object from the slab, do * the alloc_debug_processing() checks and leave the slab on the list, or move * it to full list if it was the last free object. @@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, return object; } -/* - * Remove slab from the partial list, freeze it and - * return the pointer to the freelist. - * - * Returns a list of objects or NULL if it fails. - */ -static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct slab *slab, - int mode) -{ - void *freelist; - unsigned long counters; - struct slab new; - - lockdep_assert_held(&n->list_lock); - - /* - * Zap the freelist and set the frozen bit. - * The old freelist is the list of objects for the - * per cpu allocation list. - */ - freelist = slab->freelist; - counters = slab->counters; - new.counters = counters; - if (mode) { - new.inuse = slab->objects; - new.freelist = NULL; - } else { - new.freelist = freelist; - } - - VM_BUG_ON(new.frozen); - new.frozen = 1; - - if (!__slab_update_freelist(s, slab, - freelist, counters, - new.freelist, new.counters, - "acquire_slab")) - return NULL; - - remove_partial(n, slab); - WARN_ON(!freelist); - return freelist; -} - #ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); #else @@ -2295,7 +2250,6 @@ static struct slab *get_partial_node(struct kmem_cache *s, struct partial_context *pc) { struct slab *slab, *slab2, *partial = NULL; - void *object = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2314,7 +2268,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - object = alloc_single_from_partial(s, n, slab, + void *object = alloc_single_from_partial(s, n, slab, pc->orig_size); if (object) { partial = slab; @@ -2324,13 +2278,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, continue; } - object = acquire_slab(s, n, slab, object == NULL); - if (!object) - break; + remove_partial(n, slab); if (!partial) { partial = slab; - pc->object = object; stat(s, ALLOC_FROM_PARTIAL); } else { put_cpu_partial(s, slab, 0); @@ -2629,9 +2580,6 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) unsigned long flags = 0; while (partial_slab) { - struct slab new; - struct slab old; - slab = partial_slab; partial_slab = slab->next; @@ -2644,23 +2592,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) spin_lock_irqsave(&n->list_lock, flags); } - do { - - old.freelist = slab->freelist; - old.counters = slab->counters; - VM_BUG_ON(!old.frozen); - - new.counters = old.counters; - new.freelist = old.freelist; - - new.frozen = 0; - - } while (!__slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); - - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { slab->next = slab_to_discard; slab_to_discard = slab; } else { @@ -3167,7 +3099,6 @@ reread_slab: node = NUMA_NO_NODE; goto new_slab; } -redo: if (unlikely(!node_match(slab, node))) { /* @@ -3243,7 +3174,8 @@ deactivate_slab: new_slab: - if (slub_percpu_partial(c)) { +#ifdef CONFIG_SLUB_CPU_PARTIAL + while (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); @@ -3255,12 +3187,22 @@ new_slab: goto new_objects; } - slab = c->slab = slub_percpu_partial(c); + slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); - goto redo; + + if (unlikely(!node_match(slab, node) || + !pfmemalloc_match(slab, gfpflags))) { + slab->next = NULL; + __unfreeze_partials(s, slab); + continue; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; } +#endif new_objects: @@ -3268,8 +3210,8 @@ new_objects: pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { - freelist = pc.object; if (kmem_cache_debug(s)) { + freelist = pc.object; /* * For debug caches here we had to go through * alloc_single_from_partial() so just store the @@ -3281,6 +3223,7 @@ new_objects: return freelist; } + freelist = freeze_slab(s, slab); goto retry_load_slab; } @@ -3682,18 +3625,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, was_frozen = new.frozen; new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { - - if (kmem_cache_has_cpu_partial(s) && !prior) { - - /* - * Slab was on no list before and will be - * partially empty - * We can defer the list move and instead - * freeze it. - */ - new.frozen = 1; - - } else { /* Needs to be taken off a list */ + /* Needs to be taken off a list */ + if (!kmem_cache_has_cpu_partial(s) || prior) { n = get_node(s, slab_nid(slab)); /* @@ -3723,9 +3656,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (new.frozen) { + } else if (kmem_cache_has_cpu_partial(s) && !prior) { /* - * If we just froze the slab then put it onto the + * If we started with a full slab then put it onto the * per cpu partial list. */ put_cpu_partial(s, slab, 1); From 00eb60c28815e22690834b2e3951ded0cd300b8d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:28 +0000 Subject: [PATCH 0261/1562] slub: Optimize deactivate_slab() Since the introduce of unfrozen slabs on cpu partial list, we don't need to synchronize the slab frozen state under the node list_lock. The caller of deactivate_slab() and the caller of __slab_free() won't manipulate the slab list concurrently. So we can get node list_lock in the last stage if we really need to manipulate the slab list in this path. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 81 +++++++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5a5102a4c273..47655f2fe55a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2468,10 +2468,8 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; - enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; @@ -2509,65 +2507,40 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, /* * Stage two: Unfreeze the slab while splicing the per-cpu * freelist to the head of slab's freelist. - * - * Ensure that the slab is unfrozen while the list presence - * reflects the actual number of objects during unfreeze. - * - * We first perform cmpxchg holding lock and insert to list - * when it succeed. If there is mismatch then the slab is not - * unfrozen and number of objects in the slab may have changed. - * Then release lock and retry cmpxchg again. */ -redo: + do { + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else - new.freelist = old.freelist; - - new.frozen = 0; + /* Determine target state of the slab */ + new.counters = old.counters; + new.frozen = 0; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else { + new.freelist = old.freelist; + } + } while (!slab_update_freelist(s, slab, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + /* + * Stage three: Manipulate the slab list based on the updated state. + */ if (!new.inuse && n->nr_partial >= s->min_partial) { - mode = M_FREE; - } else if (new.freelist) { - mode = M_PARTIAL; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ - spin_lock_irqsave(&n->list_lock, flags); - } else { - mode = M_FULL_NOLIST; - } - - - if (!slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) { - if (mode == M_PARTIAL) - spin_unlock_irqrestore(&n->list_lock, flags); - goto redo; - } - - - if (mode == M_PARTIAL) { - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else if (mode == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL_NOLIST) { + } else if (new.freelist) { + spin_lock_irqsave(&n->list_lock, flags); + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, tail); + } else { stat(s, DEACTIVATE_FULL); } } From 21316fdc799932ff43fa00a6d6a45b16dbd77844 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:29 +0000 Subject: [PATCH 0262/1562] slub: Rename all *unfreeze_partials* functions to *put_partials* Since all partial slabs on the CPU partial list are not frozen anymore, we don't unfreeze when moving cpu partial slabs to node partial list, it's better to rename these functions. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 47655f2fe55a..fe5fcf074dfd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2546,7 +2546,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } #ifdef CONFIG_SLUB_CPU_PARTIAL -static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) +static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { struct kmem_cache_node *n = NULL, *n2 = NULL; struct slab *slab, *slab_to_discard = NULL; @@ -2588,9 +2588,9 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) } /* - * Unfreeze all the cpu partial slabs. + * Put all the cpu partial slabs to the node partial list. */ -static void unfreeze_partials(struct kmem_cache *s) +static void put_partials(struct kmem_cache *s) { struct slab *partial_slab; unsigned long flags; @@ -2601,11 +2601,11 @@ static void unfreeze_partials(struct kmem_cache *s) local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } -static void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) +static void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct slab *partial_slab; @@ -2613,7 +2613,7 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, c->partial = NULL; if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } /* @@ -2626,7 +2626,7 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { struct slab *oldslab; - struct slab *slab_to_unfreeze = NULL; + struct slab *slab_to_put = NULL; unsigned long flags; int slabs = 0; @@ -2641,7 +2641,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) * per node partial list. Postpone the actual unfreezing * outside of the critical section. */ - slab_to_unfreeze = oldslab; + slab_to_put = oldslab; oldslab = NULL; } else { slabs = oldslab->slabs; @@ -2657,17 +2657,17 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (slab_to_unfreeze) { - __unfreeze_partials(s, slab_to_unfreeze); + if (slab_to_put) { + __put_partials(s, slab_to_put); stat(s, CPU_PARTIAL_DRAIN); } } #else /* CONFIG_SLUB_CPU_PARTIAL */ -static inline void unfreeze_partials(struct kmem_cache *s) { } -static inline void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } +static inline void put_partials(struct kmem_cache *s) { } +static inline void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } #endif /* CONFIG_SLUB_CPU_PARTIAL */ @@ -2709,7 +2709,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) stat(s, CPUSLAB_FLUSH); } - unfreeze_partials_cpu(s, c); + put_partials_cpu(s, c); } struct slub_flush_work { @@ -2737,7 +2737,7 @@ static void flush_cpu_slab(struct work_struct *w) if (c->slab) flush_slab(s, c); - unfreeze_partials(s); + put_partials(s); } static bool has_cpu_slab(int cpu, struct kmem_cache *s) @@ -3168,7 +3168,7 @@ new_slab: if (unlikely(!node_match(slab, node) || !pfmemalloc_match(slab, gfpflags))) { slab->next = NULL; - __unfreeze_partials(s, slab); + __put_partials(s, slab); continue; } From 31bda717d7777b8b6cf542af2730651ad6bb4839 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:30 +0000 Subject: [PATCH 0263/1562] slub: Update frozen slabs documentations in the source The current updated scheme (which this series implemented) is: - node partial slabs: PG_Workingset && !frozen - cpu partial slabs: !PG_Workingset && !frozen - cpu slabs: !PG_Workingset && frozen - full slabs: !PG_Workingset && !frozen The most important change is that "frozen" bit is not set for the cpu partial slabs anymore, __slab_free() will grab node list_lock then check by !PG_Workingset that it's not on a node partial list. And the "frozen" bit is still kept for the cpu slabs for performance, since we don't need to grab node list_lock to check whether the PG_Workingset is set or not if the "frozen" bit is set in __slab_free(). Update related documentations and comments in the source. Signed-off-by: Chengming Zhou Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- mm/slub.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fe5fcf074dfd..4fc203a4fa03 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -76,13 +76,28 @@ * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is not - * on any list except per cpu partial list. The processor that froze the + * If a slab is frozen then it is exempt from list management. It is + * the cpu slab which is actively allocated from by the processor that + * froze it and it is not on any list. The processor that froze the * slab is the one who can perform list operations on the slab. Other * processors may put objects onto the freelist but the processor that * froze the slab is the only one that can retrieve the objects from the * slab's freelist. * + * CPU partial slabs + * + * The partially empty slabs cached on the CPU partial list are used + * for performance reasons, which speeds up the allocation process. + * These slabs are not frozen, but are also exempt from list management, + * by clearing the PG_workingset flag when moving out of the node + * partial list. Please see __slab_free() for more details. + * + * To sum up, the current scheme is: + * - node partial slab: PG_Workingset && !frozen + * - cpu partial slab: !PG_Workingset && !frozen + * - cpu slab: !PG_Workingset && frozen + * - full slab: !PG_Workingset && !frozen + * * list_lock * * The list_lock protects the partial and full list on each node and @@ -2617,8 +2632,7 @@ static void put_partials_cpu(struct kmem_cache *s, } /* - * Put a slab that was just frozen (in __slab_free|get_partial_node) into a - * partial slab slot if available. + * Put a slab into a partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. From 0445ee000498ec1a5b1ed31bf35816cbeaef5e1e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 20 Nov 2023 17:11:10 +0100 Subject: [PATCH 0264/1562] mm/slab, docs: switch mm-api docs generation from slab.c to slub.c The SLAB implementation is going to be removed, and mm-api.rst currently uses mm/slab.c to obtain kerneldocs for some API functions. Switch it to mm/slub.c and move the relevant kerneldocs of exported functions from one to the other. The rest of kerneldocs in slab.c is for static SLAB implementation-specific functions that don't have counterparts in slub.c and thus can be simply removed with the implementation. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- Documentation/core-api/mm-api.rst | 2 +- mm/slab.c | 21 --------------------- mm/slub.c | 21 +++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 2d091c873d1e..af8151db88b2 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -37,7 +37,7 @@ The Slab Cache .. kernel-doc:: include/linux/slab.h :internal: -.. kernel-doc:: mm/slab.c +.. kernel-doc:: mm/slub.c :export: .. kernel-doc:: mm/slab_common.c diff --git a/mm/slab.c b/mm/slab.c index 9ad3d0f2d1a5..37efe3241f9c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3491,19 +3491,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. - * - * Return: pointer to the new object or %NULL in case of error - */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); @@ -3564,14 +3551,6 @@ void __kmem_cache_free(struct kmem_cache *cachep, void *objp, __do_kmem_cache_free(cachep, objp, caller); } -/** - * kmem_cache_free - Deallocate an object - * @cachep: The cache the allocation was from. - * @objp: The previously allocated object. - * - * Free an object which was previously allocated from this - * cache. - */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { cachep = cache_from_obj(cachep, objp); diff --git a/mm/slub.c b/mm/slub.c index 63d281dfacdb..3e01731783df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3518,6 +3518,19 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, caller, orig_size); } +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @s: The cache to allocate from. + * @gfpflags: See kmalloc(). + * @node: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error + */ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3822,6 +3835,14 @@ void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); } +/** + * kmem_cache_free - Deallocate an object + * @s: The cache the allocation was from. + * @x: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); From 2a19be61a65157b9c6c25e831392cdefbd0a8940 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 15:43:03 +0200 Subject: [PATCH 0265/1562] mm/slab: remove CONFIG_SLAB from all Kconfig and Makefile Remove CONFIG_SLAB, CONFIG_DEBUG_SLAB, CONFIG_SLAB_DEPRECATED and everything in Kconfig files and mm/Makefile that depends on those. Since SLUB is the only remaining allocator, remove the allocator choice, make CONFIG_SLUB a "def_bool y" for now and remove all explicit dependencies on SLUB or SLAB as it's now always enabled. Make every option's verbose name and description refer to "the slab allocator" without refering to the specific implementation. Do not rename the CONFIG_ option names yet. Everything under #ifdef CONFIG_SLAB, and mm/slab.c is now dead code, all code under #ifdef CONFIG_SLUB is now always compiled. Reviewed-by: Kees Cook Reviewed-by: Christoph Lameter Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- arch/arm64/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 2 +- lib/Kconfig.debug | 1 - lib/Kconfig.kasan | 11 ++------ lib/Kconfig.kfence | 2 +- lib/Kconfig.kmsan | 2 +- mm/Kconfig | 68 ++++++++++------------------------------------ mm/Kconfig.debug | 16 +++-------- mm/Makefile | 6 +--- 10 files changed, 28 insertions(+), 84 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b071a00425d..325b7140b576 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -154,7 +154,7 @@ config ARM64 select HAVE_MOVE_PUD select HAVE_PCI select HAVE_ACPI_APEI if (ACPI && EFI) - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_COMPILER_H diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3bec98d20283..afa42a6f2e09 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -146,7 +146,7 @@ config S390 select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..3f460f334d4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -169,7 +169,7 @@ config X86 select HAS_IOPORT select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_HUGE_VMALLOC if X86_64 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cc7d53d9dc01..e1765face106 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1985,7 +1985,6 @@ config FAULT_INJECTION config FAILSLAB bool "Fault-injection capability for kmalloc" depends on FAULT_INJECTION - depends on SLAB || SLUB help Provide fault-injection capability for kmalloc. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index fdca89c05745..97e1fdbb5910 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -37,7 +37,7 @@ menuconfig KASAN (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ HAVE_ARCH_KASAN_HW_TAGS - depends on (SLUB && SYSFS && !SLUB_TINY) || (SLAB && !DEBUG_SLAB) + depends on SYSFS && !SLUB_TINY select STACKDEPOT_ALWAYS_INIT help Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety @@ -78,7 +78,7 @@ config KASAN_GENERIC bool "Generic KASAN" depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS - select SLUB_DEBUG if SLUB + select SLUB_DEBUG select CONSTRUCTORS help Enables Generic KASAN. @@ -89,13 +89,11 @@ config KASAN_GENERIC overhead of ~50% for dynamic allocations. The performance slowdown is ~x3. - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) - config KASAN_SW_TAGS bool "Software Tag-Based KASAN" depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS - select SLUB_DEBUG if SLUB + select SLUB_DEBUG select CONSTRUCTORS help Enables Software Tag-Based KASAN. @@ -110,12 +108,9 @@ config KASAN_SW_TAGS May potentially introduce problems related to pointer casting and comparison, as it embeds a tag into the top byte of each pointer. - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) - config KASAN_HW_TAGS bool "Hardware Tag-Based KASAN" depends on HAVE_ARCH_KASAN_HW_TAGS - depends on SLUB help Enables Hardware Tag-Based KASAN. diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 459dda9ef619..6fbbebec683a 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -5,7 +5,7 @@ config HAVE_ARCH_KFENCE menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" - depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) + depends on HAVE_ARCH_KFENCE select STACKTRACE select IRQ_WORK help diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan index ef2c8f256c57..0541d7b079cc 100644 --- a/lib/Kconfig.kmsan +++ b/lib/Kconfig.kmsan @@ -11,7 +11,7 @@ config HAVE_KMSAN_COMPILER config KMSAN bool "KMSAN: detector of uninitialized values use" depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER - depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN + depends on DEBUG_KERNEL && !KASAN && !KCSAN depends on !PREEMPT_RT select STACKDEPOT select STACKDEPOT_ALWAYS_INIT diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b60..4636870499bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -226,52 +226,17 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. -menu "SLAB allocator options" - -choice - prompt "Choose SLAB allocator" - default SLUB - help - This option allows to select a slab allocator. - -config SLAB_DEPRECATED - bool "SLAB (DEPRECATED)" - depends on !PREEMPT_RT - help - Deprecated and scheduled for removal in a few cycles. Replaced by - SLUB. - - If you cannot migrate to SLUB, please contact linux-mm@kvack.org - and the people listed in the SLAB ALLOCATOR section of MAINTAINERS - file, explaining why. - - The regular slab allocator that is established and known to work - well in all environments. It organizes cache hot objects in - per cpu and per node queues. +menu "Slab allocator options" config SLUB - bool "SLUB (Unqueued Allocator)" - help - SLUB is a slab allocator that minimizes cache line usage - instead of managing queues of cached objects (SLAB approach). - Per cpu caching is realized using slabs of objects instead - of queues of objects. SLUB can use memory efficiently - and has enhanced diagnostics. SLUB is the default choice for - a slab allocator. - -endchoice - -config SLAB - bool - default y - depends on SLAB_DEPRECATED + def_bool y config SLUB_TINY - bool "Configure SLUB for minimal memory footprint" - depends on SLUB && EXPERT + bool "Configure for minimal memory footprint" + depends on EXPERT select SLAB_MERGE_DEFAULT help - Configures the SLUB allocator in a way to achieve minimal memory + Configures the slab allocator in a way to achieve minimal memory footprint, sacrificing scalability, debugging and other features. This is intended only for the smallest system that had used the SLOB allocator and is not recommended for systems with more than @@ -282,7 +247,6 @@ config SLUB_TINY config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" default y - depends on SLAB || SLUB help For reduced kernel memory fragmentation, slab caches can be merged when they share the same size and other characteristics. @@ -296,7 +260,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" - depends on SLAB || (SLUB && !SLUB_TINY) + depends on !SLUB_TINY help Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab @@ -304,21 +268,19 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" - depends on SLAB || (SLUB && !SLUB_TINY) + depends on !SLUB_TINY help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance sacrifices to harden the kernel slab allocator against common - freelist exploit methods. Some slab implementations have more - sanity-checking than others. This option is most effective with - CONFIG_SLUB. + freelist exploit methods. config SLUB_STATS default n - bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS && !SLUB_TINY + bool "Enable performance statistics" + depends on SYSFS && !SLUB_TINY help - SLUB statistics are useful to debug SLUBs allocation behavior in + The statistics are useful to debug slab allocation behavior in order find ways to optimize the allocator. This should never be enabled for production use since keeping statistics slows down the allocator by a few percentage points. The slabinfo command @@ -328,8 +290,8 @@ config SLUB_STATS config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP && !SLUB_TINY - bool "SLUB per cpu partial cache" + depends on SMP && !SLUB_TINY + bool "Enable per cpu partial caches" help Per cpu partial caches accelerate objects allocation and freeing that is local to a processor at the price of more indeterminism @@ -339,7 +301,7 @@ config SLUB_CPU_PARTIAL config RANDOM_KMALLOC_CACHES default n - depends on SLUB && !SLUB_TINY + depends on !SLUB_TINY bool "Randomize slab caches for normal kmalloc" help A hardening feature that creates multiple copies of slab caches for @@ -354,7 +316,7 @@ config RANDOM_KMALLOC_CACHES limited degree of memory and CPU overhead that relates to hardware and system workload. -endmenu # SLAB allocator options +endmenu # Slab allocator options config SHUFFLE_PAGE_ALLOCATOR bool "Page allocator randomization" diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 018a5bd2f576..321ab379994f 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -45,18 +45,10 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. -config DEBUG_SLAB - bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB - help - Say Y here to have the kernel do limited verification on memory - allocation as well as poisoning memory on free to catch use of freed - memory. This can make kmalloc/kfree-intensive workloads much slower. - config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS && !SLUB_TINY + depends on SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can @@ -66,7 +58,7 @@ config SLUB_DEBUG config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG + depends on SLUB_DEBUG select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT default n help @@ -231,8 +223,8 @@ config DEBUG_KMEMLEAK allocations. See Documentation/dev-tools/kmemleak.rst for more details. - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances - of finding leaks due to the slab objects poisoning. + Enabling SLUB_DEBUG may increase the chances of finding leaks + due to the slab objects poisoning. In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). diff --git a/mm/Makefile b/mm/Makefile index 33873c8aedb3..e4b5b75aaec9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -4,7 +4,6 @@ # KASAN_SANITIZE_slab_common.o := n -KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n KCSAN_SANITIZE_kmemleak.o := n @@ -12,7 +11,6 @@ KCSAN_SANITIZE_kmemleak.o := n # the same word but accesses to different bits of that word. Re-enable KCSAN # for these when we have more consensus on what to do about them. KCSAN_SANITIZE_slab_common.o := n -KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n # But enable explicit instrumentation for memory barriers. @@ -22,7 +20,6 @@ KCSAN_INSTRUMENT_BARRIERS := y # flaky coverage that is not a function of syscall inputs. E.g. slab is out of # free pages, or a task is migrated between nodes. KCOV_INSTRUMENT_slab_common.o := n -KCOV_INSTRUMENT_slab.o := n KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n @@ -66,6 +63,7 @@ obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o obj-y += $(memory-hotplug-y) +obj-y += slub.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -82,8 +80,6 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o -obj-$(CONFIG_SLAB) += slab.o -obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_KMSAN) += kmsan/ From 72786c0a3dc5d4151469f512909049a0b17ada3d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:17:16 +0200 Subject: [PATCH 0266/1562] KASAN: remove code paths guarded by CONFIG_SLAB With SLAB removed and SLUB the only remaining allocator, we can clean up some code that was depending on the choice. Reviewed-by: Kees Cook Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kasan/common.c | 13 ++----------- mm/kasan/kasan.h | 3 +-- mm/kasan/quarantine.c | 7 ------- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 256930da578a..5d95219e69d7 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -153,10 +153,6 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be * accessed after being freed. We preassign tags for objects in these * caches as well. - * 3. For SLAB allocator we can't preassign tags randomly since the freelist - * is stored as an array of indexes instead of a linked list. Assign tags - * based on objects indexes, so that objects that are next to each other - * get different tags. */ static inline u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) @@ -171,17 +167,12 @@ static inline u8 assign_tag(struct kmem_cache *cache, if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) return init ? KASAN_TAG_KERNEL : kasan_random_tag(); - /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ -#ifdef CONFIG_SLAB - /* For SLAB assign tags based on the object index in the freelist. */ - return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object); -#else /* - * For SLUB assign a random tag during slab creation, otherwise reuse + * For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU, + * assign a random tag during slab creation, otherwise reuse * the already assigned tag. */ return init ? kasan_random_tag() : get_tag(object); -#endif } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8b06bab5c406..eef50233640a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -373,8 +373,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); -#if defined(CONFIG_KASAN_GENERIC) && \ - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) +#ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); void kasan_quarantine_reduce(void); void kasan_quarantine_remove_cache(struct kmem_cache *cache); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index ca4529156735..138c57b836f2 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -144,10 +144,6 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); - unsigned long flags; - - if (IS_ENABLED(CONFIG_SLAB)) - local_irq_save(flags); /* * If init_on_free is enabled and KASAN's free metadata is stored in @@ -166,9 +162,6 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; ___cache_free(cache, object, _THIS_IP_); - - if (IS_ENABLED(CONFIG_SLAB)) - local_irq_restore(flags); } static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) From a745b067db0f0711974063deb78c6639c24ec5bf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:28:44 +0200 Subject: [PATCH 0267/1562] KFENCE: cleanup kfence_guarded_alloc() after CONFIG_SLAB removal Some struct slab fields are initialized differently for SLAB and SLUB so we can simplify with SLUB being the only remaining allocator. Reviewed-by: Kees Cook Reviewed-by: Marco Elver Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kfence/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 3872528d0963..8350f5c06f2e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -463,11 +463,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required slab fields. */ slab = virt_to_slab((void *)meta->addr); slab->slab_cache = cache; -#if defined(CONFIG_SLUB) slab->objects = 1; -#elif defined(CONFIG_SLAB) - slab->s_mem = addr; -#endif /* Memory initialization. */ set_canary(meta); From bc3dcb850f1818528fcafb10dd38a4590d9119e3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:33:43 +0200 Subject: [PATCH 0268/1562] mm/memcontrol: remove CONFIG_SLAB #ifdef guards With SLAB removed, these are never true anymore so we can clean up. Reviewed-by: Kees Cook Acked-by: Michal Hocko Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..947fb50eba31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5149,7 +5149,7 @@ out_kfree: return ret; } -#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* @@ -5258,8 +5258,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_MEMCG_KMEM) && \ - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_show = mem_cgroup_slab_show, From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:36:55 +0200 Subject: [PATCH 0269/1562] cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed. SLUB defines them as NULL, so we can remove those altogether. Acked-by: Thomas Gleixner Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/cpuhotplug.h | 1 - include/linux/slab.h | 8 -------- kernel/cpu.c | 5 ----- 3 files changed, 14 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d305db70674b..07cb8f7030b6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, - CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..34e43cddc520 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); -#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -int slab_prepare_cpu(unsigned int cpu); -int slab_dead_cpu(unsigned int cpu); -#else -#define slab_prepare_cpu NULL -#define slab_dead_cpu NULL -#endif - #endif /* _LINUX_SLAB_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 9e4c6780adde..530b026d95a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, - [CPUHP_SLAB_PREPARE] = { - .name = "slab:prepare", - .startup.single = slab_prepare_cpu, - .teardown.single = slab_dead_cpu, - }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, From a9e0b9f27266d46ed6e73aac8d0844602cd0cb93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 17:43:38 +0200 Subject: [PATCH 0270/1562] mm/slab: remove CONFIG_SLAB code from slab common code In slab_common.c and slab.h headers, we can now remove all code behind CONFIG_SLAB and CONFIG_DEBUG_SLAB ifdefs, and remove all CONFIG_SLUB ifdefs. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 14 ++------- mm/slab.h | 69 ++++---------------------------------------- mm/slab_common.c | 22 ++------------ 3 files changed, 9 insertions(+), 96 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 34e43cddc520..b2015d0e01ad 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -24,7 +24,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. + * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) @@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void) * Kmalloc array related definitions */ -#ifdef CONFIG_SLAB /* - * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW -#define KMALLOC_SHIFT_LOW 5 -#endif -#endif - -#ifdef CONFIG_SLUB -#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) -#ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -#endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) diff --git a/mm/slab.h b/mm/slab.h index 3d07fb428393..014c36ea51fa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -42,21 +42,6 @@ typedef union { struct slab { unsigned long __page_flags; -#if defined(CONFIG_SLAB) - - struct kmem_cache *slab_cache; - union { - struct { - struct list_head slab_list; - void *freelist; /* array of free object indexes */ - void *s_mem; /* first object */ - }; - struct rcu_head rcu_head; - }; - unsigned int active; - -#elif defined(CONFIG_SLUB) - struct kmem_cache *slab_cache; union { struct { @@ -91,10 +76,6 @@ struct slab { }; unsigned int __unused; -#else -#error "Unexpected slab allocator configured" -#endif - atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; @@ -111,7 +92,7 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); -#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) +#if defined(system_has_freelist_aba) static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif @@ -228,13 +209,7 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLAB -#include -#endif - -#ifdef CONFIG_SLUB #include -#endif #include #include @@ -320,26 +295,16 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) -#if defined(CONFIG_DEBUG_SLAB) -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) -#elif defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif -#if defined(CONFIG_SLAB) -#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_ACCOUNT | SLAB_NO_MERGE) -#elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT | \ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#else -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) -#endif /* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) @@ -672,18 +637,14 @@ size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) { -#ifndef CONFIG_SLUB - return s->object_size; - -#else /* CONFIG_SLUB */ -# ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; -# endif +#endif if (s->flags & SLAB_KASAN) return s->object_size; /* @@ -697,7 +658,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * Else we can use all the padding etc for the allocation */ return s->size; -#endif } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, @@ -775,23 +735,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { -#ifdef CONFIG_SLAB - raw_spinlock_t list_lock; - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long total_slabs; /* length of all slab lists */ - unsigned long free_slabs; /* length of free slab list only */ - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - struct array_cache *shared; /* shared per node */ - struct alien_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -#endif - -#ifdef CONFIG_SLUB spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; @@ -800,8 +743,6 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif -#endif - }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -818,7 +759,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else static inline void dump_unreclaimable_slab(void) diff --git a/mm/slab_common.c b/mm/slab_common.c index 8d431193c273..63b8411db7ce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -71,10 +71,8 @@ static int __init setup_slab_merge(char *str) return 1; } -#ifdef CONFIG_SLUB __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); __setup_param("slub_merge", slub_merge, setup_slab_merge, 0); -#endif __setup("slab_nomerge", setup_slab_nomerge); __setup("slab_merge", setup_slab_merge); @@ -197,10 +195,6 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, if (s->size - size >= sizeof(void *)) continue; - if (IS_ENABLED(CONFIG_SLAB) && align && - (align > s->align || s->align % align)) - continue; - return s; } return NULL; @@ -1222,12 +1216,8 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) -#ifdef CONFIG_SLAB -#define SLABINFO_RIGHTS (0600) -#else +#ifdef CONFIG_SLUB_DEBUG #define SLABINFO_RIGHTS (0400) -#endif static void print_slabinfo_header(struct seq_file *m) { @@ -1235,18 +1225,10 @@ static void print_slabinfo_header(struct seq_file *m) * Output format version, so at least we can change it * without _too_ many complaints. */ -#ifdef CONFIG_DEBUG_SLAB - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else seq_puts(m, "slabinfo - version: 2.1\n"); -#endif seq_puts(m, "# name "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); -#ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat "); - seq_puts(m, " : cpustat "); -#endif seq_putc(m, '\n'); } @@ -1370,7 +1352,7 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); -#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ +#endif /* CONFIG_SLUB_DEBUG */ static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) From 8c20b29db5087684c2d46ca5a33b504b0743c85e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 18:54:46 +0200 Subject: [PATCH 0271/1562] mm/mempool/dmapool: remove CONFIG_DEBUG_SLAB ifdefs CONFIG_DEBUG_SLAB is going away with CONFIG_SLAB, so remove dead ifdefs in mempool and dmapool code. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/dmapool.c | 2 +- mm/mempool.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a151a21e571b..f0bfc6c490f4 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -36,7 +36,7 @@ #include #include -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#ifdef CONFIG_SLUB_DEBUG_ON #define DMAPOOL_DEBUG 1 #endif diff --git a/mm/mempool.c b/mm/mempool.c index 734bcf5afbb7..4759be0ff9de 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -20,7 +20,7 @@ #include #include "slab.h" -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) { @@ -95,14 +95,14 @@ static void poison_element(mempool_t *pool, void *element) kunmap_atomic(addr); } } -#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +#else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(mempool_t *pool, void *element) { } static inline void poison_element(mempool_t *pool, void *element) { } -#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +#endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { From 2719675fa8111a8d7a060133e1dd4797d20c9754 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 20 Nov 2023 10:59:42 -0800 Subject: [PATCH 0272/1562] cpufreq: intel_pstate: Prioritize firmware-provided balance performance EPP The platform firmware can provide a balance performance EPP value by enabling HWP and programming the EPP to the desired value. However, currently this only takes effect for processors listed in intel_epp_balance_perf[], so in order to enable a new processor model to utilize this mechanism, that table needs to be updated. It arguably should not be necessary to modify the kernel to work properly with every new generation of processors, though, and distributions that don't always ship the most recent kernels should be able to run reasonably well on new hardware without code changes. For this reason, move the check to avoid updating the EPP when the balance performance EPP is unmodified from the power-up default of 0x80 after the check that allows the firmware-provided balance performance EPP value to be retrieved. This will cause the code to always look for the firmware- provided value before consulting intel_epp_balance_perf[] and the handling of new hardware will not depend on whether or not that thable has been updated yet. Signed-off-by: Srinivas Pandruvada [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index a534a1f7f1ee..dd6d23e389f1 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1691,13 +1691,6 @@ static void intel_pstate_update_epp_defaults(struct cpudata *cpudata) { cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); - /* - * If this CPU gen doesn't call for change in balance_perf - * EPP return. - */ - if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) - return; - /* * If the EPP is set by firmware, which means that firmware enabled HWP * - Is equal or less than 0x80 (default balance_perf EPP) @@ -1710,6 +1703,13 @@ static void intel_pstate_update_epp_defaults(struct cpudata *cpudata) return; } + /* + * If this CPU gen doesn't call for change in balance_perf + * EPP return. + */ + if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) + return; + /* * Use hard coded value per gen to update the balance_perf * and default EPP. From c4a5118a3ae1eadc687d84eef9431f9e13eb015c Mon Sep 17 00:00:00 2001 From: Alexandra Diupina Date: Tue, 5 Dec 2023 18:12:20 +0300 Subject: [PATCH 0273/1562] cpufreq: scmi: process the result of devm_of_clk_add_hw_provider() devm_of_clk_add_hw_provider() may return an errno, so add a return value check Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 8410e7f3b31e ("cpufreq: scmi: Fix OPP addition failure with a dummy clock provider") Signed-off-by: Alexandra Diupina Signed-off-by: Viresh Kumar --- drivers/cpufreq/scmi-cpufreq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index c8a7ccc42c16..4ee23f4ebf4a 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -334,8 +334,11 @@ static int scmi_cpufreq_probe(struct scmi_device *sdev) #ifdef CONFIG_COMMON_CLK /* dummy clock provider as needed by OPP if clocks property is used */ - if (of_property_present(dev->of_node, "#clock-cells")) - devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, NULL); + if (of_property_present(dev->of_node, "#clock-cells")) { + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, NULL); + if (ret) + return dev_err_probe(dev, ret, "%s: registering clock provider failed\n", __func__); + } #endif ret = cpufreq_register_driver(&scmi_cpufreq_driver); From 9641423174d05da32543e96ced66bb30cebcce16 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Sat, 25 Nov 2023 14:35:25 +0200 Subject: [PATCH 0274/1562] mtd: spi-nor: add erase die (chip) capability JESD216 mentions die erase, but does not provide an opcode for it. Check BFPT dword 11, bits 30:24, "Chip Erase, Typical time", it says: "Typical time to erase one chip (die). User must poll device busy to determine if the operation has completed. For a device consisting of multiple dies, that are individually accessed, the time is for each die to which a chip erase command is applied." So when a flash consists of a single die, this is the erase time for the full chip (die) erase, and when it consists of multiple dies, it's the die erase time. Chip and die are the same thing. Add support for die erase. For now, benefit of the die erase when addr and len are aligned with die size. This could be improved however for the uniform and non-uniform erases cases to use the die erase when possible. For example if one requests that an erase of a 2 die device starting from the last 64KB of the first die to the end of the flash size, we could use just 2 commands, a 64KB erase and a die erase. This improvement is left as an exercise for the reader. Tested-by: Fabio Estevam Link: https://lore.kernel.org/r/20231125123529.55686-2-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.c | 108 +++++++++++++++++++++++----------- drivers/mtd/spi-nor/core.h | 8 ++- drivers/mtd/spi-nor/debugfs.c | 2 +- 3 files changed, 81 insertions(+), 37 deletions(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 25a64c65717d..479494cf00c9 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -1060,24 +1060,32 @@ static int spi_nor_read_sr2(struct spi_nor *nor, u8 *sr2) } /** - * spi_nor_erase_chip() - Erase the entire flash memory. + * spi_nor_erase_die() - Erase the entire die. * @nor: pointer to 'struct spi_nor'. + * @addr: address of the die. + * @die_size: size of the die. * * Return: 0 on success, -errno otherwise. */ -static int spi_nor_erase_chip(struct spi_nor *nor) +static int spi_nor_erase_die(struct spi_nor *nor, loff_t addr, size_t die_size) { + bool multi_die = nor->mtd.size != die_size; int ret; - dev_dbg(nor->dev, " %lldKiB\n", (long long)(nor->mtd.size >> 10)); + dev_dbg(nor->dev, " %lldKiB\n", (long long)(die_size >> 10)); if (nor->spimem) { - struct spi_mem_op op = SPI_NOR_CHIP_ERASE_OP; + struct spi_mem_op op = + SPI_NOR_DIE_ERASE_OP(nor->params->die_erase_opcode, + nor->addr_nbytes, addr, multi_die); spi_nor_spimem_setup_op(nor, &op, nor->reg_proto); ret = spi_mem_exec_op(nor->spimem, &op); } else { + if (multi_die) + return -EOPNOTSUPP; + ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_CHIP_ERASE, NULL, 0); @@ -1792,6 +1800,51 @@ destroy_erase_cmd_list: return ret; } +static int spi_nor_erase_dice(struct spi_nor *nor, loff_t addr, + size_t len, size_t die_size) +{ + unsigned long timeout; + int ret; + + /* + * Scale the timeout linearly with the size of the flash, with + * a minimum calibrated to an old 2MB flash. We could try to + * pull these from CFI/SFDP, but these values should be good + * enough for now. + */ + timeout = max(CHIP_ERASE_2MB_READY_WAIT_JIFFIES, + CHIP_ERASE_2MB_READY_WAIT_JIFFIES * + (unsigned long)(nor->mtd.size / SZ_2M)); + + do { + ret = spi_nor_lock_device(nor); + if (ret) + return ret; + + ret = spi_nor_write_enable(nor); + if (ret) { + spi_nor_unlock_device(nor); + return ret; + } + + ret = spi_nor_erase_die(nor, addr, die_size); + + spi_nor_unlock_device(nor); + if (ret) + return ret; + + ret = spi_nor_wait_till_ready_with_timeout(nor, timeout); + if (ret) + return ret; + + addr += die_size; + len -= die_size; + + } while (len); + + return 0; +} + /* * Erase an address range on the nor chip. The address range may extend * one or more erase sectors. Return an error if there is a problem erasing. @@ -1799,7 +1852,10 @@ destroy_erase_cmd_list: static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) { struct spi_nor *nor = mtd_to_spi_nor(mtd); + u8 n_dice = nor->params->n_dice; + bool multi_die_erase = false; u32 addr, len, rem; + size_t die_size; int ret; dev_dbg(nor->dev, "at 0x%llx, len %lld\n", (long long)instr->addr, @@ -1814,39 +1870,22 @@ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr) addr = instr->addr; len = instr->len; + if (n_dice) { + die_size = div_u64(mtd->size, n_dice); + if (!(len & (die_size - 1)) && !(addr & (die_size - 1))) + multi_die_erase = true; + } else { + die_size = mtd->size; + } + ret = spi_nor_prep_and_lock_pe(nor, instr->addr, instr->len); if (ret) return ret; - /* whole-chip erase? */ - if (len == mtd->size && !(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) { - unsigned long timeout; - - ret = spi_nor_lock_device(nor); - if (ret) - goto erase_err; - - ret = spi_nor_write_enable(nor); - if (ret) { - spi_nor_unlock_device(nor); - goto erase_err; - } - - ret = spi_nor_erase_chip(nor); - spi_nor_unlock_device(nor); - if (ret) - goto erase_err; - - /* - * Scale the timeout linearly with the size of the flash, with - * a minimum calibrated to an old 2MB flash. We could try to - * pull these from CFI/SFDP, but these values should be good - * enough for now. - */ - timeout = max(CHIP_ERASE_2MB_READY_WAIT_JIFFIES, - CHIP_ERASE_2MB_READY_WAIT_JIFFIES * - (unsigned long)(mtd->size / SZ_2M)); - ret = spi_nor_wait_till_ready_with_timeout(nor, timeout); + /* chip (die) erase? */ + if ((len == mtd->size && !(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) || + multi_die_erase) { + ret = spi_nor_erase_dice(nor, addr, len, die_size); if (ret) goto erase_err; @@ -2902,6 +2941,9 @@ static int spi_nor_late_init_params(struct spi_nor *nor) return ret; } + if (!nor->params->die_erase_opcode) + nor->params->die_erase_opcode = SPINOR_OP_CHIP_ERASE; + /* Default method kept for backward compatibility. */ if (!params->set_4byte_addr_mode) params->set_4byte_addr_mode = spi_nor_set_4byte_addr_mode_brwr; diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h index a456042379ee..b43ea2d49e74 100644 --- a/drivers/mtd/spi-nor/core.h +++ b/drivers/mtd/spi-nor/core.h @@ -85,9 +85,9 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPI_NOR_CHIP_ERASE_OP \ - SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_CHIP_ERASE, 0), \ - SPI_MEM_OP_NO_ADDR, \ +#define SPI_NOR_DIE_ERASE_OP(opcode, addr_nbytes, addr, dice) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 0), \ + SPI_MEM_OP_ADDR(dice ? addr_nbytes : 0, addr, 0), \ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) @@ -362,6 +362,7 @@ struct spi_nor_otp { * command in octal DTR mode. * @n_banks: number of banks. * @n_dice: number of dice in the flash memory. + * @die_erase_opcode: die erase opcode. Defaults to SPINOR_OP_CHIP_ERASE. * @vreg_offset: volatile register offset for each die. * @hwcaps: describes the read and page program hardware * capabilities. @@ -399,6 +400,7 @@ struct spi_nor_flash_parameter { u8 rdsr_addr_nbytes; u8 n_banks; u8 n_dice; + u8 die_erase_opcode; u32 *vreg_offset; struct spi_nor_hwcaps hwcaps; diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c index 6e163cb5b478..2dbda6b6938a 100644 --- a/drivers/mtd/spi-nor/debugfs.c +++ b/drivers/mtd/spi-nor/debugfs.c @@ -138,7 +138,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data) if (!(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) { string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); - seq_printf(s, " %02x (%s)\n", SPINOR_OP_CHIP_ERASE, buf); + seq_printf(s, " %02x (%s)\n", nor->params->die_erase_opcode, buf); } seq_puts(s, "\nsector map\n"); From 461d0babb54462188c98818b472e6a3d5a91fd60 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Sat, 25 Nov 2023 14:35:26 +0200 Subject: [PATCH 0275/1562] mtd: spi-nor: spansion: enable die erase for multi die flashes Enable die erase for spansion multi die flashes. Tested-by: Takahiro Kuwano Link: https://lore.kernel.org/r/20231125123529.55686-3-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/spansion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/spi-nor/spansion.c b/drivers/mtd/spi-nor/spansion.c index 12921344373d..6cc237c24e07 100644 --- a/drivers/mtd/spi-nor/spansion.c +++ b/drivers/mtd/spi-nor/spansion.c @@ -17,6 +17,7 @@ #define SPINOR_OP_CLSR 0x30 /* Clear status register 1 */ #define SPINOR_OP_CLPEF 0x82 /* Clear program/erase failure flags */ +#define SPINOR_OP_CYPRESS_DIE_ERASE 0x61 /* Chip (die) erase */ #define SPINOR_OP_RD_ANY_REG 0x65 /* Read any register */ #define SPINOR_OP_WR_ANY_REG 0x71 /* Write any register */ #define SPINOR_REG_CYPRESS_VREG 0x00800000 @@ -644,6 +645,7 @@ static int s25hx_t_late_init(struct spi_nor *nor) params->ready = cypress_nor_sr_ready_and_clear; cypress_nor_ecc_init(nor); + params->die_erase_opcode = SPINOR_OP_CYPRESS_DIE_ERASE; return 0; } @@ -933,7 +935,6 @@ static const struct flash_info spansion_nor_parts[] = { .id = SNOR_ID(0x34, 0x2a, 0x1c, 0x0f, 0x00, 0x90), .name = "s25hl02gt", .mfr_flags = USE_CLPEF, - .flags = NO_CHIP_ERASE, .fixups = &s25hx_t_fixups }, { .id = SNOR_ID(0x34, 0x2b, 0x19, 0x0f, 0x08, 0x90), @@ -954,7 +955,6 @@ static const struct flash_info spansion_nor_parts[] = { .id = SNOR_ID(0x34, 0x2b, 0x1c, 0x0f, 0x00, 0x90), .name = "s25hs02gt", .mfr_flags = USE_CLPEF, - .flags = NO_CHIP_ERASE, .fixups = &s25hx_t_fixups }, { .id = SNOR_ID(0x34, 0x5a, 0x1a), From 53919a968b43648822f2d35b6cafacd3950238cc Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Sat, 25 Nov 2023 14:35:27 +0200 Subject: [PATCH 0276/1562] mtd: spi-nor: micron-st: enable die erase for multi die flashes Enable die erase for multi die flashes, it will speed the erase time. Unfortunately, Micron does not provide a 4-byte opcode equivalent for the die erase. The SFDP 4BAIT table fails to consider the die erase too, the standard can be improved. Thus we're forced to enter in the 4 byte address mode in order to benefit of the die erase. Tested on n25q00. This flash defines the 4BAIT SFDP table, thus it will use the 4BAIT opcodes for reads, page programs or erases, with the exception that it will use the die erase command in the 4 byte address mode. Link: https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/nor-flash/serial-nor/n25q/n25q_1gb_3v_65nm.pdf?rev=b6eba74759984f749f8c039bc5bc47b7 Link: https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/nor-flash/serial-nor/mt25q/die-rev-b/mt25q_qlkt_l_02g_cbb_0.pdf?rev=43f7f66fc8da4d7d901b35fa51284c8f Link: https://lore.kernel.org/r/20231125123529.55686-4-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.c | 32 ++++++++++++++++--------------- drivers/mtd/spi-nor/micron-st.c | 34 +++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 479494cf00c9..52e5b569ddfd 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -2935,6 +2935,9 @@ static int spi_nor_late_init_params(struct spi_nor *nor) return ret; } + /* Needed by some flashes late_init hooks. */ + spi_nor_init_flags(nor); + if (nor->info->fixups && nor->info->fixups->late_init) { ret = nor->info->fixups->late_init(nor); if (ret) @@ -2948,7 +2951,6 @@ static int spi_nor_late_init_params(struct spi_nor *nor) if (!params->set_4byte_addr_mode) params->set_4byte_addr_mode = spi_nor_set_4byte_addr_mode_brwr; - spi_nor_init_flags(nor); spi_nor_init_fixup_flags(nor); /* @@ -3186,6 +3188,18 @@ int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable) struct spi_nor_flash_parameter *params = nor->params; int ret; + if (enable) { + /* + * If the RESET# pin isn't hooked up properly, or the system + * otherwise doesn't perform a reset command in the boot + * sequence, it's impossible to 100% protect against unexpected + * reboots (e.g., crashes). Warn the user (or hopefully, system + * designer) that this is bad. + */ + WARN_ONCE(nor->flags & SNOR_F_BROKEN_RESET, + "enabling reset hack; may not recover from unexpected reboots\n"); + } + ret = params->set_4byte_addr_mode(nor, enable); if (ret && ret != -ENOTSUPP) return ret; @@ -3234,20 +3248,8 @@ static int spi_nor_init(struct spi_nor *nor) if (nor->addr_nbytes == 4 && nor->read_proto != SNOR_PROTO_8_8_8_DTR && - !(nor->flags & SNOR_F_4B_OPCODES)) { - /* - * If the RESET# pin isn't hooked up properly, or the system - * otherwise doesn't perform a reset command in the boot - * sequence, it's impossible to 100% protect against unexpected - * reboots (e.g., crashes). Warn the user (or hopefully, system - * designer) that this is bad. - */ - WARN_ONCE(nor->flags & SNOR_F_BROKEN_RESET, - "enabling reset hack; may not recover from unexpected reboots\n"); - err = spi_nor_set_4byte_addr_mode(nor, true); - if (err) - return err; - } + !(nor->flags & SNOR_F_4B_OPCODES)) + return spi_nor_set_4byte_addr_mode(nor, true); return 0; } diff --git a/drivers/mtd/spi-nor/micron-st.c b/drivers/mtd/spi-nor/micron-st.c index 8920547c12bf..b63f1e9b97d0 100644 --- a/drivers/mtd/spi-nor/micron-st.c +++ b/drivers/mtd/spi-nor/micron-st.c @@ -11,6 +11,7 @@ /* flash_info mfr_flag. Used to read proprietary FSR register. */ #define USE_FSR BIT(0) +#define SPINOR_OP_MT_DIE_ERASE 0xc4 /* Chip (die) erase opcode */ #define SPINOR_OP_RDFSR 0x70 /* Read flag status register */ #define SPINOR_OP_CLFSR 0x50 /* Clear flag status register */ #define SPINOR_OP_MT_DTR_RD 0xfd /* Fast Read opcode in DTR mode */ @@ -192,6 +193,30 @@ static struct spi_nor_fixups mt25qu512a_fixups = { .post_bfpt = mt25qu512a_post_bfpt_fixup, }; +static int st_nor_four_die_late_init(struct spi_nor *nor) +{ + struct spi_nor_flash_parameter *params = nor->params; + + params->die_erase_opcode = SPINOR_OP_MT_DIE_ERASE; + params->n_dice = 4; + + /* + * Unfortunately the die erase opcode does not have a 4-byte opcode + * correspondent for these flashes. The SFDP 4BAIT table fails to + * consider the die erase too. We're forced to enter in the 4 byte + * address mode in order to benefit of the die erase. + */ + return spi_nor_set_4byte_addr_mode(nor, true); +} + +static struct spi_nor_fixups n25q00_fixups = { + .late_init = st_nor_four_die_late_init, +}; + +static struct spi_nor_fixups mt25q02_fixups = { + .late_init = st_nor_four_die_late_init, +}; + static const struct flash_info st_nor_parts[] = { { .name = "m25p05-nonjedec", @@ -366,16 +391,17 @@ static const struct flash_info st_nor_parts[] = { .name = "n25q00", .size = SZ_128M, .flags = SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB | SPI_NOR_4BIT_BP | - SPI_NOR_BP3_SR_BIT6 | NO_CHIP_ERASE, + SPI_NOR_BP3_SR_BIT6, .no_sfdp_flags = SECT_4K | SPI_NOR_QUAD_READ, .mfr_flags = USE_FSR, + .fixups = &n25q00_fixups, }, { .id = SNOR_ID(0x20, 0xba, 0x22), .name = "mt25ql02g", .size = SZ_256M, - .flags = NO_CHIP_ERASE, .no_sfdp_flags = SECT_4K | SPI_NOR_QUAD_READ, .mfr_flags = USE_FSR, + .fixups = &mt25q02_fixups, }, { .id = SNOR_ID(0x20, 0xbb, 0x15), .name = "n25q016a", @@ -433,16 +459,16 @@ static const struct flash_info st_nor_parts[] = { .id = SNOR_ID(0x20, 0xbb, 0x21), .name = "n25q00a", .size = SZ_128M, - .flags = NO_CHIP_ERASE, .no_sfdp_flags = SECT_4K | SPI_NOR_QUAD_READ, .mfr_flags = USE_FSR, + .fixups = &n25q00_fixups, }, { .id = SNOR_ID(0x20, 0xbb, 0x22), .name = "mt25qu02g", .size = SZ_256M, - .flags = NO_CHIP_ERASE, .no_sfdp_flags = SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ, .mfr_flags = USE_FSR, + .fixups = &mt25q02_fixups, } }; From 06de1257aae787fe3af14d03b9ceb0b9f6af9e1f Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Sat, 25 Nov 2023 14:35:28 +0200 Subject: [PATCH 0277/1562] mtd: spi-nor: remove NO_CHIP_ERASE flag There's no flash using it and we'd like to rely instead on SFDP data, thus remove it. Tested-by: Fabio Estevam Link: https://lore.kernel.org/r/20231125123529.55686-5-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.c | 3 --- drivers/mtd/spi-nor/core.h | 8 +++----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 52e5b569ddfd..503fed90c2fa 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -2888,9 +2888,6 @@ static void spi_nor_init_flags(struct spi_nor *nor) nor->flags |= SNOR_F_HAS_SR_BP3_BIT6; } - if (flags & NO_CHIP_ERASE) - nor->flags |= SNOR_F_NO_OP_CHIP_ERASE; - if (flags & SPI_NOR_RWW && nor->params->n_banks > 1 && !nor->controller_ops) nor->flags |= SNOR_F_RWW; diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h index b43ea2d49e74..29ed67725b18 100644 --- a/drivers/mtd/spi-nor/core.h +++ b/drivers/mtd/spi-nor/core.h @@ -489,7 +489,6 @@ struct spi_nor_id { * Usually these will power-up in a write-protected * state. * SPI_NOR_NO_ERASE: no erase command needed. - * NO_CHIP_ERASE: chip does not support chip erase. * SPI_NOR_NO_FR: can't do fastread. * SPI_NOR_QUAD_PP: flash supports Quad Input Page Program. * SPI_NOR_RWW: flash supports reads while write. @@ -539,10 +538,9 @@ struct flash_info { #define SPI_NOR_BP3_SR_BIT6 BIT(4) #define SPI_NOR_SWP_IS_VOLATILE BIT(5) #define SPI_NOR_NO_ERASE BIT(6) -#define NO_CHIP_ERASE BIT(7) -#define SPI_NOR_NO_FR BIT(8) -#define SPI_NOR_QUAD_PP BIT(9) -#define SPI_NOR_RWW BIT(10) +#define SPI_NOR_NO_FR BIT(7) +#define SPI_NOR_QUAD_PP BIT(8) +#define SPI_NOR_RWW BIT(9) u8 no_sfdp_flags; #define SPI_NOR_SKIP_SFDP BIT(0) From c692ba6de1c5b4dc8cad0ba70281ba4cf9d2fdac Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 25 Nov 2023 14:35:29 +0200 Subject: [PATCH 0278/1562] mtd: spi-nor: micron-st: Add support for mt25qu01g Add support for the MT25QU01G 128MB Micron Serial NOR Flash Memory model. Link: https://www.micron.com/-/media/client/global/documents/products/data-sheet/nor-flash/serial-nor/mt25q/die-rev-b/mt25q_qlkt_u_01g_bbb_0.pdf Signed-off-by: Fabio Estevam [ta: introduce die erase] Link: https://lore.kernel.org/r/20231125123529.55686-6-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/micron-st.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/mtd/spi-nor/micron-st.c b/drivers/mtd/spi-nor/micron-st.c index b63f1e9b97d0..3c6499fdb712 100644 --- a/drivers/mtd/spi-nor/micron-st.c +++ b/drivers/mtd/spi-nor/micron-st.c @@ -209,10 +209,30 @@ static int st_nor_four_die_late_init(struct spi_nor *nor) return spi_nor_set_4byte_addr_mode(nor, true); } +static int st_nor_two_die_late_init(struct spi_nor *nor) +{ + struct spi_nor_flash_parameter *params = nor->params; + + params->die_erase_opcode = SPINOR_OP_MT_DIE_ERASE; + params->n_dice = 2; + + /* + * Unfortunately the die erase opcode does not have a 4-byte opcode + * correspondent for these flashes. The SFDP 4BAIT table fails to + * consider the die erase too. We're forced to enter in the 4 byte + * address mode in order to benefit of the die erase. + */ + return spi_nor_set_4byte_addr_mode(nor, true); +} + static struct spi_nor_fixups n25q00_fixups = { .late_init = st_nor_four_die_late_init, }; +static struct spi_nor_fixups mt25q01_fixups = { + .late_init = st_nor_two_die_late_init, +}; + static struct spi_nor_fixups mt25q02_fixups = { .late_init = st_nor_four_die_late_init, }; @@ -455,6 +475,11 @@ static const struct flash_info st_nor_parts[] = { SPI_NOR_BP3_SR_BIT6, .no_sfdp_flags = SECT_4K | SPI_NOR_QUAD_READ, .mfr_flags = USE_FSR, + }, { + .id = SNOR_ID(0x20, 0xbb, 0x21, 0x10, 0x44, 0x00), + .name = "mt25qu01g", + .mfr_flags = USE_FSR, + .fixups = &mt25q01_fixups, }, { .id = SNOR_ID(0x20, 0xbb, 0x21), .name = "n25q00a", From 16a1d968358aa9e897ce995fa45cb15d55a0e83d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 20:43:43 +0200 Subject: [PATCH 0279/1562] mm/slab: remove mm/slab.c and slab_def.h Remove the SLAB implementation. Update CREDITS. Also update and properly sort the SLOB entry there. RIP SLAB allocator (1996 - 2024) Reviewed-by: Kees Cook Acked-by: Christoph Lameter Acked-by: David Rientjes Tested-by: David Rientjes Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- CREDITS | 12 +- include/linux/slab_def.h | 124 -- mm/slab.c | 4005 -------------------------------------- 3 files changed, 8 insertions(+), 4133 deletions(-) delete mode 100644 include/linux/slab_def.h delete mode 100644 mm/slab.c diff --git a/CREDITS b/CREDITS index f33a33fd2371..e9a094a93287 100644 --- a/CREDITS +++ b/CREDITS @@ -9,10 +9,6 @@ Linus ---------- -N: Matt Mackal -E: mpm@selenic.com -D: SLOB slab allocator - N: Matti Aarnio E: mea@nic.funet.fi D: Alpha systems hacking, IPv6 and other network related stuff @@ -1572,6 +1568,10 @@ S: Ampferstr. 50 / 4 S: 6020 Innsbruck S: Austria +N: Mark Hemment +E: markhe@nextd.demon.co.uk +D: SLAB allocator implementation + N: Richard Henderson E: rth@twiddle.net E: rth@cygnus.com @@ -2437,6 +2437,10 @@ D: work on suspend-to-ram/disk, killing duplicates from ioctl32, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic +N: Olivia Mackall +E: olivia@selenic.com +D: SLOB slab allocator + N: Paul Mackerras E: paulus@samba.org D: PPP driver diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h deleted file mode 100644 index a61e7d55d0d3..000000000000 --- a/include/linux/slab_def.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLAB_DEF_H -#define _LINUX_SLAB_DEF_H - -#include -#include - -/* - * Definitions unique to the original Linux SLAB allocator. - */ - -struct kmem_cache { - struct array_cache __percpu *cpu_cache; - -/* 1) Cache tunables. Protected by slab_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int size; - struct reciprocal_value reciprocal_buffer_size; -/* 2) touched by every alloc & free from the backend */ - - slab_flags_t flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 3) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t allocflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int freelist_size; - - /* constructor func */ - void (*ctor)(void *obj); - -/* 4) cache creation/removal */ - const char *name; - struct list_head list; - int refcount; - int object_size; - int align; - -/* 5) statistics */ -#ifdef CONFIG_DEBUG_SLAB - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; - - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. 'size' contains the total - * object size including these internal fields, while 'obj_offset' - * and 'object_size' contain the offset to the user object and its - * size. - */ - int obj_offset; -#endif /* CONFIG_DEBUG_SLAB */ - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) -{ - void *object = x - (x - slab->s_mem) % cache->size; - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - if (is_kfence_address(slab_address(slab))) - return 1; - return cache->num; -} - -#endif /* _LINUX_SLAB_DEF_H */ diff --git a/mm/slab.c b/mm/slab.c deleted file mode 100644 index 37efe3241f9c..000000000000 --- a/mm/slab.c +++ /dev/null @@ -1,4005 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/mm/slab.c - * Written by Mark Hemment, 1996/97. - * (markhe@nextd.demon.co.uk) - * - * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli - * - * Major cleanup, different bufctl logic, per-cpu arrays - * (c) 2000 Manfred Spraul - * - * Cleanup, make the head arrays unconditional, preparation for NUMA - * (c) 2002 Manfred Spraul - * - * An implementation of the Slab Allocator as described in outline in; - * UNIX Internals: The New Frontiers by Uresh Vahalia - * Pub: Prentice Hall ISBN 0-13-101908-2 - * or with a little more detail in; - * The Slab Allocator: An Object-Caching Kernel Memory Allocator - * Jeff Bonwick (Sun Microsystems). - * Presented at: USENIX Summer 1994 Technical Conference - * - * The memory is organized in caches, one cache for each object type. - * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) - * Each cache consists out of many slabs (they are small (usually one - * page long) and always contiguous), and each slab contains multiple - * initialized objects. - * - * This means, that your constructor is used only for newly allocated - * slabs and you must pass objects with the same initializations to - * kmem_cache_free. - * - * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, - * normal). If you need a special memory type, then must create a new - * cache for that memory type. - * - * In order to reduce fragmentation, the slabs are sorted in 3 groups: - * full slabs with 0 free objects - * partial slabs - * empty slabs with no allocated objects - * - * If partial slabs exist, then new allocations come from these slabs, - * otherwise from empty slabs or new slabs are allocated. - * - * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache - * during kmem_cache_destroy(). The caller must prevent concurrent allocs. - * - * Each cache has a short per-cpu head array, most allocs - * and frees go into that array, and if that array overflows, then 1/2 - * of the entries in the array are given back into the global cache. - * The head array is strictly LIFO and should improve the cache hit rates. - * On SMP, it additionally reduces the spinlock operations. - * - * The c_cpuarray may not be read with enabled local interrupts - - * it's changed with a smp_call_function(). - * - * SMP synchronization: - * constructors and destructors are called without any locking. - * Several members in struct kmem_cache and struct slab never change, they - * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking, - * and local interrupts are disabled so slab code is preempt-safe. - * The non-constant members are protected with a per-cache irq spinlock. - * - * Many thanks to Mark Hemment, who wrote another per-cpu slab patch - * in 2000 - many ideas in the current implementation are derived from - * his patch. - * - * Further notes from the original documentation: - * - * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the mutex 'slab_mutex'. - * The sem is only needed when accessing/extending the cache-chain, which - * can never happen inside an interrupt (kmem_cache_create(), - * kmem_cache_shrink() and kmem_cache_reap()). - * - * At present, each engine can be growing a cache. This should be blocked. - * - * 15 March 2005. NUMA slab allocator. - * Shai Fultheim . - * Shobhit Dayal - * Alok N Kataria - * Christoph Lameter - * - * Modified the slab allocator to be node aware on NUMA systems. - * Each node has its own list of partial, free and full slabs. - * All object allocations for a node occur from node specific slab lists. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include "internal.h" - -#include "slab.h" - -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif - -/* Shouldn't this be in a header file somewhere? */ -#define BYTES_PER_WORD sizeof(void *) -#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) - -#ifndef ARCH_KMALLOC_FLAGS -#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN -#endif - -#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ - <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) - -#if FREELIST_BYTE_INDEX -typedef unsigned char freelist_idx_t; -#else -typedef unsigned short freelist_idx_t; -#endif - -#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) - -/* - * struct array_cache - * - * Purpose: - * - LIFO ordering, to hand out cache-warm objects from _alloc - * - reduce the number of linked list operations - * - reduce spinlock operations - * - * The limit is stored in the per-cpu structure to reduce the data cache - * footprint. - * - */ -struct array_cache { - unsigned int avail; - unsigned int limit; - unsigned int batchcount; - unsigned int touched; - void *entry[]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - */ -}; - -struct alien_cache { - spinlock_t lock; - struct array_cache ac; -}; - -/* - * Need this for bootstrapping a per node allocator. - */ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES) -static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; -#define CACHE_CACHE 0 -#define SIZE_NODE (MAX_NUMNODES) - -static int drain_freelist(struct kmem_cache *cache, - struct kmem_cache_node *n, int tofree); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node, struct list_head *list); -static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); -static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); -static void cache_reap(struct work_struct *unused); - -static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, - void **list); -static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct slab *slab, - void **list); - -#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) - -static void kmem_cache_node_init(struct kmem_cache_node *parent) -{ - INIT_LIST_HEAD(&parent->slabs_full); - INIT_LIST_HEAD(&parent->slabs_partial); - INIT_LIST_HEAD(&parent->slabs_free); - parent->total_slabs = 0; - parent->free_slabs = 0; - parent->shared = NULL; - parent->alien = NULL; - parent->colour_next = 0; - raw_spin_lock_init(&parent->list_lock); - parent->free_objects = 0; - parent->free_touched = 0; -} - -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&get_node(cachep, nodeid)->slab, listp); \ - } while (0) - -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ - MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ - } while (0) - -#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) -#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) -#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) -#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) - -#define BATCHREFILL_LIMIT 16 -/* - * Optimization question: fewer reaps means less probability for unnecessary - * cpucache drain/refill cycles. - * - * OTOH the cpuarrays can contain lots of objects, - * which could lock up otherwise freeable slabs. - */ -#define REAPTIMEOUT_AC (2*HZ) -#define REAPTIMEOUT_NODE (4*HZ) - -#if STATS -#define STATS_INC_ACTIVE(x) ((x)->num_active++) -#define STATS_DEC_ACTIVE(x) ((x)->num_active--) -#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) -#define STATS_SET_HIGH(x) \ - do { \ - if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) -#define STATS_INC_ERR(x) ((x)->errors++) -#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) -#define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) -#define STATS_SET_FREEABLE(x, i) \ - do { \ - if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) -#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) -#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) -#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) -#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) -#else -#define STATS_INC_ACTIVE(x) do { } while (0) -#define STATS_DEC_ACTIVE(x) do { } while (0) -#define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) -#define STATS_SET_HIGH(x) do { } while (0) -#define STATS_INC_ERR(x) do { } while (0) -#define STATS_INC_NODEALLOCS(x) do { } while (0) -#define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_INC_ACOVERFLOW(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) do { } while (0) -#define STATS_INC_ALLOCHIT(x) do { } while (0) -#define STATS_INC_ALLOCMISS(x) do { } while (0) -#define STATS_INC_FREEHIT(x) do { } while (0) -#define STATS_INC_FREEMISS(x) do { } while (0) -#endif - -#if DEBUG - -/* - * memory layout of objects: - * 0 : objp - * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that - * the end of an object is aligned with the end of the real - * allocation. Catches writes behind the end of the allocation. - * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: - * redzone word. - * cachep->obj_offset: The real object. - * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->size - 1* BYTES_PER_WORD: last caller address - * [BYTES_PER_WORD long] - */ -static int obj_offset(struct kmem_cache *cachep) -{ - return cachep->obj_offset; -} - -static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long long *) (objp + obj_offset(cachep) - - sizeof(unsigned long long)); -} - -static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - if (cachep->flags & SLAB_STORE_USER) - return (unsigned long long *)(objp + cachep->size - - sizeof(unsigned long long) - - REDZONE_ALIGN); - return (unsigned long long *) (objp + cachep->size - - sizeof(unsigned long long)); -} - -static void **dbg_userword(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->size - BYTES_PER_WORD); -} - -#else - -#define obj_offset(x) 0 -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) - -#endif - -/* - * Do not go above this order unless 0 objects fit into the slab or - * overridden on the command line. - */ -#define SLAB_MAX_ORDER_HI 1 -#define SLAB_MAX_ORDER_LO 0 -static int slab_max_order = SLAB_MAX_ORDER_LO; -static bool slab_max_order_set __initdata; - -static inline void *index_to_obj(struct kmem_cache *cache, - const struct slab *slab, unsigned int idx) -{ - return slab->s_mem + cache->size * idx; -} - -#define BOOT_CPUCACHE_ENTRIES 1 -/* internal cache of cache description objs */ -static struct kmem_cache kmem_cache_boot = { - .batchcount = 1, - .limit = BOOT_CPUCACHE_ENTRIES, - .shared = 1, - .size = sizeof(struct kmem_cache), - .name = "kmem_cache", -}; - -static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); - -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) -{ - return this_cpu_ptr(cachep->cpu_cache); -} - -/* - * Calculate the number of objects and left-over bytes for a given buffer size. - */ -static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - slab_flags_t flags, size_t *left_over) -{ - unsigned int num; - size_t slab_size = PAGE_SIZE << gfporder; - - /* - * The slab management structure can be either off the slab or - * on it. For the latter case, the memory allocated for a - * slab is used for: - * - * - @buffer_size bytes for each object - * - One freelist_idx_t for each object - * - * We don't need to consider alignment of freelist because - * freelist will be at the end of slab page. The objects will be - * at the correct alignment. - * - * If the slab management structure is off the slab, then the - * alignment will already be calculated into the size. Because - * the slabs are all pages aligned, the objects will be at the - * correct alignment when allocated. - */ - if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { - num = slab_size / buffer_size; - *left_over = slab_size % buffer_size; - } else { - num = slab_size / (buffer_size + sizeof(freelist_idx_t)); - *left_over = slab_size % - (buffer_size + sizeof(freelist_idx_t)); - } - - return num; -} - -#if DEBUG -#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) - -static void __slab_error(const char *function, struct kmem_cache *cachep, - char *msg) -{ - pr_err("slab error in %s(): cache `%s': %s\n", - function, cachep->name, msg); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); -} -#endif - -/* - * By default on NUMA we use alien caches to stage the freeing of - * objects allocated from other nodes. This causes massive memory - * inefficiencies when using fake NUMA setup to split memory into a - * large number of small nodes, so it can be disabled on the command - * line - */ - -static int use_alien_caches __read_mostly = 1; -static int __init noaliencache_setup(char *s) -{ - use_alien_caches = 0; - return 1; -} -__setup("noaliencache", noaliencache_setup); - -static int __init slab_max_order_setup(char *str) -{ - get_option(&str, &slab_max_order); - slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER); - slab_max_order_set = true; - - return 1; -} -__setup("slab_max_order=", slab_max_order_setup); - -#ifdef CONFIG_NUMA -/* - * Special reaping functions for NUMA systems called from cache_reap(). - * These take care of doing round robin flushing of alien caches (containing - * objects freed on different nodes from which they were allocated) and the - * flushing of remote pcps by calling drain_node_pages. - */ -static DEFINE_PER_CPU(unsigned long, slab_reap_node); - -static void init_reap_node(int cpu) -{ - per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), - node_online_map); -} - -static void next_reap_node(void) -{ - int node = __this_cpu_read(slab_reap_node); - - node = next_node_in(node, node_online_map); - __this_cpu_write(slab_reap_node, node); -} - -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif - -/* - * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz - * via the workqueue/eventd. - * Add the CPU number into the expiration time to minimize the possibility of - * the CPUs getting into lockstep and contending for the global cache chain - * lock. - */ -static void start_cpu_timer(int cpu) -{ - struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); - - if (reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DEFERRABLE_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, - __round_jiffies_relative(HZ, cpu)); - } -} - -static void init_arraycache(struct array_cache *ac, int limit, int batch) -{ - if (ac) { - ac->avail = 0; - ac->limit = limit; - ac->batchcount = batch; - ac->touched = 0; - } -} - -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *ac = NULL; - - ac = kmalloc_node(memsize, gfp, node); - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); - init_arraycache(ac, entries, batchcount); - return ac; -} - -static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, - struct slab *slab, void *objp) -{ - struct kmem_cache_node *n; - int slab_node; - LIST_HEAD(list); - - slab_node = slab_nid(slab); - n = get_node(cachep, slab_node); - - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - - slabs_destroy(cachep, &list); -} - -/* - * Transfer objects in one arraycache to another. - * Locking must be handled by the caller. - * - * Return the number of entries transferred. - */ -static int transfer_objects(struct array_cache *to, - struct array_cache *from, unsigned int max) -{ - /* Figure out how many entries to transfer */ - int nr = min3(from->avail, max, to->limit - to->avail); - - if (!nr) - return 0; - - memcpy(to->entry + to->avail, from->entry + from->avail - nr, - sizeof(void *) *nr); - - from->avail -= nr; - to->avail += nr; - return nr; -} - -/* &alien->lock must be held by alien callers. */ -static __always_inline void __free_one(struct array_cache *ac, void *objp) -{ - /* Avoid trivial double-free. */ - if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) - return; - ac->entry[ac->avail++] = objp; -} - -#ifndef CONFIG_NUMA - -#define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, n) do { } while (0) - -static inline struct alien_cache **alloc_alien_cache(int node, - int limit, gfp_t gfp) -{ - return NULL; -} - -static inline void free_alien_cache(struct alien_cache **ac_ptr) -{ -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - return 0; -} - -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return flags & ~__GFP_NOFAIL; -} - -#else /* CONFIG_NUMA */ - -static struct alien_cache *__alloc_alien_cache(int node, int entries, - int batch, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); - struct alien_cache *alc = NULL; - - alc = kmalloc_node(memsize, gfp, node); - if (alc) { - kmemleak_no_scan(alc); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); - } - return alc; -} - -static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) -{ - struct alien_cache **alc_ptr; - int i; - - if (limit > 1) - limit = 12; - alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); - if (!alc_ptr) - return NULL; - - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); - if (!alc_ptr[i]) { - for (i--; i >= 0; i--) - kfree(alc_ptr[i]); - kfree(alc_ptr); - return NULL; - } - } - return alc_ptr; -} - -static void free_alien_cache(struct alien_cache **alc_ptr) -{ - int i; - - if (!alc_ptr) - return; - for_each_node(i) - kfree(alc_ptr[i]); - kfree(alc_ptr); -} - -static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node, - struct list_head *list) -{ - struct kmem_cache_node *n = get_node(cachep, node); - - if (ac->avail) { - raw_spin_lock(&n->list_lock); - /* - * Stuff objects into the remote nodes shared array first. - * That way we could avoid the overhead of putting the objects - * into the free lists and getting them back later. - */ - if (n->shared) - transfer_objects(n->shared, ac, ac->limit); - - free_block(cachep, ac->entry, ac->avail, node, list); - ac->avail = 0; - raw_spin_unlock(&n->list_lock); - } -} - -/* - * Called from cache_reap() to regularly drain alien caches round robin. - */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) -{ - int node = __this_cpu_read(slab_reap_node); - - if (n->alien) { - struct alien_cache *alc = n->alien[node]; - struct array_cache *ac; - - if (alc) { - ac = &alc->ac; - if (ac->avail && spin_trylock_irq(&alc->lock)) { - LIST_HEAD(list); - - __drain_alien_cache(cachep, ac, node, &list); - spin_unlock_irq(&alc->lock); - slabs_destroy(cachep, &list); - } - } - } -} - -static void drain_alien_cache(struct kmem_cache *cachep, - struct alien_cache **alien) -{ - int i = 0; - struct alien_cache *alc; - struct array_cache *ac; - unsigned long flags; - - for_each_online_node(i) { - alc = alien[i]; - if (alc) { - LIST_HEAD(list); - - ac = &alc->ac; - spin_lock_irqsave(&alc->lock, flags); - __drain_alien_cache(cachep, ac, i, &list); - spin_unlock_irqrestore(&alc->lock, flags); - slabs_destroy(cachep, &list); - } - } -} - -static int __cache_free_alien(struct kmem_cache *cachep, void *objp, - int node, int slab_node) -{ - struct kmem_cache_node *n; - struct alien_cache *alien = NULL; - struct array_cache *ac; - LIST_HEAD(list); - - n = get_node(cachep, node); - STATS_INC_NODEFREES(cachep); - if (n->alien && n->alien[slab_node]) { - alien = n->alien[slab_node]; - ac = &alien->ac; - spin_lock(&alien->lock); - if (unlikely(ac->avail == ac->limit)) { - STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, slab_node, &list); - } - __free_one(ac, objp); - spin_unlock(&alien->lock); - slabs_destroy(cachep, &list); - } else { - n = get_node(cachep, slab_node); - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); - } - return 1; -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - int slab_node = slab_nid(virt_to_slab(objp)); - int node = numa_mem_id(); - /* - * Make sure we are not freeing an object from another node to the array - * cache on this cpu. - */ - if (likely(node == slab_node)) - return 0; - - return __cache_free_alien(cachep, objp, node, slab_node); -} - -/* - * Construct gfp mask to allocate from a specific node but do not reclaim or - * warn about failures. - */ -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); -} -#endif - -static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) -{ - struct kmem_cache_node *n; - - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (n) { - raw_spin_lock_irq(&n->list_lock); - n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + - cachep->num; - raw_spin_unlock_irq(&n->list_lock); - - return 0; - } - - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) - return -ENOMEM; - - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - n->free_limit = - (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex provides sufficient - * protection here. - */ - cachep->node[node] = n; - - return 0; -} - -#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) -/* - * Allocates and initializes node for a node on each slab cache, used for - * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node - * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodes are not replaced if - * already in use. - * - * Must hold slab_mutex. - */ -static int init_cache_node_node(int node) -{ - int ret; - struct kmem_cache *cachep; - - list_for_each_entry(cachep, &slab_caches, list) { - ret = init_cache_node(cachep, node, GFP_KERNEL); - if (ret) - return ret; - } - - return 0; -} -#endif - -static int setup_kmem_cache_node(struct kmem_cache *cachep, - int node, gfp_t gfp, bool force_change) -{ - int ret = -ENOMEM; - struct kmem_cache_node *n; - struct array_cache *old_shared = NULL; - struct array_cache *new_shared = NULL; - struct alien_cache **new_alien = NULL; - LIST_HEAD(list); - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); - if (!new_shared) - goto fail; - } - - ret = init_cache_node(cachep, node, gfp); - if (ret) - goto fail; - - n = get_node(cachep, node); - raw_spin_lock_irq(&n->list_lock); - if (n->shared && force_change) { - free_block(cachep, n->shared->entry, - n->shared->avail, node, &list); - n->shared->avail = 0; - } - - if (!n->shared || force_change) { - old_shared = n->shared; - n->shared = new_shared; - new_shared = NULL; - } - - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - - raw_spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - - /* - * To protect lockless access to n->shared during irq disabled context. - * If n->shared isn't NULL in irq disabled context, accessing to it is - * guaranteed to be valid until irq is re-enabled, because it will be - * freed after synchronize_rcu(). - */ - if (old_shared && force_change) - synchronize_rcu(); - -fail: - kfree(old_shared); - kfree(new_shared); - free_alien_cache(new_alien); - - return ret; -} - -#ifdef CONFIG_SMP - -static void cpuup_canceled(long cpu) -{ - struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; - int node = cpu_to_mem(cpu); - const struct cpumask *mask = cpumask_of_node(node); - - list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *nc; - struct array_cache *shared; - struct alien_cache **alien; - LIST_HEAD(list); - - n = get_node(cachep, node); - if (!n) - continue; - - raw_spin_lock_irq(&n->list_lock); - - /* Free limit for this kmem_cache_node */ - n->free_limit -= cachep->batchcount; - - /* cpu is dead; no one can alloc from it. */ - nc = per_cpu_ptr(cachep->cpu_cache, cpu); - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - - if (!cpumask_empty(mask)) { - raw_spin_unlock_irq(&n->list_lock); - goto free_slab; - } - - shared = n->shared; - if (shared) { - free_block(cachep, shared->entry, - shared->avail, node, &list); - n->shared = NULL; - } - - alien = n->alien; - n->alien = NULL; - - raw_spin_unlock_irq(&n->list_lock); - - kfree(shared); - if (alien) { - drain_alien_cache(cachep, alien); - free_alien_cache(alien); - } - -free_slab: - slabs_destroy(cachep, &list); - } - /* - * In the previous loop, all the objects were freed to - * the respective cache's slabs, now we can go ahead and - * shrink each nodelist to its limit. - */ - list_for_each_entry(cachep, &slab_caches, list) { - n = get_node(cachep, node); - if (!n) - continue; - drain_freelist(cachep, n, INT_MAX); - } -} - -static int cpuup_prepare(long cpu) -{ - struct kmem_cache *cachep; - int node = cpu_to_mem(cpu); - int err; - - /* - * We need to do this right in the beginning since - * alloc_arraycache's are going to use this list. - * kmalloc_node allows us to add the slab to the right - * kmem_cache_node and not this cpu's kmem_cache_node - */ - err = init_cache_node_node(node); - if (err < 0) - goto bad; - - /* - * Now we can go ahead with allocating the shared arrays and - * array caches - */ - list_for_each_entry(cachep, &slab_caches, list) { - err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); - if (err) - goto bad; - } - - return 0; -bad: - cpuup_canceled(cpu); - return -ENOMEM; -} - -int slab_prepare_cpu(unsigned int cpu) -{ - int err; - - mutex_lock(&slab_mutex); - err = cpuup_prepare(cpu); - mutex_unlock(&slab_mutex); - return err; -} - -/* - * This is called for a failed online attempt and for a successful - * offline. - * - * Even if all the cpus of a node are down, we don't free the - * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and - * a kmalloc allocation from another cpu for memory from the node of - * the cpu going down. The kmem_cache_node structure is usually allocated from - * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). - */ -int slab_dead_cpu(unsigned int cpu) -{ - mutex_lock(&slab_mutex); - cpuup_canceled(cpu); - mutex_unlock(&slab_mutex); - return 0; -} -#endif - -static int slab_online_cpu(unsigned int cpu) -{ - start_cpu_timer(cpu); - return 0; -} - -static int slab_offline_cpu(unsigned int cpu) -{ - /* - * Shutdown cache reaper. Note that the slab_mutex is held so - * that if cache_reap() is invoked it cannot do anything - * expensive but will only modify reap_work and reschedule the - * timer. - */ - cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); - /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(slab_reap_work, cpu).work.func = NULL; - return 0; -} - -#if defined(CONFIG_NUMA) -/* - * Drains freelist for a node on each slab cache, used for memory hot-remove. - * Returns -EBUSY if all objects cannot be drained so that the node is not - * removed. - * - * Must hold slab_mutex. - */ -static int __meminit drain_cache_node_node(int node) -{ - struct kmem_cache *cachep; - int ret = 0; - - list_for_each_entry(cachep, &slab_caches, list) { - struct kmem_cache_node *n; - - n = get_node(cachep, node); - if (!n) - continue; - - drain_freelist(cachep, n, INT_MAX); - - if (!list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial)) { - ret = -EBUSY; - break; - } - } - return ret; -} - -static int __meminit slab_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int ret = 0; - int nid; - - nid = mnb->status_change_nid; - if (nid < 0) - goto out; - - switch (action) { - case MEM_GOING_ONLINE: - mutex_lock(&slab_mutex); - ret = init_cache_node_node(nid); - mutex_unlock(&slab_mutex); - break; - case MEM_GOING_OFFLINE: - mutex_lock(&slab_mutex); - ret = drain_cache_node_node(nid); - mutex_unlock(&slab_mutex); - break; - case MEM_ONLINE: - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - break; - } -out: - return notifier_from_errno(ret); -} -#endif /* CONFIG_NUMA */ - -/* - * swap the static kmem_cache_node with kmalloced memory - */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, - int nodeid) -{ - struct kmem_cache_node *ptr; - - ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); - BUG_ON(!ptr); - - memcpy(ptr, list, sizeof(struct kmem_cache_node)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - raw_spin_lock_init(&ptr->list_lock); - - MAKE_ALL_LISTS(cachep, ptr, nodeid); - cachep->node[nodeid] = ptr; -} - -/* - * For setting up all the kmem_cache_node for cache whose buffer_size is same as - * size of kmem_cache_node. - */ -static void __init set_up_node(struct kmem_cache *cachep, int index) -{ - int node; - - for_each_online_node(node) { - cachep->node[node] = &init_kmem_cache_node[index + node]; - cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - } -} - -/* - * Initialisation. Called after the page allocator have been initialised and - * before smp_init(). - */ -void __init kmem_cache_init(void) -{ - int i; - - kmem_cache = &kmem_cache_boot; - - if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) - use_alien_caches = 0; - - for (i = 0; i < NUM_INIT_LISTS; i++) - kmem_cache_node_init(&init_kmem_cache_node[i]); - - /* - * Fragmentation resistance on low memory - only use bigger - * page orders on machines with more than 32MB of memory if - * not overridden on the command line. - */ - if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) - slab_max_order = SLAB_MAX_ORDER_HI; - - /* Bootstrap is tricky, because several objects are allocated - * from caches that do not exist yet: - * 1) initialize the kmem_cache cache: it contains the struct - * kmem_cache structures of all caches, except kmem_cache itself: - * kmem_cache is statically allocated. - * Initially an __init data area is used for the head array and the - * kmem_cache_node structures, it's replaced with a kmalloc allocated - * array at the end of the bootstrap. - * 2) Create the first kmalloc cache. - * The struct kmem_cache for the new cache is allocated normally. - * An __init data area is used for the head array. - * 3) Create the remaining kmalloc caches, with minimally sized - * head arrays. - * 4) Replace the __init data head arrays for kmem_cache and the first - * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_cache_node for kmem_cache and - * the other cache's with kmalloc allocated memory. - * 6) Resize the head arrays of the kmalloc caches to their final sizes. - */ - - /* 1) create the kmem_cache */ - - /* - * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids - */ - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); - list_add(&kmem_cache->list, &slab_caches); - slab_state = PARTIAL; - - /* - * Initialize the caches that provide memory for the kmem_cache_node - * structures first. Without this, further allocations will bug. - */ - new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS); - slab_state = PARTIAL_NODE; - setup_kmalloc_cache_index_table(); - - /* 5) Replace the bootstrap kmem_cache_node */ - { - int nid; - - for_each_online_node(nid) { - init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - - init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], - &init_kmem_cache_node[SIZE_NODE + nid], nid); - } - } - - create_kmalloc_caches(ARCH_KMALLOC_FLAGS); -} - -void __init kmem_cache_init_late(void) -{ - struct kmem_cache *cachep; - - /* 6) resize the head arrays to their final sizes */ - mutex_lock(&slab_mutex); - list_for_each_entry(cachep, &slab_caches, list) - if (enable_cpucache(cachep, GFP_NOWAIT)) - BUG(); - mutex_unlock(&slab_mutex); - - /* Done! */ - slab_state = FULL; - -#ifdef CONFIG_NUMA - /* - * Register a memory hotplug callback that initializes and frees - * node. - */ - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif - - /* - * The reap timers are started later, with a module init call: That part - * of the kernel is not yet operational. - */ -} - -static int __init cpucache_init(void) -{ - int ret; - - /* - * Register the timers that return unneeded pages to the page allocator - */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", - slab_online_cpu, slab_offline_cpu); - WARN_ON(ret < 0); - - return 0; -} -__initcall(cpucache_init); - -static noinline void -slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) -{ -#if DEBUG - struct kmem_cache_node *n; - unsigned long flags; - int node; - static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) - return; - - pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nodeid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %d, order: %d\n", - cachep->name, cachep->size, cachep->gfporder); - - for_each_kmem_cache_node(cachep, node, n) { - unsigned long total_slabs, free_slabs, free_objs; - - raw_spin_lock_irqsave(&n->list_lock, flags); - total_slabs = n->total_slabs; - free_slabs = n->free_slabs; - free_objs = n->free_objects; - raw_spin_unlock_irqrestore(&n->list_lock, flags); - - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", - node, total_slabs - free_slabs, total_slabs, - (total_slabs * cachep->num) - free_objs, - total_slabs * cachep->num); - } -#endif -} - -/* - * Interface to system's page allocator. No need to hold the - * kmem_cache_node ->list_lock. - * - * If we requested dmaable memory, we will get it. Even if we - * did not request dmaable memory, we might get it, but that - * would be relatively rare and ignorable. - */ -static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, - int nodeid) -{ - struct folio *folio; - struct slab *slab; - - flags |= cachep->allocflags; - - folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder); - if (!folio) { - slab_out_of_memory(cachep, flags, nodeid); - return NULL; - } - - slab = folio_slab(folio); - - account_slab(slab, cachep->gfporder, cachep, flags); - __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (sk_memalloc_socks() && folio_is_pfmemalloc(folio)) - slab_set_pfmemalloc(slab); - - return slab; -} - -/* - * Interface to system's page release. - */ -static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) -{ - int order = cachep->gfporder; - struct folio *folio = slab_folio(slab); - - BUG_ON(!folio_test_slab(folio)); - __slab_clear_pfmemalloc(slab); - page_mapcount_reset(&folio->page); - folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); - __folio_clear_slab(folio); - - mm_account_reclaimed_pages(1 << order); - unaccount_slab(slab, order, cachep); - __free_pages(&folio->page, order); -} - -static void kmem_rcu_free(struct rcu_head *head) -{ - struct kmem_cache *cachep; - struct slab *slab; - - slab = container_of(head, struct slab, rcu_head); - cachep = slab->slab_cache; - - kmem_freepages(cachep, slab); -} - -#if DEBUG -static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep) -{ - return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && - ((cachep->size % PAGE_SIZE) == 0); -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) -{ - if (!is_debug_pagealloc_cache(cachep)) - return; - - __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); -} - -#else -static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map) {} - -#endif - -static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) -{ - int size = cachep->object_size; - addr = &((char *)addr)[obj_offset(cachep)]; - - memset(addr, val, size); - *(unsigned char *)(addr + size - 1) = POISON_END; -} - -static void dump_line(char *data, int offset, int limit) -{ - int i; - unsigned char error = 0; - int bad_count = 0; - - pr_err("%03x: ", offset); - for (i = 0; i < limit; i++) { - if (data[offset + i] != POISON_FREE) { - error = data[offset + i]; - bad_count++; - } - } - print_hex_dump(KERN_CONT, "", 0, 16, 1, - &data[offset], limit, 1); - - if (bad_count == 1) { - error ^= POISON_FREE; - if (!(error & (error - 1))) { - pr_err("Single bit error detected. Probably bad RAM.\n"); -#ifdef CONFIG_X86 - pr_err("Run memtest86+ or a similar memory test tool.\n"); -#else - pr_err("Run a memory test tool.\n"); -#endif - } - } -} -#endif - -#if DEBUG - -static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) -{ - int i, size; - char *realobj; - - if (cachep->flags & SLAB_RED_ZONE) { - pr_err("Redzone: 0x%llx/0x%llx\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); - } - - if (cachep->flags & SLAB_STORE_USER) - pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); - realobj = (char *)objp + obj_offset(cachep); - size = cachep->object_size; - for (i = 0; i < size && lines; i += 16, lines--) { - int limit; - limit = 16; - if (i + limit > size) - limit = size - i; - dump_line(realobj, i, limit); - } -} - -static void check_poison_obj(struct kmem_cache *cachep, void *objp) -{ - char *realobj; - int size, i; - int lines = 0; - - if (is_debug_pagealloc_cache(cachep)) - return; - - realobj = (char *)objp + obj_offset(cachep); - size = cachep->object_size; - - for (i = 0; i < size; i++) { - char exp = POISON_FREE; - if (i == size - 1) - exp = POISON_END; - if (realobj[i] != exp) { - int limit; - /* Mismatch ! */ - /* Print header */ - if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%px, len=%d\n", - print_tainted(), cachep->name, - realobj, size); - print_objinfo(cachep, objp, 0); - } - /* Hexdump the affected line */ - i = (i / 16) * 16; - limit = 16; - if (i + limit > size) - limit = size - i; - dump_line(realobj, i, limit); - i += 16; - lines++; - /* Limit to 5 lines */ - if (lines > 5) - break; - } - } - if (lines != 0) { - /* Print some data about the neighboring objects, if they - * exist: - */ - struct slab *slab = virt_to_slab(objp); - unsigned int objnr; - - objnr = obj_to_index(cachep, slab, objp); - if (objnr) { - objp = index_to_obj(cachep, slab, objnr - 1); - realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%px, len=%d\n", realobj, size); - print_objinfo(cachep, objp, 2); - } - if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, slab, objnr + 1); - realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%px, len=%d\n", realobj, size); - print_objinfo(cachep, objp, 2); - } - } -} -#endif - -#if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct slab *slab) -{ - int i; - - if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { - poison_obj(cachep, slab->freelist - obj_offset(cachep), - POISON_FREE); - } - - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slab, i); - - if (cachep->flags & SLAB_POISON) { - check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); - } - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object was overwritten"); - if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object was overwritten"); - } - } -} -#else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct slab *slab) -{ -} -#endif - -/** - * slab_destroy - destroy and release all objects in a slab - * @cachep: cache pointer being destroyed - * @slab: slab being destroyed - * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * kmem_cache_node ->list_lock is not held/needed. - */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) -{ - void *freelist; - - freelist = slab->freelist; - slab_destroy_debugcheck(cachep, slab); - if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) - call_rcu(&slab->rcu_head, kmem_rcu_free); - else - kmem_freepages(cachep, slab); - - /* - * From now on, we don't use freelist - * although actual page can be freed in rcu context - */ - if (OFF_SLAB(cachep)) - kfree(freelist); -} - -/* - * Update the size of the caches before calling slabs_destroy as it may - * recursively call kfree. - */ -static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) -{ - struct slab *slab, *n; - - list_for_each_entry_safe(slab, n, list, slab_list) { - list_del(&slab->slab_list); - slab_destroy(cachep, slab); - } -} - -/** - * calculate_slab_order - calculate size (page order) of slabs - * @cachep: pointer to the cache that is being created - * @size: size of objects to be created in this cache. - * @flags: slab allocation flags - * - * Also calculates the number of objects per slab. - * - * This could be made much more intelligent. For now, try to avoid using - * high order pages for slabs. When the gfp() functions are more friendly - * towards high-order requests, this should be changed. - * - * Return: number of left-over bytes in a slab - */ -static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left_over = 0; - int gfporder; - - for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { - unsigned int num; - size_t remainder; - - num = cache_estimate(gfporder, size, flags, &remainder); - if (!num) - continue; - - /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ - if (num > SLAB_OBJ_MAX_NUM) - break; - - if (flags & CFLGS_OFF_SLAB) { - struct kmem_cache *freelist_cache; - size_t freelist_size; - size_t freelist_cache_size; - - freelist_size = num * sizeof(freelist_idx_t); - if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { - freelist_cache_size = PAGE_SIZE << get_order(freelist_size); - } else { - freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_); - if (!freelist_cache) - continue; - freelist_cache_size = freelist_cache->size; - - /* - * Needed to avoid possible looping condition - * in cache_grow_begin() - */ - if (OFF_SLAB(freelist_cache)) - continue; - } - - /* check if off slab has enough benefit */ - if (freelist_cache_size > cachep->size / 2) - continue; - } - - /* Found something acceptable - save it away */ - cachep->num = num; - cachep->gfporder = gfporder; - left_over = remainder; - - /* - * A VFS-reclaimable slab tends to have most allocations - * as GFP_NOFS and we really don't want to have to be allocating - * higher-order pages when we are unable to shrink dcache. - */ - if (flags & SLAB_RECLAIM_ACCOUNT) - break; - - /* - * Large number of objects is good, but very large slabs are - * currently bad for the gfp()s. - */ - if (gfporder >= slab_max_order) - break; - - /* - * Acceptable internal fragmentation? - */ - if (left_over * 8 <= (PAGE_SIZE << gfporder)) - break; - } - return left_over; -} - -static struct array_cache __percpu *alloc_kmem_cache_cpus( - struct kmem_cache *cachep, int entries, int batchcount) -{ - int cpu; - size_t size; - struct array_cache __percpu *cpu_cache; - - size = sizeof(void *) * entries + sizeof(struct array_cache); - cpu_cache = __alloc_percpu(size, sizeof(void *)); - - if (!cpu_cache) - return NULL; - - for_each_possible_cpu(cpu) { - init_arraycache(per_cpu_ptr(cpu_cache, cpu), - entries, batchcount); - } - - return cpu_cache; -} - -static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - if (slab_state >= FULL) - return enable_cpucache(cachep, gfp); - - cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); - if (!cachep->cpu_cache) - return 1; - - if (slab_state == DOWN) { - /* Creation of first cache (kmem_cache). */ - set_up_node(kmem_cache, CACHE_CACHE); - } else if (slab_state == PARTIAL) { - /* For kmem_cache_node */ - set_up_node(cachep, SIZE_NODE); - } else { - int node; - - for_each_online_node(node) { - cachep->node[node] = kmalloc_node( - sizeof(struct kmem_cache_node), gfp, node); - BUG_ON(!cachep->node[node]); - kmem_cache_node_init(cachep->node[node]); - } - } - - cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - return 0; -} - -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) -{ - return flags; -} - -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ - struct kmem_cache *cachep; - - cachep = find_mergeable(size, align, flags, name, ctor); - if (cachep) { - cachep->refcount++; - - /* - * Adjust the object sizes so that we clear - * the complete object on kzalloc. - */ - cachep->object_size = max_t(int, cachep->object_size, size); - } - return cachep; -} - -static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - /* - * If slab auto-initialization on free is enabled, store the freelist - * off-slab, so that its contents don't end up in one of the allocated - * objects. - */ - if (unlikely(slab_want_init_on_free(cachep))) - return false; - - if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) - return false; - - left = calculate_slab_order(cachep, size, - flags | CFLGS_OBJFREELIST_SLAB); - if (!cachep->num) - return false; - - if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - /* - * Always use on-slab management when SLAB_NOLEAKTRACE - * to avoid recursive calls into kmemleak. - */ - if (flags & SLAB_NOLEAKTRACE) - return false; - - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); - if (!cachep->num) - return false; - - /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. - */ - if (left >= cachep->num * sizeof(freelist_idx_t)) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - left = calculate_slab_order(cachep, size, flags); - if (!cachep->num) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -/* - * __kmem_cache_create - Create a cache. - * @cachep: cache management descriptor - * @flags: SLAB flags - * - * Returns zero on success, nonzero on failure. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - */ -int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) -{ - size_t ralign = BYTES_PER_WORD; - gfp_t gfp; - int err; - unsigned int size = cachep->size; - -#if DEBUG -#if FORCED_DEBUG - /* - * Enable redzoning and last user accounting, except for caches with - * large objects, if the increased size would increase the object size - * above the next power of two: caches with object sizes just above a - * power of two have a significant amount of internal fragmentation. - */ - if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + - 2 * sizeof(unsigned long long))) - flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_TYPESAFE_BY_RCU)) - flags |= SLAB_POISON; -#endif -#endif - - /* - * Check that size is in terms of words. This is needed to avoid - * unaligned accesses for some archs when redzoning is used, and makes - * sure any on-slab bufctl's are also correctly aligned. - */ - size = ALIGN(size, BYTES_PER_WORD); - - if (flags & SLAB_RED_ZONE) { - ralign = REDZONE_ALIGN; - /* If redzoning, ensure that the second redzone is suitably - * aligned, by adjusting the object size accordingly. */ - size = ALIGN(size, REDZONE_ALIGN); - } - - /* 3) caller mandated alignment */ - if (ralign < cachep->align) { - ralign = cachep->align; - } - /* disable debug if necessary */ - if (ralign > __alignof__(unsigned long long)) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); - /* - * 4) Store it. - */ - cachep->align = ralign; - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - - if (slab_is_available()) - gfp = GFP_KERNEL; - else - gfp = GFP_NOWAIT; - -#if DEBUG - - /* - * Both debugging options require word-alignment which is calculated - * into align above. - */ - if (flags & SLAB_RED_ZONE) { - /* add space for red zone words */ - cachep->obj_offset += sizeof(unsigned long long); - size += 2 * sizeof(unsigned long long); - } - if (flags & SLAB_STORE_USER) { - /* user store requires one word storage behind the end of - * the real object. But if the second red zone needs to be - * aligned to 64 bits, we must allow that much space. - */ - if (flags & SLAB_RED_ZONE) - size += REDZONE_ALIGN; - else - size += BYTES_PER_WORD; - } -#endif - - kasan_cache_create(cachep, &size, &flags); - - size = ALIGN(size, cachep->align); - /* - * We should restrict the number of objects in a slab to implement - * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. - */ - if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) - size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - -#if DEBUG - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && - size >= 256 && cachep->object_size > cache_line_size()) { - if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { - size_t tmp_size = ALIGN(size, PAGE_SIZE); - - if (set_off_slab_cache(cachep, tmp_size, flags)) { - flags |= CFLGS_OFF_SLAB; - cachep->obj_offset += tmp_size - size; - size = tmp_size; - goto done; - } - } - } -#endif - - if (set_objfreelist_slab_cache(cachep, size, flags)) { - flags |= CFLGS_OBJFREELIST_SLAB; - goto done; - } - - if (set_off_slab_cache(cachep, size, flags)) { - flags |= CFLGS_OFF_SLAB; - goto done; - } - - if (set_on_slab_cache(cachep, size, flags)) - goto done; - - return -E2BIG; - -done: - cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); - cachep->flags = flags; - cachep->allocflags = __GFP_COMP; - if (flags & SLAB_CACHE_DMA) - cachep->allocflags |= GFP_DMA; - if (flags & SLAB_CACHE_DMA32) - cachep->allocflags |= GFP_DMA32; - if (flags & SLAB_RECLAIM_ACCOUNT) - cachep->allocflags |= __GFP_RECLAIMABLE; - cachep->size = size; - cachep->reciprocal_buffer_size = reciprocal_value(size); - -#if DEBUG - /* - * If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (IS_ENABLED(CONFIG_PAGE_POISONING) && - (cachep->flags & SLAB_POISON) && - is_debug_pagealloc_cache(cachep)) - cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif - - err = setup_cpu_cache(cachep, gfp); - if (err) { - __kmem_cache_release(cachep); - return err; - } - - return 0; -} - -#if DEBUG -static void check_irq_off(void) -{ - BUG_ON(!irqs_disabled()); -} - -static void check_irq_on(void) -{ - BUG_ON(irqs_disabled()); -} - -static void check_mutex_acquired(void) -{ - BUG_ON(!mutex_is_locked(&slab_mutex)); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); -#endif -} - -static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_raw_spin_locked(&get_node(cachep, node)->list_lock); -#endif -} - -#else -#define check_irq_off() do { } while(0) -#define check_irq_on() do { } while(0) -#define check_mutex_acquired() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) -#define check_spinlock_acquired_node(x, y) do { } while(0) -#endif - -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int node, bool free_all, struct list_head *list) -{ - int tofree; - - if (!ac || !ac->avail) - return; - - tofree = free_all ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - - free_block(cachep, ac->entry, tofree, node, list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); -} - -static void do_drain(void *arg) -{ - struct kmem_cache *cachep = arg; - struct array_cache *ac; - int node = numa_mem_id(); - struct kmem_cache_node *n; - LIST_HEAD(list); - - check_irq_off(); - ac = cpu_cache_get(cachep); - n = get_node(cachep, node); - raw_spin_lock(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node, &list); - raw_spin_unlock(&n->list_lock); - ac->avail = 0; - slabs_destroy(cachep, &list); -} - -static void drain_cpu_caches(struct kmem_cache *cachep) -{ - struct kmem_cache_node *n; - int node; - LIST_HEAD(list); - - on_each_cpu(do_drain, cachep, 1); - check_irq_on(); - for_each_kmem_cache_node(cachep, node, n) - if (n->alien) - drain_alien_cache(cachep, n->alien); - - for_each_kmem_cache_node(cachep, node, n) { - raw_spin_lock_irq(&n->list_lock); - drain_array_locked(cachep, n->shared, node, true, &list); - raw_spin_unlock_irq(&n->list_lock); - - slabs_destroy(cachep, &list); - } -} - -/* - * Remove slabs from the list of free slabs. - * Specify the number of slabs to drain in tofree. - * - * Returns the actual number of slabs released. - */ -static int drain_freelist(struct kmem_cache *cache, - struct kmem_cache_node *n, int tofree) -{ - struct list_head *p; - int nr_freed; - struct slab *slab; - - nr_freed = 0; - while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - - raw_spin_lock_irq(&n->list_lock); - p = n->slabs_free.prev; - if (p == &n->slabs_free) { - raw_spin_unlock_irq(&n->list_lock); - goto out; - } - - slab = list_entry(p, struct slab, slab_list); - list_del(&slab->slab_list); - n->free_slabs--; - n->total_slabs--; - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ - n->free_objects -= cache->num; - raw_spin_unlock_irq(&n->list_lock); - slab_destroy(cache, slab); - nr_freed++; - - cond_resched(); - } -out: - return nr_freed; -} - -bool __kmem_cache_empty(struct kmem_cache *s) -{ - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(s, node, n) - if (!list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial)) - return false; - return true; -} - -int __kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret = 0; - int node; - struct kmem_cache_node *n; - - drain_cpu_caches(cachep); - - check_irq_on(); - for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, INT_MAX); - - ret += !list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial); - } - return (ret ? 1 : 0); -} - -int __kmem_cache_shutdown(struct kmem_cache *cachep) -{ - return __kmem_cache_shrink(cachep); -} - -void __kmem_cache_release(struct kmem_cache *cachep) -{ - int i; - struct kmem_cache_node *n; - - cache_random_seq_destroy(cachep); - - free_percpu(cachep->cpu_cache); - - /* NUMA: free the node structures */ - for_each_kmem_cache_node(cachep, i, n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - cachep->node[i] = NULL; - } -} - -/* - * Get the memory for a slab management obj. - * - * For a slab cache when the slab descriptor is off-slab, the - * slab descriptor can't come from the same cache which is being created, - * Because if it is the case, that means we defer the creation of - * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. - * And we eventually call down to __kmem_cache_create(), which - * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. - * This is a "chicken-and-egg" problem. - * - * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, - * which are all initialized during kmem_cache_init(). - */ -static void *alloc_slabmgmt(struct kmem_cache *cachep, - struct slab *slab, int colour_off, - gfp_t local_flags, int nodeid) -{ - void *freelist; - void *addr = slab_address(slab); - - slab->s_mem = addr + colour_off; - slab->active = 0; - - if (OBJFREELIST_SLAB(cachep)) - freelist = NULL; - else if (OFF_SLAB(cachep)) { - /* Slab management obj is off-slab. */ - freelist = kmalloc_node(cachep->freelist_size, - local_flags, nodeid); - } else { - /* We will use last bytes at the slab for freelist */ - freelist = addr + (PAGE_SIZE << cachep->gfporder) - - cachep->freelist_size; - } - - return freelist; -} - -static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx) -{ - return ((freelist_idx_t *) slab->freelist)[idx]; -} - -static inline void set_free_obj(struct slab *slab, - unsigned int idx, freelist_idx_t val) -{ - ((freelist_idx_t *)(slab->freelist))[idx] = val; -} - -static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab) -{ -#if DEBUG - int i; - - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slab, i); - - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = NULL; - - if (cachep->flags & SLAB_RED_ZONE) { - *dbg_redzone1(cachep, objp) = RED_INACTIVE; - *dbg_redzone2(cachep, objp) = RED_INACTIVE; - } - /* - * Constructors are not allowed to allocate memory from the same - * cache which they are a constructor for. Otherwise, deadlock. - * They must also be threaded. - */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { - kasan_unpoison_object_data(cachep, - objp + obj_offset(cachep)); - cachep->ctor(objp + obj_offset(cachep)); - kasan_poison_object_data( - cachep, objp + obj_offset(cachep)); - } - - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the end of an object"); - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the start of an object"); - } - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) { - poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); - } - } -#endif -} - -#ifdef CONFIG_SLAB_FREELIST_RANDOM -/* Hold information during a freelist initialization */ -struct freelist_init_state { - unsigned int pos; - unsigned int *list; - unsigned int count; -}; - -/* - * Initialize the state based on the randomization method available. - * return true if the pre-computed list is available, false otherwise. - */ -static bool freelist_state_initialize(struct freelist_init_state *state, - struct kmem_cache *cachep, - unsigned int count) -{ - bool ret; - if (!cachep->random_seq) { - ret = false; - } else { - state->list = cachep->random_seq; - state->count = count; - state->pos = get_random_u32_below(count); - ret = true; - } - return ret; -} - -/* Get the next entry on the list and randomize it using a random shift */ -static freelist_idx_t next_random_slot(struct freelist_init_state *state) -{ - if (state->pos >= state->count) - state->pos = 0; - return state->list[state->pos++]; -} - -/* Swap two freelist entries */ -static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b) -{ - swap(((freelist_idx_t *) slab->freelist)[a], - ((freelist_idx_t *) slab->freelist)[b]); -} - -/* - * Shuffle the freelist initialization state based on pre-computed lists. - * return true if the list was successfully shuffled, false otherwise. - */ -static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab) -{ - unsigned int objfreelist = 0, i, rand, count = cachep->num; - struct freelist_init_state state; - bool precomputed; - - if (count < 2) - return false; - - precomputed = freelist_state_initialize(&state, cachep, count); - - /* Take a random entry as the objfreelist */ - if (OBJFREELIST_SLAB(cachep)) { - if (!precomputed) - objfreelist = count - 1; - else - objfreelist = next_random_slot(&state); - slab->freelist = index_to_obj(cachep, slab, objfreelist) + - obj_offset(cachep); - count--; - } - - /* - * On early boot, generate the list dynamically. - * Later use a pre-computed list for speed. - */ - if (!precomputed) { - for (i = 0; i < count; i++) - set_free_obj(slab, i, i); - - /* Fisher-Yates shuffle */ - for (i = count - 1; i > 0; i--) { - rand = get_random_u32_below(i + 1); - swap_free_obj(slab, i, rand); - } - } else { - for (i = 0; i < count; i++) - set_free_obj(slab, i, next_random_slot(&state)); - } - - if (OBJFREELIST_SLAB(cachep)) - set_free_obj(slab, cachep->num - 1, objfreelist); - - return true; -} -#else -static inline bool shuffle_freelist(struct kmem_cache *cachep, - struct slab *slab) -{ - return false; -} -#endif /* CONFIG_SLAB_FREELIST_RANDOM */ - -static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slab) -{ - int i; - void *objp; - bool shuffled; - - cache_init_objs_debug(cachep, slab); - - /* Try to randomize the freelist if enabled */ - shuffled = shuffle_freelist(cachep, slab); - - if (!shuffled && OBJFREELIST_SLAB(cachep)) { - slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) + - obj_offset(cachep); - } - - for (i = 0; i < cachep->num; i++) { - objp = index_to_obj(cachep, slab, i); - objp = kasan_init_slab_obj(cachep, objp); - - /* constructor could break poison info */ - if (DEBUG == 0 && cachep->ctor) { - kasan_unpoison_object_data(cachep, objp); - cachep->ctor(objp); - kasan_poison_object_data(cachep, objp); - } - - if (!shuffled) - set_free_obj(slab, i, i); - } -} - -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) -{ - void *objp; - - objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); - slab->active++; - - return objp; -} - -static void slab_put_obj(struct kmem_cache *cachep, - struct slab *slab, void *objp) -{ - unsigned int objnr = obj_to_index(cachep, slab, objp); -#if DEBUG - unsigned int i; - - /* Verify double free bug */ - for (i = slab->active; i < cachep->num; i++) { - if (get_free_obj(slab, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %px\n", - cachep->name, objp); - BUG(); - } - } -#endif - slab->active--; - if (!slab->freelist) - slab->freelist = objp + obj_offset(cachep); - - set_free_obj(slab, slab->active, objnr); -} - -/* - * Grow (by 1) the number of slabs within a cache. This is called by - * kmem_cache_alloc() when there are no active objs left in a cache. - */ -static struct slab *cache_grow_begin(struct kmem_cache *cachep, - gfp_t flags, int nodeid) -{ - void *freelist; - size_t offset; - gfp_t local_flags; - int slab_node; - struct kmem_cache_node *n; - struct slab *slab; - - /* - * Be lazy and only check for valid flags here, keeping it out of the - * critical path in kmem_cache_alloc(). - */ - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - - check_irq_off(); - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - - /* - * Get mem for the objs. Attempt to allocate a physical page from - * 'nodeid'. - */ - slab = kmem_getpages(cachep, local_flags, nodeid); - if (!slab) - goto failed; - - slab_node = slab_nid(slab); - n = get_node(cachep, slab_node); - - /* Get colour for the slab, and cal the next value. */ - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - - offset = n->colour_next; - if (offset >= cachep->colour) - offset = 0; - - offset *= cachep->colour_off; - - /* - * Call kasan_poison_slab() before calling alloc_slabmgmt(), so - * page_address() in the latter returns a non-tagged pointer, - * as it should be for slab pages. - */ - kasan_poison_slab(slab); - - /* Get slab management. */ - freelist = alloc_slabmgmt(cachep, slab, offset, - local_flags & ~GFP_CONSTRAINT_MASK, slab_node); - if (OFF_SLAB(cachep) && !freelist) - goto opps1; - - slab->slab_cache = cachep; - slab->freelist = freelist; - - cache_init_objs(cachep, slab); - - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - - return slab; - -opps1: - kmem_freepages(cachep, slab); -failed: - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - return NULL; -} - -static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) -{ - struct kmem_cache_node *n; - void *list = NULL; - - check_irq_off(); - - if (!slab) - return; - - INIT_LIST_HEAD(&slab->slab_list); - n = get_node(cachep, slab_nid(slab)); - - raw_spin_lock(&n->list_lock); - n->total_slabs++; - if (!slab->active) { - list_add_tail(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else - fixup_slab_list(cachep, n, slab, &list); - - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num - slab->active; - raw_spin_unlock(&n->list_lock); - - fixup_objfreelist_debug(cachep, &list); -} - -#if DEBUG - -/* - * Perform extra freeing checks: - * - detect bad pointers. - * - POISON/RED_ZONE checking - */ -static void kfree_debugcheck(const void *objp) -{ - if (!virt_addr_valid(objp)) { - pr_err("kfree_debugcheck: out of range ptr %lxh\n", - (unsigned long)objp); - BUG(); - } -} - -static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) -{ - unsigned long long redzone1, redzone2; - - redzone1 = *dbg_redzone1(cache, obj); - redzone2 = *dbg_redzone2(cache, obj); - - /* - * Redzone is ok. - */ - if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) - return; - - if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) - slab_error(cache, "double free detected"); - else - slab_error(cache, "memory outside object was overwritten"); - - pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", - obj, redzone1, redzone2); -} - -static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - unsigned int objnr; - struct slab *slab; - - BUG_ON(virt_to_cache(objp) != cachep); - - objp -= obj_offset(cachep); - kfree_debugcheck(objp); - slab = virt_to_slab(objp); - - if (cachep->flags & SLAB_RED_ZONE) { - verify_redzone_free(cachep, objp); - *dbg_redzone1(cachep, objp) = RED_INACTIVE; - *dbg_redzone2(cachep, objp) = RED_INACTIVE; - } - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = (void *)caller; - - objnr = obj_to_index(cachep, slab, objp); - - BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slab, objnr)); - - if (cachep->flags & SLAB_POISON) { - poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); - } - return objp; -} - -#else -#define kfree_debugcheck(x) do { } while(0) -#define cache_free_debugcheck(x, objp, z) (objp) -#endif - -static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, - void **list) -{ -#if DEBUG - void *next = *list; - void *objp; - - while (next) { - objp = next - obj_offset(cachep); - next = *(void **)next; - poison_obj(cachep, objp, POISON_FREE); - } -#endif -} - -static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct slab *slab, - void **list) -{ - /* move slabp to correct slabp list: */ - list_del(&slab->slab_list); - if (slab->active == cachep->num) { - list_add(&slab->slab_list, &n->slabs_full); - if (OBJFREELIST_SLAB(cachep)) { -#if DEBUG - /* Poisoning will be done without holding the lock */ - if (cachep->flags & SLAB_POISON) { - void **objp = slab->freelist; - - *objp = *list; - *list = objp; - } -#endif - slab->freelist = NULL; - } - } else - list_add(&slab->slab_list, &n->slabs_partial); -} - -/* Try to find non-pfmemalloc slab if needed */ -static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, - struct slab *slab, bool pfmemalloc) -{ - if (!slab) - return NULL; - - if (pfmemalloc) - return slab; - - if (!slab_test_pfmemalloc(slab)) - return slab; - - /* No need to keep pfmemalloc slab if we have enough free objects */ - if (n->free_objects > n->free_limit) { - slab_clear_pfmemalloc(slab); - return slab; - } - - /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&slab->slab_list); - if (!slab->active) { - list_add_tail(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else - list_add_tail(&slab->slab_list, &n->slabs_partial); - - list_for_each_entry(slab, &n->slabs_partial, slab_list) { - if (!slab_test_pfmemalloc(slab)) - return slab; - } - - n->free_touched = 1; - list_for_each_entry(slab, &n->slabs_free, slab_list) { - if (!slab_test_pfmemalloc(slab)) { - n->free_slabs--; - return slab; - } - } - - return NULL; -} - -static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) -{ - struct slab *slab; - - assert_raw_spin_locked(&n->list_lock); - slab = list_first_entry_or_null(&n->slabs_partial, struct slab, - slab_list); - if (!slab) { - n->free_touched = 1; - slab = list_first_entry_or_null(&n->slabs_free, struct slab, - slab_list); - if (slab) - n->free_slabs--; - } - - if (sk_memalloc_socks()) - slab = get_valid_first_slab(n, slab, pfmemalloc); - - return slab; -} - -static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, - struct kmem_cache_node *n, gfp_t flags) -{ - struct slab *slab; - void *obj; - void *list = NULL; - - if (!gfp_pfmemalloc_allowed(flags)) - return NULL; - - raw_spin_lock(&n->list_lock); - slab = get_first_slab(n, true); - if (!slab) { - raw_spin_unlock(&n->list_lock); - return NULL; - } - - obj = slab_get_obj(cachep, slab); - n->free_objects--; - - fixup_slab_list(cachep, n, slab, &list); - - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - - return obj; -} - -/* - * Slab list should be fixed up by fixup_slab_list() for existing slab - * or cache_grow_end() for new slab - */ -static __always_inline int alloc_block(struct kmem_cache *cachep, - struct array_cache *ac, struct slab *slab, int batchcount) -{ - /* - * There must be at least one object available for - * allocation. - */ - BUG_ON(slab->active >= cachep->num); - - while (slab->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, slab); - } - - return batchcount; -} - -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) -{ - int batchcount; - struct kmem_cache_node *n; - struct array_cache *ac, *shared; - int node; - void *list = NULL; - struct slab *slab; - - check_irq_off(); - node = numa_mem_id(); - - ac = cpu_cache_get(cachep); - batchcount = ac->batchcount; - if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* - * If there was little recent activity on this cache, then - * perform only a partial refill. Otherwise we could generate - * refill bouncing. - */ - batchcount = BATCHREFILL_LIMIT; - } - n = get_node(cachep, node); - - BUG_ON(ac->avail > 0 || !n); - shared = READ_ONCE(n->shared); - if (!n->free_objects && (!shared || !shared->avail)) - goto direct_grow; - - raw_spin_lock(&n->list_lock); - shared = READ_ONCE(n->shared); - - /* See if we can refill from the shared array */ - if (shared && transfer_objects(ac, shared, batchcount)) { - shared->touched = 1; - goto alloc_done; - } - - while (batchcount > 0) { - /* Get slab alloc is to come from. */ - slab = get_first_slab(n, false); - if (!slab) - goto must_grow; - - check_spinlock_acquired(cachep); - - batchcount = alloc_block(cachep, ac, slab, batchcount); - fixup_slab_list(cachep, n, slab, &list); - } - -must_grow: - n->free_objects -= ac->avail; -alloc_done: - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - -direct_grow: - if (unlikely(!ac->avail)) { - /* Check if we can use obj in pfmemalloc slab */ - if (sk_memalloc_socks()) { - void *obj = cache_alloc_pfmemalloc(cachep, n, flags); - - if (obj) - return obj; - } - - slab = cache_grow_begin(cachep, gfp_exact_node(flags), node); - - /* - * cache_grow_begin() can reenable interrupts, - * then ac could change. - */ - ac = cpu_cache_get(cachep); - if (!ac->avail && slab) - alloc_block(cachep, ac, slab, batchcount); - cache_grow_end(cachep, slab); - - if (!ac->avail) - return NULL; - } - ac->touched = 1; - - return ac->entry[--ac->avail]; -} - -#if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, - gfp_t flags, void *objp, unsigned long caller) -{ - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - if (!objp || is_kfence_address(objp)) - return objp; - if (cachep->flags & SLAB_POISON) { - check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); - poison_obj(cachep, objp, POISON_INUSE); - } - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = (void *)caller; - - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || - *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); - } - *dbg_redzone1(cachep, objp) = RED_ACTIVE; - *dbg_redzone2(cachep, objp) = RED_ACTIVE; - } - - objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(objp); - if ((unsigned long)objp & (arch_slab_minalign() - 1)) { - pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp, - arch_slab_minalign()); - } - return objp; -} -#else -#define cache_alloc_debugcheck_after(a, b, objp, d) (objp) -#endif - -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - void *objp; - struct array_cache *ac; - - check_irq_off(); - - ac = cpu_cache_get(cachep); - if (likely(ac->avail)) { - ac->touched = 1; - objp = ac->entry[--ac->avail]; - - STATS_INC_ALLOCHIT(cachep); - goto out; - } - - STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); - /* - * the 'ac' may be updated by cache_alloc_refill(), - * and kmemleak_erase() requires its correct value. - */ - ac = cpu_cache_get(cachep); - -out: - /* - * To avoid a false negative, if an object that is in one of the - * per-CPU caches is leaked, we need to make sure kmemleak doesn't - * treat the array pointers as a reference to the object. - */ - if (objp) - kmemleak_erase(&ac->entry[ac->avail]); - return objp; -} - -#ifdef CONFIG_NUMA -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); - -/* - * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. - * - * If we are in_interrupt, then process context, including cpusets and - * mempolicy, may not apply and should not be used for allocation policy. - */ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - int nid_alloc, nid_here; - - if (in_interrupt() || (flags & __GFP_THISNODE)) - return NULL; - nid_alloc = nid_here = numa_mem_id(); - if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_slab_spread_node(); - else if (current->mempolicy) - nid_alloc = mempolicy_slab_node(); - if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); - return NULL; -} - -/* - * Fallback function if there was no memory available and no objects on a - * certain node and fall back is permitted. First we scan all the - * available node for available objects. If that fails then we - * perform an allocation without specifying a node. This allows the page - * allocator to do its reclaim / fallback magic. We then insert the - * slab into the proper nodelist and then allocate from it. - */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) -{ - struct zonelist *zonelist; - struct zoneref *z; - struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); - void *obj = NULL; - struct slab *slab; - int nid; - unsigned int cpuset_mems_cookie; - - if (flags & __GFP_THISNODE) - return NULL; - -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); - -retry: - /* - * Look through allowed nodes for objects available - * from existing per node queues. - */ - for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { - nid = zone_to_nid(zone); - - if (cpuset_zone_allowed(zone, flags) && - get_node(cache, nid) && - get_node(cache, nid)->free_objects) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (obj) - break; - } - } - - if (!obj) { - /* - * This allocation will be performed within the constraints - * of the current cpuset / memory policy requirements. - * We may trigger various forms of reclaim on the allowed - * set and go into memory reserves if necessary. - */ - slab = cache_grow_begin(cache, flags, numa_mem_id()); - cache_grow_end(cache, slab); - if (slab) { - nid = slab_nid(slab); - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - - /* - * Another processor may allocate the objects in - * the slab since we are not holding any locks. - */ - if (!obj) - goto retry; - } - } - - if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return obj; -} - -/* - * An interface to enable slab creation on nodeid - */ -static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) -{ - struct slab *slab; - struct kmem_cache_node *n; - void *obj = NULL; - void *list = NULL; - - VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); - n = get_node(cachep, nodeid); - BUG_ON(!n); - - check_irq_off(); - raw_spin_lock(&n->list_lock); - slab = get_first_slab(n, false); - if (!slab) - goto must_grow; - - check_spinlock_acquired_node(cachep, nodeid); - - STATS_INC_NODEALLOCS(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - BUG_ON(slab->active == cachep->num); - - obj = slab_get_obj(cachep, slab); - n->free_objects--; - - fixup_slab_list(cachep, n, slab, &list); - - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - return obj; - -must_grow: - raw_spin_unlock(&n->list_lock); - slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); - if (slab) { - /* This slab isn't counted yet so don't update free_objects */ - obj = slab_get_obj(cachep, slab); - } - cache_grow_end(cachep, slab); - - return obj ? obj : fallback_alloc(cachep, flags); -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) -{ - void *objp = NULL; - int slab_node = numa_mem_id(); - - if (nodeid == NUMA_NO_NODE) { - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cachep, flags); - if (objp) - goto out; - } - /* - * Use the locally cached objects if possible. - * However ____cache_alloc does not allow fallback - * to other nodes. It may fail while we still have - * objects on other nodes available. - */ - objp = ____cache_alloc(cachep, flags); - nodeid = slab_node; - } else if (nodeid == slab_node) { - objp = ____cache_alloc(cachep, flags); - } else if (!get_node(cachep, nodeid)) { - /* Node not bootstrapped yet */ - objp = fallback_alloc(cachep, flags); - goto out; - } - - /* - * We may just have run out of memory on the local node. - * ____cache_alloc_node() knows how to locate memory on other nodes - */ - if (!objp) - objp = ____cache_alloc_node(cachep, flags, nodeid); -out: - return objp; -} -#else - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) -{ - return ____cache_alloc(cachep, flags); -} - -#endif /* CONFIG_NUMA */ - -static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - int nodeid, size_t orig_size, unsigned long caller) -{ - unsigned long save_flags; - void *objp; - struct obj_cgroup *objcg = NULL; - bool init = false; - - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - objp = kfence_alloc(cachep, orig_size, flags); - if (unlikely(objp)) - goto out; - - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags, nodeid); - local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - prefetchw(objp); - init = slab_want_init_on_alloc(flags, cachep); - -out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, - cachep->object_size); - return objp; -} - -static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) -{ - return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, - caller); -} - -/* - * Caller needs to acquire correct kmem_cache_node's list_lock - * @list: List of detached free slabs should be freed by caller - */ -static void free_block(struct kmem_cache *cachep, void **objpp, - int nr_objects, int node, struct list_head *list) -{ - int i; - struct kmem_cache_node *n = get_node(cachep, node); - struct slab *slab; - - n->free_objects += nr_objects; - - for (i = 0; i < nr_objects; i++) { - void *objp; - struct slab *slab; - - objp = objpp[i]; - - slab = virt_to_slab(objp); - list_del(&slab->slab_list); - check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, slab, objp); - STATS_DEC_ACTIVE(cachep); - - /* fixup slab chains */ - if (slab->active == 0) { - list_add(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else { - /* Unconditionally move a slab to the end of the - * partial list on free - maximum time for the - * other objects to be freed, too. - */ - list_add_tail(&slab->slab_list, &n->slabs_partial); - } - } - - while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { - n->free_objects -= cachep->num; - - slab = list_last_entry(&n->slabs_free, struct slab, slab_list); - list_move(&slab->slab_list, list); - n->free_slabs--; - n->total_slabs--; - } -} - -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) -{ - int batchcount; - struct kmem_cache_node *n; - int node = numa_mem_id(); - LIST_HEAD(list); - - batchcount = ac->batchcount; - - check_irq_off(); - n = get_node(cachep, node); - raw_spin_lock(&n->list_lock); - if (n->shared) { - struct array_cache *shared_array = n->shared; - int max = shared_array->limit - shared_array->avail; - if (max) { - if (batchcount > max) - batchcount = max; - memcpy(&(shared_array->entry[shared_array->avail]), - ac->entry, sizeof(void *) * batchcount); - shared_array->avail += batchcount; - goto free_done; - } - } - - free_block(cachep, ac->entry, batchcount, node, &list); -free_done: -#if STATS - { - int i = 0; - struct slab *slab; - - list_for_each_entry(slab, &n->slabs_free, slab_list) { - BUG_ON(slab->active); - - i++; - } - STATS_SET_FREEABLE(cachep, i); - } -#endif - raw_spin_unlock(&n->list_lock); - ac->avail -= batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); - slabs_destroy(cachep, &list); -} - -/* - * Release an obj back to its cache. If the obj has a constructed state, it must - * be in this state _before_ it is released. Called with disabled ints. - */ -static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - bool init; - - memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); - - if (is_kfence_address(objp)) { - kmemleak_free_recursive(objp, cachep->flags); - __kfence_free(objp); - return; - } - - /* - * As memory initialization might be integrated into KASAN, - * kasan_slab_free and initialization memset must be - * kept together to avoid discrepancies in behavior. - */ - init = slab_want_init_on_free(cachep); - if (init && !kasan_has_integrated_init()) - memset(objp, 0, cachep->object_size); - /* KASAN might put objp into memory quarantine, delaying its reuse. */ - if (kasan_slab_free(cachep, objp, init)) - return; - - /* Use KCSAN to help debug racy use-after-free. */ - if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU)) - __kcsan_check_access(objp, cachep->object_size, - KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); - - ___cache_free(cachep, objp, caller); -} - -void ___cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - struct array_cache *ac = cpu_cache_get(cachep); - - check_irq_off(); - kmemleak_free_recursive(objp, cachep->flags); - objp = cache_free_debugcheck(cachep, objp, caller); - - /* - * Skip calling cache_free_alien() when the platform is not numa. - * This will avoid cache misses that happen while accessing slabp (which - * is per page memory reference) to get nodeid. Instead use a global - * variable to skip the call, which is mostly likely to be present in - * the cache. - */ - if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) - return; - - if (ac->avail < ac->limit) { - STATS_INC_FREEHIT(cachep); - } else { - STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); - } - - if (sk_memalloc_socks()) { - struct slab *slab = virt_to_slab(objp); - - if (unlikely(slab_test_pfmemalloc(slab))) { - cache_free_pfmemalloc(cachep, slab, objp); - return; - } - } - - __free_one(ac, objp); -} - -static __always_inline -void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) -{ - void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); - - return ret; -} - -void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - return __kmem_cache_alloc_lru(cachep, NULL, flags); -} -EXPORT_SYMBOL(kmem_cache_alloc); - -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) -{ - return __kmem_cache_alloc_lru(cachep, lru, flags); -} -EXPORT_SYMBOL(kmem_cache_alloc_lru); - -static __always_inline void -cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, unsigned long caller) -{ - size_t i; - - for (i = 0; i < size; i++) - p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); -} - -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) -{ - struct obj_cgroup *objcg = NULL; - unsigned long irqflags; - size_t i; - - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (!s) - return 0; - - local_irq_save(irqflags); - for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: - __do_cache_alloc(s, flags, NUMA_NO_NODE); - - if (unlikely(!objp)) - goto error; - p[i] = objp; - } - local_irq_restore(irqflags); - - cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); - - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled section. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); - /* FIXME: Trace call missing. Christoph would like a bulk variant */ - return size; -error: - local_irq_restore(irqflags); - cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); - return 0; -} -EXPORT_SYMBOL(kmem_cache_alloc_bulk); - -void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) -{ - void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); - - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node); - -void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid, size_t orig_size, - unsigned long caller) -{ - return slab_alloc_node(cachep, NULL, flags, nodeid, - orig_size, caller); -} - -#ifdef CONFIG_PRINTK -void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) -{ - struct kmem_cache *cachep; - unsigned int objnr; - void *objp; - - kpp->kp_ptr = object; - kpp->kp_slab = slab; - cachep = slab->slab_cache; - kpp->kp_slab_cache = cachep; - objp = object - obj_offset(cachep); - kpp->kp_data_offset = obj_offset(cachep); - slab = virt_to_slab(objp); - objnr = obj_to_index(cachep, slab, objp); - objp = index_to_obj(cachep, slab, objnr); - kpp->kp_objp = objp; - if (DEBUG && cachep->flags & SLAB_STORE_USER) - kpp->kp_ret = *dbg_userword(cachep, objp); -} -#endif - -static __always_inline -void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - unsigned long flags; - - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, caller); - local_irq_restore(flags); -} - -void __kmem_cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - __do_kmem_cache_free(cachep, objp, caller); -} - -void kmem_cache_free(struct kmem_cache *cachep, void *objp) -{ - cachep = cache_from_obj(cachep, objp); - if (!cachep) - return; - - trace_kmem_cache_free(_RET_IP_, objp, cachep); - __do_kmem_cache_free(cachep, objp, _RET_IP_); -} -EXPORT_SYMBOL(kmem_cache_free); - -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) -{ - unsigned long flags; - - local_irq_save(flags); - for (int i = 0; i < size; i++) { - void *objp = p[i]; - struct kmem_cache *s; - - if (!orig_s) { - struct folio *folio = virt_to_folio(objp); - - /* called via kfree_bulk */ - if (!folio_test_slab(folio)) { - local_irq_restore(flags); - free_large_kmalloc(folio, objp); - local_irq_save(flags); - continue; - } - s = folio_slab(folio)->slab_cache; - } else { - s = cache_from_obj(orig_s, objp); - } - - if (!s) - continue; - - debug_check_no_locks_freed(objp, s->object_size); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, s->object_size); - - __cache_free(s, objp, _RET_IP_); - } - local_irq_restore(flags); - - /* FIXME: add tracing */ -} -EXPORT_SYMBOL(kmem_cache_free_bulk); - -/* - * This initializes kmem_cache_node or resizes various caches for all nodes. - */ -static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) -{ - int ret; - int node; - struct kmem_cache_node *n; - - for_each_online_node(node) { - ret = setup_kmem_cache_node(cachep, node, gfp, true); - if (ret) - goto fail; - - } - - return 0; - -fail: - if (!cachep->list.next) { - /* Cache is not active yet. Roll back what we did */ - node--; - while (node >= 0) { - n = get_node(cachep, node); - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - cachep->node[node] = NULL; - } - node--; - } - } - return -ENOMEM; -} - -/* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, - int batchcount, int shared, gfp_t gfp) -{ - struct array_cache __percpu *cpu_cache, *prev; - int cpu; - - cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); - if (!cpu_cache) - return -ENOMEM; - - prev = cachep->cpu_cache; - cachep->cpu_cache = cpu_cache; - /* - * Without a previous cpu_cache there's no need to synchronize remote - * cpus, so skip the IPIs. - */ - if (prev) - kick_all_cpus_sync(); - - check_irq_on(); - cachep->batchcount = batchcount; - cachep->limit = limit; - cachep->shared = shared; - - if (!prev) - goto setup_node; - - for_each_online_cpu(cpu) { - LIST_HEAD(list); - int node; - struct kmem_cache_node *n; - struct array_cache *ac = per_cpu_ptr(prev, cpu); - - node = cpu_to_mem(cpu); - n = get_node(cachep, node); - raw_spin_lock_irq(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node, &list); - raw_spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - } - free_percpu(prev); - -setup_node: - return setup_kmem_cache_nodes(cachep, gfp); -} - -/* Called with slab_mutex held always */ -static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) -{ - int err; - int limit = 0; - int shared = 0; - int batchcount = 0; - - err = cache_random_seq_create(cachep, cachep->num, gfp); - if (err) - goto end; - - /* - * The head array serves three purposes: - * - create a LIFO ordering, i.e. return objects that are cache-warm - * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and - * bufctl chains: array operations are cheaper. - * The numbers are guessed, we should auto-tune as described by - * Bonwick. - */ - if (cachep->size > 131072) - limit = 1; - else if (cachep->size > PAGE_SIZE) - limit = 8; - else if (cachep->size > 1024) - limit = 24; - else if (cachep->size > 256) - limit = 54; - else - limit = 120; - - /* - * CPU bound tasks (e.g. network routing) can exhibit cpu bound - * allocation behaviour: Most allocs on one cpu, most free operations - * on another cpu. For these cases, an efficient object passing between - * cpus is necessary. This is provided by a shared array. The array - * replaces Bonwick's magazine layer. - * On uniprocessor, it's functionally equivalent (but less efficient) - * to a larger limit. Thus disabled by default. - */ - shared = 0; - if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) - shared = 8; - -#if DEBUG - /* - * With debugging enabled, large batchcount lead to excessively long - * periods with disabled local interrupts. Limit the batchcount - */ - if (limit > 32) - limit = 32; -#endif - batchcount = (limit + 1) / 2; - err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); -end: - if (err) - pr_err("enable_cpucache failed for %s, error %d\n", - cachep->name, -err); - return err; -} - -/* - * Drain an array if it contains any elements taking the node lock only if - * necessary. Note that the node listlock also protects the array_cache - * if drain_array() is used on the shared array. - */ -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int node) -{ - LIST_HEAD(list); - - /* ac from n->shared can be freed if we don't hold the slab_mutex. */ - check_mutex_acquired(); - - if (!ac || !ac->avail) - return; - - if (ac->touched) { - ac->touched = 0; - return; - } - - raw_spin_lock_irq(&n->list_lock); - drain_array_locked(cachep, ac, node, false, &list); - raw_spin_unlock_irq(&n->list_lock); - - slabs_destroy(cachep, &list); -} - -/** - * cache_reap - Reclaim memory from caches. - * @w: work descriptor - * - * Called from workqueue/eventd every few seconds. - * Purpose: - * - clear the per-cpu caches for this CPU. - * - return freeable pages to the main free memory pool. - * - * If we cannot acquire the cache chain mutex then just give up - we'll try - * again on the next iteration. - */ -static void cache_reap(struct work_struct *w) -{ - struct kmem_cache *searchp; - struct kmem_cache_node *n; - int node = numa_mem_id(); - struct delayed_work *work = to_delayed_work(w); - - if (!mutex_trylock(&slab_mutex)) - /* Give up. Setup the next iteration. */ - goto out; - - list_for_each_entry(searchp, &slab_caches, list) { - check_irq_on(); - - /* - * We only take the node lock if absolutely necessary and we - * have established with reasonable certainty that - * we can do some work if the lock was obtained. - */ - n = get_node(searchp, node); - - reap_alien(searchp, n); - - drain_array(searchp, n, cpu_cache_get(searchp), node); - - /* - * These are racy checks but it does not matter - * if we skip one check or scan twice. - */ - if (time_after(n->next_reap, jiffies)) - goto next; - - n->next_reap = jiffies + REAPTIMEOUT_NODE; - - drain_array(searchp, n, n->shared, node); - - if (n->free_touched) - n->free_touched = 0; - else { - int freed; - - freed = drain_freelist(searchp, n, (n->free_limit + - 5 * searchp->num - 1) / (5 * searchp->num)); - STATS_ADD_REAPED(searchp, freed); - } -next: - cond_resched(); - } - check_irq_on(); - mutex_unlock(&slab_mutex); - next_reap_node(); -out: - /* Set up the next iteration */ - schedule_delayed_work_on(smp_processor_id(), work, - round_jiffies_relative(REAPTIMEOUT_AC)); -} - -void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) -{ - unsigned long active_objs, num_objs, active_slabs; - unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; - unsigned long free_slabs = 0; - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - raw_spin_lock_irq(&n->list_lock); - - total_slabs += n->total_slabs; - free_slabs += n->free_slabs; - free_objs += n->free_objects; - - if (n->shared) - shared_avail += n->shared->avail; - - raw_spin_unlock_irq(&n->list_lock); - } - num_objs = total_slabs * cachep->num; - active_slabs = total_slabs - free_slabs; - active_objs = num_objs - free_objs; - - sinfo->active_objs = active_objs; - sinfo->num_objs = num_objs; - sinfo->active_slabs = active_slabs; - sinfo->num_slabs = total_slabs; - sinfo->shared_avail = shared_avail; - sinfo->limit = cachep->limit; - sinfo->batchcount = cachep->batchcount; - sinfo->shared = cachep->shared; - sinfo->objects_per_slab = cachep->num; - sinfo->cache_order = cachep->gfporder; -} - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) -{ -#if STATS - { /* node stats */ - unsigned long high = cachep->high_mark; - unsigned long allocs = cachep->num_allocations; - unsigned long grown = cachep->grown; - unsigned long reaped = cachep->reaped; - unsigned long errors = cachep->errors; - unsigned long max_freeable = cachep->max_freeable; - unsigned long node_allocs = cachep->node_allocs; - unsigned long node_frees = cachep->node_frees; - unsigned long overflows = cachep->node_overflow; - - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", - allocs, high, grown, - reaped, errors, max_freeable, node_allocs, - node_frees, overflows); - } - /* cpu stats */ - { - unsigned long allochit = atomic_read(&cachep->allochit); - unsigned long allocmiss = atomic_read(&cachep->allocmiss); - unsigned long freehit = atomic_read(&cachep->freehit); - unsigned long freemiss = atomic_read(&cachep->freemiss); - - seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", - allochit, allocmiss, freehit, freemiss); - } -#endif -} - -#define MAX_SLABINFO_WRITE 128 -/** - * slabinfo_write - Tuning for the slab allocator - * @file: unused - * @buffer: user buffer - * @count: data length - * @ppos: unused - * - * Return: %0 on success, negative error code otherwise. - */ -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; - int limit, batchcount, shared, res; - struct kmem_cache *cachep; - - if (count > MAX_SLABINFO_WRITE) - return -EINVAL; - if (copy_from_user(&kbuf, buffer, count)) - return -EFAULT; - kbuf[MAX_SLABINFO_WRITE] = '\0'; - - tmp = strchr(kbuf, ' '); - if (!tmp) - return -EINVAL; - *tmp = '\0'; - tmp++; - if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) - return -EINVAL; - - /* Find the cache in the chain of caches. */ - mutex_lock(&slab_mutex); - res = -EINVAL; - list_for_each_entry(cachep, &slab_caches, list) { - if (!strcmp(cachep->name, kbuf)) { - if (limit < 1 || batchcount < 1 || - batchcount > limit || shared < 0) { - res = 0; - } else { - res = do_tune_cpucache(cachep, limit, - batchcount, shared, - GFP_KERNEL); - } - break; - } - } - mutex_unlock(&slab_mutex); - if (res >= 0) - res = count; - return res; -} - -#ifdef CONFIG_HARDENED_USERCOPY -/* - * Rejects incorrectly sized objects and objects that are to be copied - * to/from userspace but do not fall entirely within the containing slab - * cache's usercopy region. - * - * Returns NULL if check passes, otherwise const char * to name of cache - * to indicate an error. - */ -void __check_heap_object(const void *ptr, unsigned long n, - const struct slab *slab, bool to_user) -{ - struct kmem_cache *cachep; - unsigned int objnr; - unsigned long offset; - - ptr = kasan_reset_tag(ptr); - - /* Find and validate object. */ - cachep = slab->slab_cache; - objnr = obj_to_index(cachep, slab, (void *)ptr); - BUG_ON(objnr >= cachep->num); - - /* Find offset within object. */ - if (is_kfence_address(ptr)) - offset = ptr - kfence_object_start(ptr); - else - offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep); - - /* Allow address range falling entirely within usercopy region. */ - if (offset >= cachep->useroffset && - offset - cachep->useroffset <= cachep->usersize && - n <= cachep->useroffset - offset + cachep->usersize) - return; - - usercopy_abort("SLAB object", cachep->name, to_user, offset, n); -} -#endif /* CONFIG_HARDENED_USERCOPY */ From 7ef08ae8277c66657127844179912214c67fb4bc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:54:15 +0200 Subject: [PATCH 0280/1562] mm/slab: move struct kmem_cache_cpu declaration to slub.c Nothing outside SLUB itself accesses the struct kmem_cache_cpu fields so it does not need to be declared in slub_def.h. This allows also to move enum stat_item. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 54 ---------------------------------------- mm/slub.c | 54 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index deb90cf4bffb..a0229ea42977 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -12,60 +12,6 @@ #include #include -enum stat_item { - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ - FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ - FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ - FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ - ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ - CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ - NR_SLUB_STAT_ITEMS -}; - -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - union { - struct { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_aba_t freelist_tid; - }; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ -#endif - local_lock_t lock; /* Protects the fields above */ -#ifdef CONFIG_SLUB_STATS - unsigned stat[NR_SLUB_STAT_ITEMS]; -#endif -}; -#endif /* CONFIG_SLUB_TINY */ - #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) diff --git a/mm/slub.c b/mm/slub.c index 3e01731783df..979932d046fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -330,6 +330,60 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum stat_item { + ALLOC_FASTPATH, /* Allocation from cpu slab */ + ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_FASTPATH, /* Free to cpu slab */ + FREE_SLOWPATH, /* Freeing not to cpu slab */ + FREE_FROZEN, /* Freeing to frozen slab */ + FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ + FREE_REMOVE_PARTIAL, /* Freeing removes last object */ + ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ + ALLOC_SLAB, /* Cpu slab acquired from page allocator */ + ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + FREE_SLAB, /* Slab freed to the page allocator */ + CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ + DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ + DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ + DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ + DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ + DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ + ORDER_FALLBACK, /* Number of times fallback was necessary */ + CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* Refill cpu partial on free */ + CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + NR_SLUB_STAT_ITEMS +}; + +#ifndef CONFIG_SLUB_TINY +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ +struct kmem_cache_cpu { + union { + struct { + void **freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ + }; + freelist_aba_t freelist_tid; + }; + struct slab *slab; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct slab *partial; /* Partially allocated frozen slabs */ +#endif + local_lock_t lock; /* Protects the fields above */ +#ifdef CONFIG_SLUB_STATS + unsigned int stat[NR_SLUB_STAT_ITEMS]; +#endif +}; +#endif /* CONFIG_SLUB_TINY */ + static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS From 19975f83412fbb9b1458f3dfbf16ca043a57788a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:59:48 +0200 Subject: [PATCH 0281/1562] mm/slab: move the rest of slub_def.h to mm/slab.h mm/slab.h is the only place to include include/linux/slub_def.h which has allowed switching between SLAB and SLUB. Now we can simply move the contents over and remove slub_def.h. Use this opportunity to fix up some whitespace (alignment) issues. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 150 --------------------------------------- mm/slab.h | 138 ++++++++++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 151 deletions(-) delete mode 100644 include/linux/slub_def.h diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h deleted file mode 100644 index a0229ea42977..000000000000 --- a/include/linux/slub_def.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLUB_DEF_H -#define _LINUX_SLUB_DEF_H - -/* - * SLUB : A Slab allocator without object queues. - * - * (C) 2007 SGI, Christoph Lameter - */ -#include -#include -#include -#include - -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - -/* - * Word size structure that can be atomically updated or read and that - * contains both the order and the number of objects that a slab of the - * given order would contain. - */ -struct kmem_cache_order_objects { - unsigned int x; -}; - -/* - * Slab cache management. - */ -struct kmem_cache { -#ifndef CONFIG_SLUB_TINY - struct kmem_cache_cpu __percpu *cpu_slab; -#endif - /* Used for retrieving partial slabs, etc. */ - slab_flags_t flags; - unsigned long min_partial; - unsigned int size; /* The size of an object including metadata */ - unsigned int object_size;/* The size of an object without metadata */ - struct reciprocal_value reciprocal_size; - unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif - struct kmem_cache_order_objects oo; - - /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects min; - gfp_t allocflags; /* gfp flags to use on each alloc */ - int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(void *); - unsigned int inuse; /* Offset to metadata */ - unsigned int align; /* Alignment */ - unsigned int red_left_pad; /* Left redzone padding size */ - const char *name; /* Name (only for display!) */ - struct list_head list; /* List of slab caches */ -#ifdef CONFIG_SYSFS - struct kobject kobj; /* For sysfs */ -#endif -#ifdef CONFIG_SLAB_FREELIST_HARDENED - unsigned long random; -#endif - -#ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. - */ - unsigned int remote_node_defrag_ratio; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) -#define SLAB_SUPPORTS_SYSFS -void sysfs_slab_unlink(struct kmem_cache *); -void sysfs_slab_release(struct kmem_cache *); -#else -static inline void sysfs_slab_unlink(struct kmem_cache *s) -{ -} -static inline void sysfs_slab_release(struct kmem_cache *s) -{ -} -#endif - -void *fixup_red_left(struct kmem_cache *s, void *p); - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) { - void *object = x - (x - slab_address(slab)) % cache->size; - void *last_object = slab_address(slab) + - (slab->objects - 1) * cache->size; - void *result = (unlikely(object > last_object)) ? last_object : object; - - result = fixup_red_left(cache, result); - return result; -} - -/* Determine object index from a given position */ -static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) -{ - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); -} - -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - if (is_kfence_address(obj)) - return 0; - return __obj_to_index(cache, slab_address(slab), obj); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - return slab->objects; -} -#endif /* _LINUX_SLUB_DEF_H */ diff --git a/mm/slab.h b/mm/slab.h index 014c36ea51fa..3a8d13c099fa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -209,7 +209,143 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#include +#include +#include +#include +#include + +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_percpu_partial(c) ((c)->partial) + +#define slub_set_percpu_partial(c, p) \ +({ \ + slub_percpu_partial(c) = (p)->next; \ +}) + +#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) +#else +#define slub_percpu_partial(c) NULL + +#define slub_set_percpu_partial(c, p) + +#define slub_percpu_partial_read_once(c) NULL +#endif // CONFIG_SLUB_CPU_PARTIAL + +/* + * Word size structure that can be atomically updated or read and that + * contains both the order and the number of objects that a slab of the + * given order would contain. + */ +struct kmem_cache_order_objects { + unsigned int x; +}; + +/* + * Slab cache management. + */ +struct kmem_cache { +#ifndef CONFIG_SLUB_TINY + struct kmem_cache_cpu __percpu *cpu_slab; +#endif + /* Used for retrieving partial slabs, etc. */ + slab_flags_t flags; + unsigned long min_partial; + unsigned int size; /* Object size including metadata */ + unsigned int object_size; /* Object size without metadata */ + struct reciprocal_value reciprocal_size; + unsigned int offset; /* Free pointer offset */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* Number of per cpu partial objects to keep around */ + unsigned int cpu_partial; + /* Number of per cpu partial slabs to keep around */ + unsigned int cpu_partial_slabs; +#endif + struct kmem_cache_order_objects oo; + + /* Allocation and freeing of slabs */ + struct kmem_cache_order_objects min; + gfp_t allocflags; /* gfp flags to use on each alloc */ + int refcount; /* Refcount for slab cache destroy */ + void (*ctor)(void *object); /* Object constructor */ + unsigned int inuse; /* Offset to metadata */ + unsigned int align; /* Alignment */ + unsigned int red_left_pad; /* Left redzone padding size */ + const char *name; /* Name (only for display!) */ + struct list_head list; /* List of slab caches */ +#ifdef CONFIG_SYSFS + struct kobject kobj; /* For sysfs */ +#endif +#ifdef CONFIG_SLAB_FREELIST_HARDENED + unsigned long random; +#endif + +#ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. + */ + unsigned int remote_node_defrag_ratio; +#endif + +#ifdef CONFIG_SLAB_FREELIST_RANDOM + unsigned int *random_seq; +#endif + +#ifdef CONFIG_KASAN_GENERIC + struct kasan_cache kasan_info; +#endif + +#ifdef CONFIG_HARDENED_USERCOPY + unsigned int useroffset; /* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ +#endif + + struct kmem_cache_node *node[MAX_NUMNODES]; +}; + +#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) +#define SLAB_SUPPORTS_SYSFS +void sysfs_slab_unlink(struct kmem_cache *s); +void sysfs_slab_release(struct kmem_cache *s); +#else +static inline void sysfs_slab_unlink(struct kmem_cache *s) { } +static inline void sysfs_slab_release(struct kmem_cache *s) { } +#endif + +void *fixup_red_left(struct kmem_cache *s, void *p); + +static inline void *nearest_obj(struct kmem_cache *cache, + const struct slab *slab, void *x) +{ + void *object = x - (x - slab_address(slab)) % cache->size; + void *last_object = slab_address(slab) + + (slab->objects - 1) * cache->size; + void *result = (unlikely(object > last_object)) ? last_object : object; + + result = fixup_red_left(cache, result); + return result; +} + +/* Determine object index from a given position */ +static inline unsigned int __obj_to_index(const struct kmem_cache *cache, + void *addr, void *obj) +{ + return reciprocal_divide(kasan_reset_tag(obj) - addr, + cache->reciprocal_size); +} + +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct slab *slab, void *obj) +{ + if (is_kfence_address(obj)) + return 0; + return __obj_to_index(cache, slab_address(slab), obj); +} + +static inline int objs_per_slab(const struct kmem_cache *cache, + const struct slab *slab) +{ + return slab->objects; +} #include #include From 89c2d061bfa7fe2b5bcb1393a7a79bb5db8d4140 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 11:15:54 +0200 Subject: [PATCH 0282/1562] mm/slab: consolidate includes in the internal mm/slab.h The #include's are scattered at several places of the file, but it does not seem this is needed to prevent any include loops (anymore?) so consolidate them at the top. Also move the misplaced kmem_cache_init() declaration away from the top. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 3a8d13c099fa..1ac3a2f8d4c0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -1,10 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MM_SLAB_H #define MM_SLAB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + /* * Internal slab definitions */ -void __init kmem_cache_init(void); #ifdef CONFIG_64BIT # ifdef system_has_cmpxchg128 @@ -209,11 +221,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#include -#include -#include -#include - #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) @@ -347,14 +354,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache, return slab->objects; } -#include -#include -#include -#include -#include -#include -#include - /* * State of the slab allocator. * @@ -405,6 +404,7 @@ gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +void __init kmem_cache_init(void); void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, From 6011be59910fb12b757f9d37793d21763268b4a1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 11:57:45 +0200 Subject: [PATCH 0283/1562] mm/slab: move pre/post-alloc hooks from slab.h to slub.c We don't share the hooks between two slab implementations anymore so they can be moved away from the header. As part of the move, also move should_failslab() from slab_common.c as the pre_alloc hook uses it. This means slab.h can stop including fault-inject.h and kmemleak.h. Fix up some files that were depending on the includes transitively. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kasan/report.c | 1 + mm/memcontrol.c | 1 + mm/slab.h | 72 ----------------------------------------- mm/slab_common.c | 8 +---- mm/slub.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 79 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e77facb62900..011f727bfaff 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 947fb50eba31..8a0603517065 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -64,6 +64,7 @@ #include #include #include +#include #include "internal.h" #include #include diff --git a/mm/slab.h b/mm/slab.h index 1ac3a2f8d4c0..65ebf86b3fe9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -9,8 +9,6 @@ #include #include #include -#include -#include #include #include @@ -796,76 +794,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) -{ - flags &= gfp_allowed_mask; - - might_alloc(flags); - - if (should_failslab(s, flags)) - return NULL; - - if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) - return NULL; - - return s; -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init, - unsigned int orig_size) -{ - unsigned int zero_size = s->object_size; - bool kasan_init = init; - size_t i; - - flags &= gfp_allowed_mask; - - /* - * For kmalloc object, the allocated memory size(object_size) is likely - * larger than the requested size(orig_size). If redzone check is - * enabled for the extra space, don't zero it, as it will be redzoned - * soon. The redzone operation for this extra space could be seen as a - * replacement of current poisoning under certain debug option, and - * won't break other sanity checks. - */ - if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && - (s->flags & SLAB_KMALLOC)) - zero_size = orig_size; - - /* - * When slub_debug is enabled, avoid memory initialization integrated - * into KASAN and instead zero out the memory via the memset below with - * the proper size. Otherwise, KASAN might overwrite SLUB redzones and - * cause false-positive reports. This does not lead to a performance - * penalty on production builds, as slub_debug is not intended to be - * enabled there. - */ - if (__slub_debug_enabled()) - kasan_init = false; - - /* - * As memory initialization might be integrated into KASAN, - * kasan_slab_alloc and initialization memset must be - * kept together to avoid discrepancies in behavior. - * - * As p[i] might get tagged, memset and kmemleak hook come after KASAN. - */ - for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); - if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) - memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); - } - - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -} /* * The slab lists for all objects. diff --git a/mm/slab_common.c b/mm/slab_common.c index 63b8411db7ce..bbc2e3f061f1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1470,10 +1471,3 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -int should_failslab(struct kmem_cache *s, gfp_t gfpflags) -{ - if (__should_failslab(s, gfpflags)) - return -ENOMEM; - return 0; -} -ALLOW_ERROR_INJECTION(should_failslab, ERRNO); diff --git a/mm/slub.c b/mm/slub.c index 979932d046fd..9eb6508152c2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -3494,6 +3495,86 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 0, sizeof(void *)); } +noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (should_failslab(s, flags)) + return NULL; + + if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + return NULL; + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init, + unsigned int orig_size) +{ + unsigned int zero_size = s->object_size; + bool kasan_init = init; + size_t i; + + flags &= gfp_allowed_mask; + + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* + * When slub_debug is enabled, avoid memory initialization integrated + * into KASAN and instead zero out the memory via the memset below with + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and + * cause false-positive reports. This does not lead to a performance + * penalty on production builds, as slub_debug is not intended to be + * enabled there. + */ + if (__slub_debug_enabled()) + kasan_init = false; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); + if (p[i] && init && (!kasan_init || + !kasan_has_integrated_init())) + memset(p[i], 0, zero_size); + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + /* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call From 0bedcc66d2a43a50ab660273842f4737a293dd8a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 14:52:47 +0200 Subject: [PATCH 0284/1562] mm/slab: move memcg related functions from slab.h to slub.c We don't share those between SLAB and SLUB anymore, so most memcg related functions can be moved to slub.c proper. Reviewed-by: Kees Cook Acked-by: Michal Hocko Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 206 ------------------------------------------------------ mm/slub.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 206 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 65ebf86b3fe9..a81ef7c9282d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -486,12 +486,6 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); -static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) -{ - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; -} - #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -551,220 +545,20 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr); - -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; -} - -static inline size_t obj_full_size(struct kmem_cache *s) -{ - /* - * For each accounted object there is an extra space which is used - * to store obj_cgroup membership. Charge it too. - */ - return s->size + sizeof(struct obj_cgroup *); -} - -/* - * Returns false if the allocation should fail. - */ -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - struct obj_cgroup *objcg; - - if (!memcg_kmem_online()) - return true; - - if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) - return true; - - /* - * The obtained objcg pointer is safe to use within the current scope, - * defined by current task or set_active_memcg() pair. - * obj_cgroup_get() is used to get a permanent reference. - */ - objcg = current_obj_cgroup(); - if (!objcg) - return true; - - if (lru) { - int ret; - struct mem_cgroup *memcg; - - memcg = get_mem_cgroup_from_objcg(objcg); - ret = memcg_list_lru_alloc(memcg, lru, flags); - css_put(&memcg->css); - - if (ret) - return false; - } - - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - return false; - - *objcgp = objcg; - return true; -} - -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) -{ - struct slab *slab; - unsigned long off; - size_t i; - - if (!memcg_kmem_online() || !objcg) - return; - - for (i = 0; i < size; i++) { - if (likely(p[i])) { - slab = virt_to_slab(p[i]); - - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, - false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } - - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - } - } -} - -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) -{ - struct obj_cgroup **objcgs; - int i; - - if (!memcg_kmem_online()) - return; - - objcgs = slab_objcgs(slab); - if (!objcgs) - return; - - for (i = 0; i < objects; i++) { - struct obj_cgroup *objcg; - unsigned int off; - - off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; - if (!objcg) - continue; - - objcgs[off] = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); - obj_cgroup_put(objcg); - } -} - #else /* CONFIG_MEMCG_KMEM */ static inline struct obj_cgroup **slab_objcgs(struct slab *slab) { return NULL; } -static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) -{ - return NULL; -} - static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { return 0; } - -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ -} - -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - return true; -} - -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) -{ -} - -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ - struct slab *slab; - - slab = virt_to_slab(obj); - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", - __func__)) - return NULL; - return slab->slab_cache; -} - -static __always_inline void account_slab(struct slab *slab, int order, - struct kmem_cache *s, gfp_t gfp) -{ - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - PAGE_SIZE << order); -} - -static __always_inline void unaccount_slab(struct slab *slab, int order, - struct kmem_cache *s) -{ - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - -(PAGE_SIZE << order)); -} - -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; - - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) - return s; - - cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) - print_tracking(cachep, x); - return cachep; -} - void free_large_kmalloc(struct folio *folio, void *object); size_t __ksize(const void *objp); diff --git a/mm/slub.c b/mm/slub.c index 9eb6508152c2..844e0beb84ee 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1814,6 +1814,165 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif #endif /* CONFIG_SLUB_DEBUG */ +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; +} + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; +} + +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +/* + * Returns false if the allocation should fail. + */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_online()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); + if (!objcg) + return true; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + return false; + } + + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) + return false; + + *objcgp = objcg; + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ + struct slab *slab; + unsigned long off; + size_t i; + + if (!memcg_kmem_online() || !objcg) + return; + + for (i = 0; i < size; i++) { + if (likely(p[i])) { + slab = virt_to_slab(p[i]); + + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } else { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + } + } +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ + struct obj_cgroup **objcgs; + int i; + + if (!memcg_kmem_online()) + return; + + objcgs = slab_objcgs(slab); + if (!objcgs) + return; + + for (i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = objcgs[off]; + if (!objcg) + continue; + + objcgs[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} +#else /* CONFIG_MEMCG_KMEM */ +static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) +{ + return NULL; +} + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ +} + +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. @@ -2048,6 +2207,26 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_slab_cgroups(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + if (memcg_kmem_online()) + memcg_free_slab_cgroups(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct slab *slab; @@ -3965,6 +4144,32 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) + return NULL; + return slab->slab_cache; +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) { slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); From b52ef56e9b324b172053b03d8c775ef4708fbc23 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 14:57:49 +0200 Subject: [PATCH 0285/1562] mm/slab: move struct kmem_cache_node from slab.h to slub.c The declaration and associated helpers are not used anywhere else anymore. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 29 ----------------------------- mm/slub.c | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index a81ef7c9282d..5ae6a978e9c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -588,35 +588,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } - -/* - * The slab lists for all objects. - */ -struct kmem_cache_node { - spinlock_t list_lock; - unsigned long nr_partial; - struct list_head partial; -#ifdef CONFIG_SLUB_DEBUG - atomic_long_t nr_slabs; - atomic_long_t total_objects; - struct list_head full; -#endif -}; - -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - -/* - * Iterator over all nodes. The body will be executed for each node that has - * a kmem_cache_node structure allocated (which is true for all online nodes) - */ -#define for_each_kmem_cache_node(__s, __node, __n) \ - for (__node = 0; __node < nr_node_ids; __node++) \ - if ((__n = get_node(__s, __node))) - - #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slub.c b/mm/slub.c index 844e0beb84ee..cc801f8258fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -396,6 +396,33 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + /* * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily From b774d3e326d30fc8ef841101c399e44bdac2aa48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 15:27:11 +0200 Subject: [PATCH 0286/1562] mm/slab: move kfree() from slab_common.c to slub.c This should result in better code. Currently kfree() makes a function call between compilation units to __kmem_cache_free() which does its own virt_to_slab(), throwing away the struct slab pointer we already had in kfree(). Now it can be reused. Additionally kfree() can now inline the whole SLUB freeing fastpath. Also move over free_large_kmalloc() as the only callsites are now in slub.c, and make it static. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 ---- mm/slab_common.c | 45 ------------------------------------------ mm/slub.c | 51 +++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 54 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 5ae6a978e9c2..35a55c4a407d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -395,8 +395,6 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, unsigned long caller); -void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); - gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ @@ -559,8 +557,6 @@ static inline int memcg_alloc_slab_cgroups(struct slab *slab, } #endif /* CONFIG_MEMCG_KMEM */ -void free_large_kmalloc(struct folio *folio, void *object); - size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index bbc2e3f061f1..f4f275613d2a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -963,22 +963,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) slab_state = UP; } -void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kmemleak_free(object); - kasan_kfree_large(object); - kmsan_kfree_large(object); - - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) @@ -1023,35 +1007,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, } EXPORT_SYMBOL(__kmalloc_node_track_caller); -/** - * kfree - free previously allocated memory - * @object: pointer returned by kmalloc() or kmem_cache_alloc() - * - * If @object is NULL, no operation is performed. - */ -void kfree(const void *object) -{ - struct folio *folio; - struct slab *slab; - struct kmem_cache *s; - - trace_kfree(_RET_IP_, object); - - if (unlikely(ZERO_OR_NULL_PTR(object))) - return; - - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, (void *)object); - return; - } - - slab = folio_slab(folio); - s = slab->slab_cache; - __kmem_cache_free(s, (void *)object, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - /** * __ksize -- Report full size of underlying allocation * @object: pointer to the object diff --git a/mm/slub.c b/mm/slub.c index cc801f8258fe..2baa9e94d9df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4197,11 +4197,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; } -void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) -{ - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); -} - /** * kmem_cache_free - Deallocate an object * @s: The cache the allocation was from. @@ -4220,6 +4215,52 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +static void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc() or kmem_cache_alloc() + * + * If @object is NULL, no operation is performed. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_free(s, slab, x, NULL, &x, 1, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + struct detached_freelist { struct slab *slab; void *tail; From 5a9d31d980cbc9cefcee18e186bd4c5d51f3cba2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 13 Nov 2023 12:02:02 +0100 Subject: [PATCH 0287/1562] mm/slab: move kmalloc_slab() to mm/slab.h In preparation for the next patch, move the kmalloc_slab() function to the header, as it will have callers from two files, and make it inline. To avoid unnecessary bloat, remove all size checks/warnings from kmalloc_slab() as they just duplicate those in callers, especially after recent changes to kmalloc_size_roundup(). We just need to adjust handling of zero size in __do_kmalloc_node(). Also we can stop handling NULL result from kmalloc_slab() there as that now cannot happen (unless called too early during boot). The size_index array becomes visible so rename it to a more specific kmalloc_size_index. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 28 ++++++++++++++++++++++++++-- mm/slab_common.c | 43 ++++++++----------------------------------- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 35a55c4a407d..7d7cc7af614e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -389,8 +389,32 @@ extern const struct kmalloc_info_struct { void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(slab_flags_t); -/* Find the kmalloc slab corresponding for a certain size */ -struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); +extern u8 kmalloc_size_index[24]; + +static inline unsigned int size_index_elem(unsigned int bytes) +{ + return (bytes - 1) / 8; +} + +/* + * Find the kmem_cache structure that serves a given size of + * allocation + * + * This assumes size is larger than zero and not larger than + * KMALLOC_MAX_CACHE_SIZE and the caller must check that. + */ +static inline struct kmem_cache * +kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) +{ + unsigned int index; + + if (size <= 192) + index = kmalloc_size_index[size_index_elem(size)]; + else + index = fls(size - 1); + + return kmalloc_caches[kmalloc_type(flags, caller)][index]; +} void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, diff --git a/mm/slab_common.c b/mm/slab_common.c index f4f275613d2a..31ade17a7ad9 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -665,7 +665,7 @@ EXPORT_SYMBOL(random_kmalloc_seed); * of two cache sizes there. The size of larger slabs can be determined using * fls. */ -static u8 size_index[24] __ro_after_init = { +u8 kmalloc_size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ @@ -692,33 +692,6 @@ static u8 size_index[24] __ro_after_init = { 2 /* 192 */ }; -static inline unsigned int size_index_elem(unsigned int bytes) -{ - return (bytes - 1) / 8; -} - -/* - * Find the kmem_cache structure that serves a given size of - * allocation - */ -struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) -{ - unsigned int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else { - if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - index = fls(size - 1); - } - - return kmalloc_caches[kmalloc_type(flags, caller)][index]; -} - size_t kmalloc_size_roundup(size_t size) { if (size && size <= KMALLOC_MAX_CACHE_SIZE) { @@ -843,9 +816,9 @@ void __init setup_kmalloc_cache_index_table(void) for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { unsigned int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) + if (elem >= ARRAY_SIZE(kmalloc_size_index)) break; - size_index[elem] = KMALLOC_SHIFT_LOW; + kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW; } if (KMALLOC_MIN_SIZE >= 64) { @@ -854,7 +827,7 @@ void __init setup_kmalloc_cache_index_table(void) * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; + kmalloc_size_index[size_index_elem(i)] = 7; } @@ -865,7 +838,7 @@ void __init setup_kmalloc_cache_index_table(void) * instead. */ for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; + kmalloc_size_index[size_index_elem(i)] = 8; } } @@ -977,10 +950,10 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller return ret; } - s = kmalloc_slab(size, flags, caller); + if (unlikely(!size)) + return ZERO_SIZE_PTR; - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + s = kmalloc_slab(size, flags, caller); ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); From 4862caa5cba027bf7de925e05e4d1a64c89d81d6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 16:57:59 +0200 Subject: [PATCH 0288/1562] mm/slab: move kmalloc() functions from slab_common.c to slub.c This will eliminate a call between compilation units through __kmem_cache_alloc_node() and allow better inlining of the allocation fast path. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 3 -- mm/slab_common.c | 119 -------------------------------------------- mm/slub.c | 126 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 118 insertions(+), 130 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 7d7cc7af614e..54deeb0428c6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -416,9 +416,6 @@ kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) return kmalloc_caches[kmalloc_type(flags, caller)][index]; } -void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, - unsigned long caller); gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 31ade17a7ad9..238293b1dbe1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -936,50 +936,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) slab_state = UP; } -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); -static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(caller, ret, size, - PAGE_SIZE << get_order(size), flags, node); - return ret; - } - - if (unlikely(!size)) - return ZERO_SIZE_PTR; - - s = kmalloc_slab(size, flags, caller); - - ret = __kmem_cache_alloc_node(s, flags, node, size, caller); - ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc(caller, ret, size, s->size, flags, node); - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); - /** * __ksize -- Report full size of underlying allocation * @object: pointer to the object @@ -1016,30 +972,6 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) -{ - void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, - size, _RET_IP_); - - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmalloc_trace); - -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) -{ - void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); - - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmalloc_node_trace); - gfp_t kmalloc_fix_flags(gfp_t flags) { gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; @@ -1052,57 +984,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags) return flags; } -/* - * To avoid unnecessary overhead, we pass through large allocation requests - * directly to the page allocator. We use __GFP_COMP, because we will need to - * know the allocation order to free the pages properly in kfree. - */ - -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - kmsan_kmalloc_large(ptr, size, flags); - - return ptr; -} - -void *kmalloc_large(size_t size, gfp_t flags) -{ - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); - - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), - flags, NUMA_NO_NODE); - return ret; -} -EXPORT_SYMBOL(kmalloc_large); - -void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - void *ret = __kmalloc_large_node(size, flags, node); - - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), - flags, node); - return ret; -} -EXPORT_SYMBOL(kmalloc_large_node); - #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(unsigned int *list, diff --git a/mm/slub.c b/mm/slub.c index 2baa9e94d9df..d6bc15929d22 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3851,14 +3851,6 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, - unsigned long caller) -{ - return slab_alloc_node(s, NULL, gfpflags, node, - caller, orig_size); -} - /** * kmem_cache_alloc_node - Allocate an object on the specified node * @s: The cache to allocate from. @@ -3882,6 +3874,124 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); + + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + flags |= __GFP_COMP; + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + } + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); + return ret; +} +EXPORT_SYMBOL(kmalloc_large); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); + return ret; +} +EXPORT_SYMBOL(kmalloc_large_node); + +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, + unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(caller, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + s = kmalloc_slab(size, flags, caller); + + ret = slab_alloc_node(s, NULL, flags, node, caller, size); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(caller, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, + _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); + static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, From 49378a05ce7f01a203550eb7c2ef772f6d24565c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 26 Oct 2023 17:45:42 +0200 Subject: [PATCH 0289/1562] mm/slub: remove slab_alloc() and __kmem_cache_alloc_lru() wrappers slab_alloc() is a thin wrapper around slab_alloc_node() with only one caller. Replace with direct call of slab_alloc_node(). __kmem_cache_alloc_lru() itself is a thin wrapper with two callers, so replace it with direct calls of slab_alloc_node() and trace_kmem_cache_alloc(). This also makes sure _RET_IP_ has always the expected value and not depending on inlining decisions. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d6bc15929d22..5683f1d02e4f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3821,33 +3821,26 @@ out: return object; } -static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags, unsigned long addr, size_t orig_size) +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); -} - -static __fastpath_inline -void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags) -{ - void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } - -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) -{ - return __kmem_cache_alloc_lru(s, NULL, gfpflags); -} EXPORT_SYMBOL(kmem_cache_alloc); void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { - return __kmem_cache_alloc_lru(s, lru, gfpflags); + void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_lru); From 3450a0e5a6fc4cdbd70853f12c0c332dd24c1349 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 13 Nov 2023 18:04:05 +0100 Subject: [PATCH 0290/1562] mm/slub: optimize alloc fastpath code layout With allocation fastpaths no longer divided between two .c files, we have better inlining, however checking the disassembly of kmem_cache_alloc() reveals we can do better to make the fastpaths smaller and move the less common situations out of line or to separate functions, to reduce instruction cache pressure. - split memcg pre/post alloc hooks to inlined checks that use likely() to assume there will be no objcg handling necessary, and non-inline functions doing the actual handling - add some more likely/unlikely() to pre/post alloc hooks to indicate which scenarios should be out of line - change gfp_allowed_mask handling in slab_post_alloc_hook() so the code can be optimized away when kasan/kmsan/kmemleak is configured out bloat-o-meter shows: add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403) Function old new delta __memcg_slab_post_alloc_hook - 461 +461 kmem_cache_alloc_bulk 775 791 +16 __pfx_should_failslab.constprop - 16 +16 __pfx___memcg_slab_post_alloc_hook - 16 +16 should_failslab.constprop - 12 +12 __pfx_memcg_slab_post_alloc_hook 16 - -16 kmem_cache_alloc_lru 1295 1023 -272 kmem_cache_alloc_node 1118 817 -301 kmem_cache_alloc 1076 772 -304 kmalloc_node_trace 1149 838 -311 kmalloc_trace 1102 789 -313 __kmalloc_node_track_caller 1393 1080 -313 __kmalloc_node 1397 1082 -315 __kmalloc 1374 1059 -315 memcg_slab_post_alloc_hook 464 - -464 Note that gcc still decided to inline __memcg_pre_alloc_hook(), but the code is out of line. Forcing noinline did not improve the results. As a result the fastpaths are shorter and overal code size is reduced. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 89 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5683f1d02e4f..77d259f3d592 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1866,25 +1866,17 @@ static inline size_t obj_full_size(struct kmem_cache *s) /* * Returns false if the allocation should fail. */ -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) +static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) { - struct obj_cgroup *objcg; - - if (!memcg_kmem_online()) - return true; - - if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) - return true; - /* * The obtained objcg pointer is safe to use within the current scope, * defined by current task or set_active_memcg() pair. * obj_cgroup_get() is used to get a permanent reference. */ - objcg = current_obj_cgroup(); + struct obj_cgroup *objcg = current_obj_cgroup(); if (!objcg) return true; @@ -1907,17 +1899,34 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, return true; } -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) +/* + * Returns false if the allocation should fail. + */ +static __fastpath_inline +bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + struct obj_cgroup **objcgp, size_t objects, + gfp_t flags) +{ + if (!memcg_kmem_online()) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, + flags)); +} + +static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) { struct slab *slab; unsigned long off; size_t i; - if (!memcg_kmem_online() || !objcg) - return; + flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { if (likely(p[i])) { @@ -1940,6 +1949,16 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, } } +static __fastpath_inline +void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, + gfp_t flags, size_t size, void **p) +{ + if (likely(!memcg_kmem_online() || !objcg)) + return; + + return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { @@ -3709,34 +3728,34 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) } ALLOW_ERROR_INJECTION(should_failslab, ERRNO); -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) +static __fastpath_inline +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) { flags &= gfp_allowed_mask; might_alloc(flags); - if (should_failslab(s, flags)) + if (unlikely(should_failslab(s, flags))) return NULL; - if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) return NULL; return s; } -static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init, - unsigned int orig_size) +static __fastpath_inline +void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, + gfp_t flags, size_t size, void **p, bool init, + unsigned int orig_size) { unsigned int zero_size = s->object_size; bool kasan_init = init; size_t i; - - flags &= gfp_allowed_mask; + gfp_t init_flags = flags & gfp_allowed_mask; /* * For kmalloc object, the allocated memory size(object_size) is likely @@ -3769,13 +3788,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * As p[i] might get tagged, memset and kmemleak hook come after KASAN. */ for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); + p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init); if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); + s->flags, init_flags); + kmsan_slab_alloc(s, p[i], init_flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); @@ -3799,7 +3818,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list bool init = false; s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) + if (unlikely(!s)) return NULL; object = kfence_alloc(s, orig_size, gfpflags); From ecf9a253ce120082ce0a8aff806c4de4865cfcc5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 27 Oct 2023 12:34:18 +0200 Subject: [PATCH 0291/1562] mm/slub: optimize free fast path code layout Inspection of kmem_cache_free() disassembly showed we could make the fast path smaller by providing few more hints to the compiler, and splitting the memcg_slab_free_hook() into an inline part that only checks if there's work to do, and an out of line part doing the actual uncharge. bloat-o-meter results: add/remove: 2/0 grow/shrink: 0/3 up/down: 286/-554 (-268) Function old new delta __memcg_slab_free_hook - 270 +270 __pfx___memcg_slab_free_hook - 16 +16 kfree 828 665 -163 kmem_cache_free 1116 948 -168 kmem_cache_free_bulk.part 1701 1478 -223 Checking kmem_cache_free() disassembly now shows the non-fastpath cases are handled out of line, which should reduce instruction cache usage. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 77d259f3d592..3f8b95757106 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1959,20 +1959,11 @@ void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) +static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, + struct obj_cgroup **objcgs) { - struct obj_cgroup **objcgs; - int i; - - if (!memcg_kmem_online()) - return; - - objcgs = slab_objcgs(slab); - if (!objcgs) - return; - - for (i = 0; i < objects; i++) { + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -1988,6 +1979,22 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, obj_cgroup_put(objcg); } } + +static __fastpath_inline +void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct obj_cgroup **objcgs; + + if (!memcg_kmem_online()) + return; + + objcgs = slab_objcgs(slab); + if (likely(!objcgs)) + return; + + __memcg_slab_free_hook(s, slab, p, objects, objcgs); +} #else /* CONFIG_MEMCG_KMEM */ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { @@ -2047,7 +2054,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, * The initialization memset's clear the object and the metadata, * but don't touch the SLAB redzone. */ - if (init) { + if (unlikely(init)) { int rsize; if (!kasan_has_integrated_init()) @@ -2083,7 +2090,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { + if (likely(!slab_free_hook(s, object, + slab_want_init_on_free(s)))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -4282,7 +4290,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. */ - if (slab_free_freelist_hook(s, &head, &tail, &cnt)) + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) do_slab_free(s, slab, head, tail, cnt, addr); } From 2506c1de4081249b1df9c9a7dbd3d038e691e4e5 Mon Sep 17 00:00:00 2001 From: Naresh Solanki Date: Tue, 5 Dec 2023 16:22:04 +0530 Subject: [PATCH 0292/1562] regulator: event: Add regulator netlink event support This commit introduces netlink event support to the regulator subsystem. Changes: - Introduce event.c and regnl.h for netlink event handling. - Implement reg_generate_netlink_event to broadcast regulator events. - Update Makefile to include the new event.c file. Signed-off-by: Naresh Solanki Link: https://lore.kernel.org/r/20231205105207.1262928-1-naresh.solanki@9elements.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 10 ++++ drivers/regulator/Makefile | 1 + drivers/regulator/core.c | 19 ++++++- drivers/regulator/event.c | 91 ++++++++++++++++++++++++++++++ drivers/regulator/regnl.h | 13 +++++ include/linux/regulator/consumer.h | 47 +-------------- include/uapi/regulator/regulator.h | 90 +++++++++++++++++++++++++++++ 7 files changed, 224 insertions(+), 47 deletions(-) create mode 100644 drivers/regulator/event.c create mode 100644 drivers/regulator/regnl.h create mode 100644 include/uapi/regulator/regulator.h diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index f3ec24691378..550145f82726 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -56,6 +56,16 @@ config REGULATOR_USERSPACE_CONSUMER If unsure, say no. +config REGULATOR_NETLINK_EVENTS + bool "Enable support for receiving regulator events via netlink" + depends on NET + help + Enabling this option allows the kernel to broadcast regulator events using + the netlink mechanism. User-space applications can subscribe to these events + for real-time updates on various regulator events. + + If unsure, say no. + config REGULATOR_88PG86X tristate "Marvell 88PG86X voltage regulators" depends on I2C diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index b2b059b5ee56..46fb569e6be8 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_REGULATOR) += core.o dummy.o fixed-helper.o helpers.o devres.o irq_helpers.o +obj-$(CONFIG_REGULATOR_NETLINK_EVENTS) += event.o obj-$(CONFIG_OF) += of_regulator.o obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 4aa9ec8c22f3..a968dabb48f5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -33,6 +33,7 @@ #include "dummy.h" #include "internal.h" +#include "regnl.h" static DEFINE_WW_CLASS(regulator_ww_class); static DEFINE_MUTEX(regulator_nesting_mutex); @@ -4854,7 +4855,23 @@ static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { /* call rdev chain first */ - return blocking_notifier_call_chain(&rdev->notifier, event, data); + int ret = blocking_notifier_call_chain(&rdev->notifier, event, data); + + if (IS_REACHABLE(CONFIG_REGULATOR_NETLINK_EVENTS)) { + struct device *parent = rdev->dev.parent; + const char *rname = rdev_get_name(rdev); + char name[32]; + + /* Avoid duplicate debugfs directory names */ + if (parent && rname == rdev->desc->name) { + snprintf(name, sizeof(name), "%s-%s", dev_name(parent), + rname); + rname = name; + } + reg_generate_netlink_event(rname, event); + } + + return ret; } int _regulator_bulk_get(struct device *dev, int num_consumers, diff --git a/drivers/regulator/event.c b/drivers/regulator/event.c new file mode 100644 index 000000000000..0ec58f306b38 --- /dev/null +++ b/drivers/regulator/event.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Regulator event over netlink + * + * Author: Naresh Solanki + */ + +#include +#include +#include + +#include "regnl.h" + +static unsigned int reg_event_seqnum; + +static const struct genl_multicast_group reg_event_mcgrps[] = { + { .name = REG_GENL_MCAST_GROUP_NAME, }, +}; + +static struct genl_family reg_event_genl_family __ro_after_init = { + .module = THIS_MODULE, + .name = REG_GENL_FAMILY_NAME, + .version = REG_GENL_VERSION, + .maxattr = REG_GENL_ATTR_MAX, + .mcgrps = reg_event_mcgrps, + .n_mcgrps = ARRAY_SIZE(reg_event_mcgrps), +}; + +int reg_generate_netlink_event(const char *reg_name, u64 event) +{ + struct sk_buff *skb; + struct nlattr *attr; + struct reg_genl_event *edata; + void *msg_header; + int size; + + /* allocate memory */ + size = nla_total_size(sizeof(struct reg_genl_event)) + + nla_total_size(0); + + skb = genlmsg_new(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + /* add the genetlink message header */ + msg_header = genlmsg_put(skb, 0, reg_event_seqnum++, + ®_event_genl_family, 0, + REG_GENL_CMD_EVENT); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + /* fill the data */ + attr = nla_reserve(skb, REG_GENL_ATTR_EVENT, sizeof(struct reg_genl_event)); + if (!attr) { + nlmsg_free(skb); + return -EINVAL; + } + + edata = nla_data(attr); + memset(edata, 0, sizeof(struct reg_genl_event)); + + strscpy(edata->reg_name, reg_name, sizeof(edata->reg_name)); + edata->event = event; + + /* send multicast genetlink message */ + genlmsg_end(skb, msg_header); + size = genlmsg_multicast(®_event_genl_family, skb, 0, 0, GFP_ATOMIC); + + return size; +} + +static int __init reg_event_genetlink_init(void) +{ + return genl_register_family(®_event_genl_family); +} + +static int __init reg_event_init(void) +{ + int error; + + /* create genetlink for acpi event */ + error = reg_event_genetlink_init(); + if (error) + pr_warn("Failed to create genetlink family for reg event\n"); + + return 0; +} + +fs_initcall(reg_event_init); diff --git a/drivers/regulator/regnl.h b/drivers/regulator/regnl.h new file mode 100644 index 000000000000..bcba16cc05cc --- /dev/null +++ b/drivers/regulator/regnl.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Regulator event over netlink + * + * Author: Naresh Solanki + */ + +#ifndef __REGULATOR_EVENT_H +#define __REGULATOR_EVENT_H + +int reg_generate_netlink_event(const char *reg_name, u64 event); + +#endif diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 39b666b40ea6..4660582a3302 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -33,6 +33,7 @@ #include #include +#include struct device; struct notifier_block; @@ -84,52 +85,6 @@ struct regulator_dev; #define REGULATOR_MODE_IDLE 0x4 #define REGULATOR_MODE_STANDBY 0x8 -/* - * Regulator notifier events. - * - * UNDER_VOLTAGE Regulator output is under voltage. - * OVER_CURRENT Regulator output current is too high. - * REGULATION_OUT Regulator output is out of regulation. - * FAIL Regulator output has failed. - * OVER_TEMP Regulator over temp. - * FORCE_DISABLE Regulator forcibly shut down by software. - * VOLTAGE_CHANGE Regulator voltage changed. - * Data passed is old voltage cast to (void *). - * DISABLE Regulator was disabled. - * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. - * Data passed is "struct pre_voltage_change_data" - * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. - * Data passed is old voltage cast to (void *). - * PRE_DISABLE Regulator is about to be disabled - * ABORT_DISABLE Regulator disable failed for some reason - * - * NOTE: These events can be OR'ed together when passed into handler. - */ - -#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 -#define REGULATOR_EVENT_OVER_CURRENT 0x02 -#define REGULATOR_EVENT_REGULATION_OUT 0x04 -#define REGULATOR_EVENT_FAIL 0x08 -#define REGULATOR_EVENT_OVER_TEMP 0x10 -#define REGULATOR_EVENT_FORCE_DISABLE 0x20 -#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 -#define REGULATOR_EVENT_DISABLE 0x80 -#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 -#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 -#define REGULATOR_EVENT_PRE_DISABLE 0x400 -#define REGULATOR_EVENT_ABORT_DISABLE 0x800 -#define REGULATOR_EVENT_ENABLE 0x1000 -/* - * Following notifications should be emitted only if detected condition - * is such that the HW is likely to still be working but consumers should - * take a recovery action to prevent problems esacalating into errors. - */ -#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 -#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 -#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 -#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 -#define REGULATOR_EVENT_WARN_MASK 0x1E000 - /* * Regulator errors that can be queried using regulator_get_error_flags * diff --git a/include/uapi/regulator/regulator.h b/include/uapi/regulator/regulator.h new file mode 100644 index 000000000000..d2b5612198b6 --- /dev/null +++ b/include/uapi/regulator/regulator.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Regulator uapi header + * + * Author: Naresh Solanki + */ + +#ifndef _UAPI_REGULATOR_H +#define _UAPI_REGULATOR_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +/* + * Regulator notifier events. + * + * UNDER_VOLTAGE Regulator output is under voltage. + * OVER_CURRENT Regulator output current is too high. + * REGULATION_OUT Regulator output is out of regulation. + * FAIL Regulator output has failed. + * OVER_TEMP Regulator over temp. + * FORCE_DISABLE Regulator forcibly shut down by software. + * VOLTAGE_CHANGE Regulator voltage changed. + * Data passed is old voltage cast to (void *). + * DISABLE Regulator was disabled. + * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. + * Data passed is "struct pre_voltage_change_data" + * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. + * Data passed is old voltage cast to (void *). + * PRE_DISABLE Regulator is about to be disabled + * ABORT_DISABLE Regulator disable failed for some reason + * + * NOTE: These events can be OR'ed together when passed into handler. + */ + +#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 +#define REGULATOR_EVENT_OVER_CURRENT 0x02 +#define REGULATOR_EVENT_REGULATION_OUT 0x04 +#define REGULATOR_EVENT_FAIL 0x08 +#define REGULATOR_EVENT_OVER_TEMP 0x10 +#define REGULATOR_EVENT_FORCE_DISABLE 0x20 +#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 +#define REGULATOR_EVENT_DISABLE 0x80 +#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 +#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 +#define REGULATOR_EVENT_PRE_DISABLE 0x400 +#define REGULATOR_EVENT_ABORT_DISABLE 0x800 +#define REGULATOR_EVENT_ENABLE 0x1000 +/* + * Following notifications should be emitted only if detected condition + * is such that the HW is likely to still be working but consumers should + * take a recovery action to prevent problems esacalating into errors. + */ +#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 +#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 +#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 +#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 +#define REGULATOR_EVENT_WARN_MASK 0x1E000 + +struct reg_genl_event { + char reg_name[32]; + uint64_t event; +}; + +/* attributes of reg_genl_family */ +enum { + REG_GENL_ATTR_UNSPEC, + REG_GENL_ATTR_EVENT, /* reg event info needed by user space */ + __REG_GENL_ATTR_MAX, +}; + +#define REG_GENL_ATTR_MAX (__REG_GENL_ATTR_MAX - 1) + +/* commands supported by the reg_genl_family */ +enum { + REG_GENL_CMD_UNSPEC, + REG_GENL_CMD_EVENT, /* kernel->user notifications for reg events */ + __REG_GENL_CMD_MAX, +}; + +#define REG_GENL_CMD_MAX (__REG_GENL_CMD_MAX - 1) + +#define REG_GENL_FAMILY_NAME "reg_event" +#define REG_GENL_VERSION 0x01 +#define REG_GENL_MCAST_GROUP_NAME "reg_mc_group" + +#endif /* _UAPI_REGULATOR_H */ From 16e5ac127d8d18adf85fe5ba847d77b58d1ed418 Mon Sep 17 00:00:00 2001 From: Naresh Solanki Date: Tue, 5 Dec 2023 16:22:04 +0530 Subject: [PATCH 0293/1562] regulator: event: Add regulator netlink event support This commit introduces netlink event support to the regulator subsystem. Changes: - Introduce event.c and regnl.h for netlink event handling. - Implement reg_generate_netlink_event to broadcast regulator events. - Update Makefile to include the new event.c file. Signed-off-by: Naresh Solanki Link: https://lore.kernel.org/r/20231205105207.1262928-1-naresh.solanki@9elements.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 10 ++++ drivers/regulator/Makefile | 1 + drivers/regulator/core.c | 19 ++++++- drivers/regulator/event.c | 91 ++++++++++++++++++++++++++++++ drivers/regulator/regnl.h | 13 +++++ include/linux/regulator/consumer.h | 47 +-------------- include/uapi/regulator/regulator.h | 90 +++++++++++++++++++++++++++++ 7 files changed, 224 insertions(+), 47 deletions(-) create mode 100644 drivers/regulator/event.c create mode 100644 drivers/regulator/regnl.h create mode 100644 include/uapi/regulator/regulator.h diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index f3ec24691378..550145f82726 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -56,6 +56,16 @@ config REGULATOR_USERSPACE_CONSUMER If unsure, say no. +config REGULATOR_NETLINK_EVENTS + bool "Enable support for receiving regulator events via netlink" + depends on NET + help + Enabling this option allows the kernel to broadcast regulator events using + the netlink mechanism. User-space applications can subscribe to these events + for real-time updates on various regulator events. + + If unsure, say no. + config REGULATOR_88PG86X tristate "Marvell 88PG86X voltage regulators" depends on I2C diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index b2b059b5ee56..46fb569e6be8 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_REGULATOR) += core.o dummy.o fixed-helper.o helpers.o devres.o irq_helpers.o +obj-$(CONFIG_REGULATOR_NETLINK_EVENTS) += event.o obj-$(CONFIG_OF) += of_regulator.o obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 4aa9ec8c22f3..a968dabb48f5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -33,6 +33,7 @@ #include "dummy.h" #include "internal.h" +#include "regnl.h" static DEFINE_WW_CLASS(regulator_ww_class); static DEFINE_MUTEX(regulator_nesting_mutex); @@ -4854,7 +4855,23 @@ static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { /* call rdev chain first */ - return blocking_notifier_call_chain(&rdev->notifier, event, data); + int ret = blocking_notifier_call_chain(&rdev->notifier, event, data); + + if (IS_REACHABLE(CONFIG_REGULATOR_NETLINK_EVENTS)) { + struct device *parent = rdev->dev.parent; + const char *rname = rdev_get_name(rdev); + char name[32]; + + /* Avoid duplicate debugfs directory names */ + if (parent && rname == rdev->desc->name) { + snprintf(name, sizeof(name), "%s-%s", dev_name(parent), + rname); + rname = name; + } + reg_generate_netlink_event(rname, event); + } + + return ret; } int _regulator_bulk_get(struct device *dev, int num_consumers, diff --git a/drivers/regulator/event.c b/drivers/regulator/event.c new file mode 100644 index 000000000000..0ec58f306b38 --- /dev/null +++ b/drivers/regulator/event.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Regulator event over netlink + * + * Author: Naresh Solanki + */ + +#include +#include +#include + +#include "regnl.h" + +static unsigned int reg_event_seqnum; + +static const struct genl_multicast_group reg_event_mcgrps[] = { + { .name = REG_GENL_MCAST_GROUP_NAME, }, +}; + +static struct genl_family reg_event_genl_family __ro_after_init = { + .module = THIS_MODULE, + .name = REG_GENL_FAMILY_NAME, + .version = REG_GENL_VERSION, + .maxattr = REG_GENL_ATTR_MAX, + .mcgrps = reg_event_mcgrps, + .n_mcgrps = ARRAY_SIZE(reg_event_mcgrps), +}; + +int reg_generate_netlink_event(const char *reg_name, u64 event) +{ + struct sk_buff *skb; + struct nlattr *attr; + struct reg_genl_event *edata; + void *msg_header; + int size; + + /* allocate memory */ + size = nla_total_size(sizeof(struct reg_genl_event)) + + nla_total_size(0); + + skb = genlmsg_new(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + /* add the genetlink message header */ + msg_header = genlmsg_put(skb, 0, reg_event_seqnum++, + ®_event_genl_family, 0, + REG_GENL_CMD_EVENT); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + /* fill the data */ + attr = nla_reserve(skb, REG_GENL_ATTR_EVENT, sizeof(struct reg_genl_event)); + if (!attr) { + nlmsg_free(skb); + return -EINVAL; + } + + edata = nla_data(attr); + memset(edata, 0, sizeof(struct reg_genl_event)); + + strscpy(edata->reg_name, reg_name, sizeof(edata->reg_name)); + edata->event = event; + + /* send multicast genetlink message */ + genlmsg_end(skb, msg_header); + size = genlmsg_multicast(®_event_genl_family, skb, 0, 0, GFP_ATOMIC); + + return size; +} + +static int __init reg_event_genetlink_init(void) +{ + return genl_register_family(®_event_genl_family); +} + +static int __init reg_event_init(void) +{ + int error; + + /* create genetlink for acpi event */ + error = reg_event_genetlink_init(); + if (error) + pr_warn("Failed to create genetlink family for reg event\n"); + + return 0; +} + +fs_initcall(reg_event_init); diff --git a/drivers/regulator/regnl.h b/drivers/regulator/regnl.h new file mode 100644 index 000000000000..bcba16cc05cc --- /dev/null +++ b/drivers/regulator/regnl.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Regulator event over netlink + * + * Author: Naresh Solanki + */ + +#ifndef __REGULATOR_EVENT_H +#define __REGULATOR_EVENT_H + +int reg_generate_netlink_event(const char *reg_name, u64 event); + +#endif diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 39b666b40ea6..4660582a3302 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -33,6 +33,7 @@ #include #include +#include struct device; struct notifier_block; @@ -84,52 +85,6 @@ struct regulator_dev; #define REGULATOR_MODE_IDLE 0x4 #define REGULATOR_MODE_STANDBY 0x8 -/* - * Regulator notifier events. - * - * UNDER_VOLTAGE Regulator output is under voltage. - * OVER_CURRENT Regulator output current is too high. - * REGULATION_OUT Regulator output is out of regulation. - * FAIL Regulator output has failed. - * OVER_TEMP Regulator over temp. - * FORCE_DISABLE Regulator forcibly shut down by software. - * VOLTAGE_CHANGE Regulator voltage changed. - * Data passed is old voltage cast to (void *). - * DISABLE Regulator was disabled. - * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. - * Data passed is "struct pre_voltage_change_data" - * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. - * Data passed is old voltage cast to (void *). - * PRE_DISABLE Regulator is about to be disabled - * ABORT_DISABLE Regulator disable failed for some reason - * - * NOTE: These events can be OR'ed together when passed into handler. - */ - -#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 -#define REGULATOR_EVENT_OVER_CURRENT 0x02 -#define REGULATOR_EVENT_REGULATION_OUT 0x04 -#define REGULATOR_EVENT_FAIL 0x08 -#define REGULATOR_EVENT_OVER_TEMP 0x10 -#define REGULATOR_EVENT_FORCE_DISABLE 0x20 -#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 -#define REGULATOR_EVENT_DISABLE 0x80 -#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 -#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 -#define REGULATOR_EVENT_PRE_DISABLE 0x400 -#define REGULATOR_EVENT_ABORT_DISABLE 0x800 -#define REGULATOR_EVENT_ENABLE 0x1000 -/* - * Following notifications should be emitted only if detected condition - * is such that the HW is likely to still be working but consumers should - * take a recovery action to prevent problems esacalating into errors. - */ -#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 -#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 -#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 -#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 -#define REGULATOR_EVENT_WARN_MASK 0x1E000 - /* * Regulator errors that can be queried using regulator_get_error_flags * diff --git a/include/uapi/regulator/regulator.h b/include/uapi/regulator/regulator.h new file mode 100644 index 000000000000..d2b5612198b6 --- /dev/null +++ b/include/uapi/regulator/regulator.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Regulator uapi header + * + * Author: Naresh Solanki + */ + +#ifndef _UAPI_REGULATOR_H +#define _UAPI_REGULATOR_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +/* + * Regulator notifier events. + * + * UNDER_VOLTAGE Regulator output is under voltage. + * OVER_CURRENT Regulator output current is too high. + * REGULATION_OUT Regulator output is out of regulation. + * FAIL Regulator output has failed. + * OVER_TEMP Regulator over temp. + * FORCE_DISABLE Regulator forcibly shut down by software. + * VOLTAGE_CHANGE Regulator voltage changed. + * Data passed is old voltage cast to (void *). + * DISABLE Regulator was disabled. + * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. + * Data passed is "struct pre_voltage_change_data" + * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. + * Data passed is old voltage cast to (void *). + * PRE_DISABLE Regulator is about to be disabled + * ABORT_DISABLE Regulator disable failed for some reason + * + * NOTE: These events can be OR'ed together when passed into handler. + */ + +#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 +#define REGULATOR_EVENT_OVER_CURRENT 0x02 +#define REGULATOR_EVENT_REGULATION_OUT 0x04 +#define REGULATOR_EVENT_FAIL 0x08 +#define REGULATOR_EVENT_OVER_TEMP 0x10 +#define REGULATOR_EVENT_FORCE_DISABLE 0x20 +#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 +#define REGULATOR_EVENT_DISABLE 0x80 +#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 +#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 +#define REGULATOR_EVENT_PRE_DISABLE 0x400 +#define REGULATOR_EVENT_ABORT_DISABLE 0x800 +#define REGULATOR_EVENT_ENABLE 0x1000 +/* + * Following notifications should be emitted only if detected condition + * is such that the HW is likely to still be working but consumers should + * take a recovery action to prevent problems esacalating into errors. + */ +#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 +#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 +#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 +#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 +#define REGULATOR_EVENT_WARN_MASK 0x1E000 + +struct reg_genl_event { + char reg_name[32]; + uint64_t event; +}; + +/* attributes of reg_genl_family */ +enum { + REG_GENL_ATTR_UNSPEC, + REG_GENL_ATTR_EVENT, /* reg event info needed by user space */ + __REG_GENL_ATTR_MAX, +}; + +#define REG_GENL_ATTR_MAX (__REG_GENL_ATTR_MAX - 1) + +/* commands supported by the reg_genl_family */ +enum { + REG_GENL_CMD_UNSPEC, + REG_GENL_CMD_EVENT, /* kernel->user notifications for reg events */ + __REG_GENL_CMD_MAX, +}; + +#define REG_GENL_CMD_MAX (__REG_GENL_CMD_MAX - 1) + +#define REG_GENL_FAMILY_NAME "reg_event" +#define REG_GENL_VERSION 0x01 +#define REG_GENL_MCAST_GROUP_NAME "reg_mc_group" + +#endif /* _UAPI_REGULATOR_H */ From 2e0d75f8dd9e31b3fb175f780494dd7dd988ceae Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:27 -0600 Subject: [PATCH 0294/1562] spi: axi-spi-engine: return void from spi_engine_compile_message() In the AXI SPI Engine driver, the spi_engine_compile_message() function does not return any error and none of the callers check the return value. So we can change the return type to void and drop the return 0. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-1-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index cbca783830ea..982b37ac3063 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -218,7 +218,7 @@ static void spi_engine_gen_cs(struct spi_engine_program *p, bool dry, spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_ASSERT(1, mask)); } -static int spi_engine_compile_message(struct spi_engine *spi_engine, +static void spi_engine_compile_message(struct spi_engine *spi_engine, struct spi_message *msg, bool dry, struct spi_engine_program *p) { struct spi_device *spi = msg->spi; @@ -273,8 +273,6 @@ static int spi_engine_compile_message(struct spi_engine *spi_engine, if (!keep_cs) spi_engine_gen_cs(p, dry, spi, false); - - return 0; } static void spi_engine_xfer_next(struct spi_message *msg, From 9d023ecc31859c7f7c8ca27b5fec52b2dbb8086f Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:28 -0600 Subject: [PATCH 0295/1562] spi: axi-spi-engine: populate xfer->effective_speed_hz This adds a new spi_engine_precompile_message() function to the ADI AXI SPI Engine driver to populate the xfer->effective_speed_hz field since the SPI core doesn't/can't do this for us. This driver is already using spi_delay_to_ns() which depends on effective_speed_hz to get an accurate value in some cases. Having an effective_speed_hz value can also be used in future changes to simplify other code. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-2-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 982b37ac3063..ee7b904ae5cf 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -218,6 +218,27 @@ static void spi_engine_gen_cs(struct spi_engine_program *p, bool dry, spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_ASSERT(1, mask)); } +/* + * Performs precompile steps on the message. + * + * The SPI core does most of the message/transfer validation and filling in + * fields for us via __spi_validate(). This fixes up anything remaining not + * done there. + * + * NB: This is separate from spi_engine_compile_message() because the latter + * is called twice and would otherwise result in double-evaluation. + */ +static void spi_engine_precompile_message(struct spi_message *msg) +{ + unsigned int clk_div, max_hz = msg->spi->controller->max_speed_hz; + struct spi_transfer *xfer; + + list_for_each_entry(xfer, &msg->transfers, transfer_list) { + clk_div = DIV_ROUND_UP(max_hz, xfer->speed_hz); + xfer->effective_speed_hz = max_hz / min(clk_div, 256U); + } +} + static void spi_engine_compile_message(struct spi_engine *spi_engine, struct spi_message *msg, bool dry, struct spi_engine_program *p) { @@ -504,6 +525,8 @@ static int spi_engine_prepare_message(struct spi_controller *host, if (!st) return -ENOMEM; + spi_engine_precompile_message(msg); + p_dry.length = 0; spi_engine_compile_message(spi_engine, msg, true, &p_dry); From 1fc8dc5721bbc7a21cb4cc60c35eb8031942542b Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:29 -0600 Subject: [PATCH 0296/1562] spi: axi-spi-engine: remove spi_engine_get_clk_div() Now that host->max_speed_hz and xfer->effective_speed_hz are properly set, we can use them instead of having to do more complex calculations to get the clock divider for each transfer. This removes the spi_engine_get_clk_div() function and replaces it with just dividing the two clock rates. Since the hardware register value is the divider minus one, we need to subtract one. Subtracting one was previously done in the spi_engine_get_clk_div() function. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-3-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index ee7b904ae5cf..fa2264d630c3 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -140,21 +140,6 @@ static unsigned int spi_engine_get_config(struct spi_device *spi) return config; } -static unsigned int spi_engine_get_clk_div(struct spi_engine *spi_engine, - struct spi_device *spi, struct spi_transfer *xfer) -{ - unsigned int clk_div; - - clk_div = DIV_ROUND_UP(clk_get_rate(spi_engine->ref_clk), - xfer->speed_hz * 2); - if (clk_div > 255) - clk_div = 255; - else if (clk_div > 0) - clk_div -= 1; - - return clk_div; -} - static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry, struct spi_transfer *xfer) { @@ -243,6 +228,7 @@ static void spi_engine_compile_message(struct spi_engine *spi_engine, struct spi_message *msg, bool dry, struct spi_engine_program *p) { struct spi_device *spi = msg->spi; + struct spi_controller *host = spi->controller; struct spi_transfer *xfer; int clk_div, new_clk_div; bool keep_cs = false; @@ -258,12 +244,13 @@ static void spi_engine_compile_message(struct spi_engine *spi_engine, spi_engine_gen_cs(p, dry, spi, !xfer->cs_off); list_for_each_entry(xfer, &msg->transfers, transfer_list) { - new_clk_div = spi_engine_get_clk_div(spi_engine, spi, xfer); + new_clk_div = host->max_speed_hz / xfer->effective_speed_hz; if (new_clk_div != clk_div) { clk_div = new_clk_div; + /* actual divider used is register value + 1 */ spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_WRITE(SPI_ENGINE_CMD_REG_CLK_DIV, - clk_div)); + clk_div - 1)); } if (bits_per_word != xfer->bits_per_word) { @@ -274,7 +261,7 @@ static void spi_engine_compile_message(struct spi_engine *spi_engine, } spi_engine_gen_xfer(p, dry, xfer); - spi_engine_gen_sleep(p, dry, spi_engine, clk_div, xfer); + spi_engine_gen_sleep(p, dry, spi_engine, clk_div - 1, xfer); if (xfer->cs_change) { if (list_is_last(&xfer->transfer_list, &msg->transfers)) { From be9070bcf67057b7b03c5acc1980d3897448ad20 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:30 -0600 Subject: [PATCH 0297/1562] spi: axi-spi-engine: fix sleep ticks calculation This fixes the sleep ticks calculation when generating sleep instructions in the AXI SPI Engine driver. The previous calculation was ignoring delays less than one microsecond and missed a microsecond to second conversion factor. This fixes the first issue by not rounding to microseconds. Now that xfer->effective_speed_hz is guaranteed to be set correctly, we can use that to simplify the calculation. This new calculation replaces the old incorrect math. Also add unit suffix to the delay variable for clarity while we are touching this. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-4-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index fa2264d630c3..b3e72308fcc5 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -168,22 +168,17 @@ static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry, } static void spi_engine_gen_sleep(struct spi_engine_program *p, bool dry, - struct spi_engine *spi_engine, unsigned int clk_div, struct spi_transfer *xfer) { - unsigned int spi_clk = clk_get_rate(spi_engine->ref_clk); unsigned int t; - int delay; + int delay_ns; - delay = spi_delay_to_ns(&xfer->delay, xfer); - if (delay < 0) - return; - delay /= 1000; - - if (delay == 0) + delay_ns = spi_delay_to_ns(&xfer->delay, xfer); + if (delay_ns <= 0) return; - t = DIV_ROUND_UP(delay * spi_clk, (clk_div + 1) * 2); + /* rounding down since executing the instruction adds a couple of ticks delay */ + t = DIV_ROUND_DOWN_ULL((u64)delay_ns * xfer->effective_speed_hz, NSEC_PER_SEC); while (t) { unsigned int n = min(t, 256U); @@ -224,8 +219,8 @@ static void spi_engine_precompile_message(struct spi_message *msg) } } -static void spi_engine_compile_message(struct spi_engine *spi_engine, - struct spi_message *msg, bool dry, struct spi_engine_program *p) +static void spi_engine_compile_message(struct spi_message *msg, bool dry, + struct spi_engine_program *p) { struct spi_device *spi = msg->spi; struct spi_controller *host = spi->controller; @@ -261,7 +256,7 @@ static void spi_engine_compile_message(struct spi_engine *spi_engine, } spi_engine_gen_xfer(p, dry, xfer); - spi_engine_gen_sleep(p, dry, spi_engine, clk_div - 1, xfer); + spi_engine_gen_sleep(p, dry, xfer); if (xfer->cs_change) { if (list_is_last(&xfer->transfer_list, &msg->transfers)) { @@ -515,7 +510,7 @@ static int spi_engine_prepare_message(struct spi_controller *host, spi_engine_precompile_message(msg); p_dry.length = 0; - spi_engine_compile_message(spi_engine, msg, true, &p_dry); + spi_engine_compile_message(msg, true, &p_dry); size = sizeof(*p->instructions) * (p_dry.length + 1); p = kzalloc(sizeof(*p) + size, GFP_KERNEL); @@ -533,7 +528,7 @@ static int spi_engine_prepare_message(struct spi_controller *host, st->sync_id = ret; - spi_engine_compile_message(spi_engine, msg, false, p); + spi_engine_compile_message(msg, false, p); spi_engine_program_add_cmd(p, false, SPI_ENGINE_CMD_SYNC(st->sync_id)); From e006c181dd9ab006d7b0982d35ef7951fbffe825 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:31 -0600 Subject: [PATCH 0298/1562] spi: axi-spi-engine: remove xfer arg from spi_engine_gen_sleep() This replaces the xfer parameter of spi_engine_gen_sleep() in the AXI SPI Engine driver with parameters for the delay in nanoseconds and the SPI SCLK rate. This will allow this function to be used by callers in the future that do not have a spi_transfer struct. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-5-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index b3e72308fcc5..84ec37732d8b 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -168,17 +168,16 @@ static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry, } static void spi_engine_gen_sleep(struct spi_engine_program *p, bool dry, - struct spi_transfer *xfer) + int delay_ns, u32 sclk_hz) { unsigned int t; - int delay_ns; - delay_ns = spi_delay_to_ns(&xfer->delay, xfer); + /* negative delay indicates error, e.g. from spi_delay_to_ns() */ if (delay_ns <= 0) return; /* rounding down since executing the instruction adds a couple of ticks delay */ - t = DIV_ROUND_DOWN_ULL((u64)delay_ns * xfer->effective_speed_hz, NSEC_PER_SEC); + t = DIV_ROUND_DOWN_ULL((u64)delay_ns * sclk_hz, NSEC_PER_SEC); while (t) { unsigned int n = min(t, 256U); @@ -256,7 +255,8 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, } spi_engine_gen_xfer(p, dry, xfer); - spi_engine_gen_sleep(p, dry, xfer); + spi_engine_gen_sleep(p, dry, spi_delay_to_ns(&xfer->delay, xfer), + xfer->effective_speed_hz); if (xfer->cs_change) { if (list_is_last(&xfer->transfer_list, &msg->transfers)) { From 125a8390995df1a350e9e16e6da11d010e1e7f76 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:32 -0600 Subject: [PATCH 0299/1562] spi: axi-spi-engine: implement xfer->cs_change_delay This adds handling of xfer->cs_change_delay to the AXI SPI Engine driver. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-6-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 84ec37732d8b..3437829ef8b1 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -265,6 +265,10 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, if (!xfer->cs_off) spi_engine_gen_cs(p, dry, spi, false); + spi_engine_gen_sleep(p, dry, spi_delay_to_ns( + &xfer->cs_change_delay, xfer), + xfer->effective_speed_hz); + if (!list_next_entry(xfer, transfer_list)->cs_off) spi_engine_gen_cs(p, dry, spi, true); } From 3106edac599f59e1298b034a19a43e7da002fccc Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:33 -0600 Subject: [PATCH 0300/1562] spi: axi-spi-engine: restore clkdiv at end of message This modifies the ADI AXI SPI Engine driver to restore the clkdiv configuration register at the end of a SPI message. Having the clkdiv in a known state is needed to be able to add a new command in the future that only performs a delay without any SPI transfers. Furthermore having that state be the smallest possible divider will allow these delays to have the highest possible precision. Changing the initial value of clk_div from -1 to 1 is now possible because we know the function will always be called with a known clkdiv config register state. Making this change will also have the effect of not emitting a clkdiv configuration register instruction in cases where the maximum sclk rate is used. Having one less instruction to process reduces delays on the bus which will be beneficial when we implement offload support to enable reading data from devices at very high rates. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-7-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 3437829ef8b1..3798f96da586 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -228,7 +228,7 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, bool keep_cs = false; u8 bits_per_word = 0; - clk_div = -1; + clk_div = 1; spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_WRITE(SPI_ENGINE_CMD_REG_CONFIG, @@ -280,6 +280,14 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, if (!keep_cs) spi_engine_gen_cs(p, dry, spi, false); + + /* + * Restore clockdiv to default so that future gen_sleep commands don't + * have to be aware of the current register state. + */ + if (clk_div != 1) + spi_engine_program_add_cmd(p, dry, + SPI_ENGINE_CMD_WRITE(SPI_ENGINE_CMD_REG_CLK_DIV, 0)); } static void spi_engine_xfer_next(struct spi_message *msg, From 0db60d821e485a1c9b8080dbec1ba9871efb6a65 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:34 -0600 Subject: [PATCH 0301/1562] spi: axi-spi-engine: remove delay from CS assertion Now that the AXI SPI Engine driver has support for the various CS delays requested through struct spi_message, we don't need to add a separate delay to the CS assertion instruction. Otherwise, we end up with longer than requested delays. Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-8-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 3798f96da586..78221715ba81 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -194,7 +194,7 @@ static void spi_engine_gen_cs(struct spi_engine_program *p, bool dry, if (assert) mask ^= BIT(spi_get_chipselect(spi, 0)); - spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_ASSERT(1, mask)); + spi_engine_program_add_cmd(p, dry, SPI_ENGINE_CMD_ASSERT(0, mask)); } /* From 07d33c2810bb5fe67747d11f76980ed68602e287 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 4 Dec 2023 11:33:35 -0600 Subject: [PATCH 0302/1562] spi: axi-spi-engine: add watchdog timer If there is an issue with the AXI SPI Engine hardware a scheduled transfer might never be completed and spi_sync() will block forever. This due to the uninterruptible wait for completion waiting for the spi_finalize_current_message() that never comes. Add a watchdog timer that will abort a transfer 5 seconds after it has been started. This will potentially leave the hardware in a broken state but it allows software to recover and allow to better diagnose the underlying issue. Co-developed-by: Lars-Peter Clausen Signed-off-by: Lars-Peter Clausen Signed-off-by: David Lechner Acked-by: Michael Hennerich Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20231204-axi-spi-engine-series-2-v1-9-063672323fce@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 78221715ba81..58280dd1c901 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -13,6 +13,7 @@ #include #include #include +#include #define SPI_ENGINE_VERSION_MAJOR(x) ((x >> 16) & 0xff) #define SPI_ENGINE_VERSION_MINOR(x) ((x >> 8) & 0xff) @@ -114,6 +115,8 @@ struct spi_engine { void __iomem *base; struct ida sync_ida; + struct timer_list watchdog_timer; + struct spi_controller *controller; unsigned int int_enable; }; @@ -488,9 +491,11 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) struct spi_engine_message_state *st = msg->state; if (completed_id == st->sync_id) { - msg->status = 0; - msg->actual_length = msg->frame_length; - spi_finalize_current_message(host); + if (timer_delete_sync(&spi_engine->watchdog_timer)) { + msg->status = 0; + msg->actual_length = msg->frame_length; + spi_finalize_current_message(host); + } disable_int |= SPI_ENGINE_INT_SYNC; } } @@ -573,6 +578,8 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, unsigned int int_enable = 0; unsigned long flags; + mod_timer(&spi_engine->watchdog_timer, jiffies + msecs_to_jiffies(5000)); + spin_lock_irqsave(&spi_engine->lock, flags); if (spi_engine_write_cmd_fifo(spi_engine, msg)) @@ -596,6 +603,20 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, return 0; } +static void spi_engine_timeout(struct timer_list *timer) +{ + struct spi_engine *spi_engine = from_timer(spi_engine, timer, watchdog_timer); + struct spi_controller *host = spi_engine->controller; + + if (WARN_ON(!host->cur_msg)) + return; + + dev_err(&host->dev, + "Timeout occurred while waiting for transfer to complete. Hardware is probably broken.\n"); + host->cur_msg->status = -ETIMEDOUT; + spi_finalize_current_message(host); +} + static void spi_engine_release_hw(void *p) { struct spi_engine *spi_engine = p; @@ -625,6 +646,8 @@ static int spi_engine_probe(struct platform_device *pdev) spin_lock_init(&spi_engine->lock); ida_init(&spi_engine->sync_ida); + timer_setup(&spi_engine->watchdog_timer, spi_engine_timeout, TIMER_IRQSAFE); + spi_engine->controller = host; spi_engine->clk = devm_clk_get_enabled(&pdev->dev, "s_axi_aclk"); if (IS_ERR(spi_engine->clk)) From 15bece7bec0df91a8ed1c185483d67708425ca8e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 24 Nov 2023 20:16:15 +0800 Subject: [PATCH 0303/1562] cpu/hotplug: Remove unused CPU hotplug states There are unused hotplug states which either have never been used or the removal of the usage did not remove the state constant. Drop them to reduce the size of the cpuhp_hp_states array. Signed-off-by: Zenghui Yu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231124121615.1604-1-yuzenghui@huawei.com --- include/linux/cpuhotplug.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index efc0c0b07efb..af6c21aab985 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -66,15 +66,12 @@ enum cpuhp_state { CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, - CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, - /* Must be after CPUHP_MM_VMSTAT_DEAD */ - CPUHP_MM_DEMOTION_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, @@ -96,7 +93,6 @@ enum cpuhp_state { CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_IOVA_DEAD, - CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_AP_DTPM_CPU_DEAD, @@ -118,7 +114,6 @@ enum cpuhp_state { CPUHP_XEN_EVTCHN_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, - CPUHP_NET_FLOW_PREPARE, CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, @@ -151,18 +146,14 @@ enum cpuhp_state { CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, - CPUHP_AP_IRQ_RISCV_STARTING, CPUHP_AP_IRQ_LOONGARCH_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, - CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, - CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, @@ -179,7 +170,6 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, - CPUHP_AP_MARCO_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, @@ -217,9 +207,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, - CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, - CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, @@ -252,8 +240,6 @@ enum cpuhp_state { CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, - /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */ - CPUHP_AP_MM_DEMOTION_ONLINE, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, From 57b8543ceee82ea72be1745a6dc3a9111d55a151 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:13 +0530 Subject: [PATCH 0304/1562] ACPI: bus: update acpi_dev_uid_match() to support multiple types According to the ACPI specification, a _UID object can evaluate to either a numeric value or a string. Update acpi_dev_uid_match() to support _UID matching for both integer and string types. Suggested-by: Mika Westerberg Signed-off-by: Raag Jadav [ rjw: Rename auxiliary macros, relocate kerneldoc comment ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 19 ------------------- include/acpi/acpi_bus.h | 41 ++++++++++++++++++++++++++++++++++++++++- include/linux/acpi.h | 8 +++----- 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 28c75242fca9..fe7e850c6479 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -824,25 +824,6 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) } EXPORT_SYMBOL(acpi_check_dsm); -/** - * acpi_dev_uid_match - Match device by supplied UID - * @adev: ACPI device to match. - * @uid2: Unique ID of the device. - * - * Matches UID in @adev with given @uid2. - * - * Returns: - * - %true if matches. - * - %false otherwise. - */ -bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2) -{ - const char *uid1 = acpi_device_uid(adev); - - return uid1 && uid2 && !strcmp(uid1, uid2); -} -EXPORT_SYMBOL_GPL(acpi_dev_uid_match); - /** * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 1216d72c650f..4bde0b417476 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -764,10 +764,49 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev) adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } -bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2); bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2); int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); +static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2) +{ + const char *uid1 = acpi_device_uid(adev); + + return uid1 && uid2 && !strcmp(uid1, uid2); +} + +static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) +{ + u64 uid1; + + return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2; +} + +#define TYPE_ENTRY(type, x) \ + const type: x, \ + type: x + +#define ACPI_STR_TYPES(match) \ + TYPE_ENTRY(unsigned char *, match), \ + TYPE_ENTRY(signed char *, match), \ + TYPE_ENTRY(char *, match), \ + TYPE_ENTRY(void *, match) + +/** + * acpi_dev_uid_match - Match device by supplied UID + * @adev: ACPI device to match. + * @uid2: Unique ID of the device. + * + * Matches UID in @adev with given @uid2. + * + * Returns: %true if matches, %false otherwise. + */ +#define acpi_dev_uid_match(adev, uid2) \ + _Generic(uid2, \ + /* Treat @uid2 as a string for acpi string types */ \ + ACPI_STR_TYPES(acpi_str_uid_match), \ + /* Treat as an integer otherwise */ \ + default: acpi_int_uid_match)(adev, uid2) + void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4db54e928b36..2abe81f074de 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -756,6 +756,9 @@ const char *acpi_get_subsystem_id(acpi_handle handle); #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) +/* Get rid of the -Wunused-variable for adev */ +#define acpi_dev_uid_match(adev, uid2) (adev && false) + #include struct fwnode_handle; @@ -772,11 +775,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) struct acpi_device; -static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2) -{ - return false; -} - static inline bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) { From b2b32a1738815155d4a0039bb7a6092d40f23e81 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:14 +0530 Subject: [PATCH 0305/1562] ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types Now that we have _UID matching support for both integer and string types, we can support them into acpi_dev_hid_uid_match() helper as well. Signed-off-by: Raag Jadav Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 29 ----------------------------- include/acpi/acpi_bus.h | 24 +++++++++++++++++++++++- include/linux/acpi.h | 7 +------ 3 files changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index fe7e850c6479..03f6de9a0807 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -824,35 +824,6 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) } EXPORT_SYMBOL(acpi_check_dsm); -/** - * acpi_dev_hid_uid_match - Match device by supplied HID and UID - * @adev: ACPI device to match. - * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass NULL to not check _UID. - * - * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 - * will be treated as a match. If user wants to validate @uid2, it should be - * done before calling this function. - * - * Returns: - * - %true if matches or @uid2 is NULL. - * - %false otherwise. - */ -bool acpi_dev_hid_uid_match(struct acpi_device *adev, - const char *hid2, const char *uid2) -{ - const char *hid1 = acpi_device_hid(adev); - - if (strcmp(hid1, hid2)) - return false; - - if (!uid2) - return true; - - return acpi_dev_uid_match(adev, uid2); -} -EXPORT_SYMBOL(acpi_dev_hid_uid_match); - /** * acpi_dev_uid_to_integer - treat ACPI device _UID as integer * @adev: ACPI device to get _UID from diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 4bde0b417476..aae31552c574 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -764,9 +764,15 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev) adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } -bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2); int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); +static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2) +{ + const char *hid1 = acpi_device_hid(adev); + + return hid1 && hid2 && !strcmp(hid1, hid2); +} + static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2) { const char *uid1 = acpi_device_uid(adev); @@ -807,6 +813,22 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) /* Treat as an integer otherwise */ \ default: acpi_int_uid_match)(adev, uid2) +/** + * acpi_dev_hid_uid_match - Match device by supplied HID and UID + * @adev: ACPI device to match. + * @hid2: Hardware ID of the device. + * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * + * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 + * will be treated as a match. If user wants to validate @uid2, it should be + * done before calling this function. + * + * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + */ +#define acpi_dev_hid_uid_match(adev, hid2, uid2) \ + (acpi_dev_hid_match(adev, hid2) && \ + (!(uid2) || acpi_dev_uid_match(adev, uid2))) + void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2abe81f074de..75274585656c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -758,6 +758,7 @@ const char *acpi_get_subsystem_id(acpi_handle handle); /* Get rid of the -Wunused-variable for adev */ #define acpi_dev_uid_match(adev, uid2) (adev && false) +#define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) #include @@ -775,12 +776,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) struct acpi_device; -static inline bool -acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) -{ - return false; -} - static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; From 5ecdb287be126172ce7f4d61af5c6402b0fc9e61 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:15 +0530 Subject: [PATCH 0306/1562] ACPI: LPSS: use acpi_dev_uid_match() for matching _UID Now that we have _UID matching support for integer types, we can use acpi_dev_uid_match() for it. Signed-off-by: Raag Jadav Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpss.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 875de44961bf..79f4fc7d6871 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -167,13 +167,9 @@ static struct pwm_lookup byt_pwm_lookup[] = { static void byt_pwm_setup(struct lpss_private_data *pdata) { - u64 uid; - /* Only call pwm_add_table for the first PWM controller */ - if (acpi_dev_uid_to_integer(pdata->adev, &uid) || uid != 1) - return; - - pwm_add_table(byt_pwm_lookup, ARRAY_SIZE(byt_pwm_lookup)); + if (acpi_dev_uid_match(pdata->adev, 1)) + pwm_add_table(byt_pwm_lookup, ARRAY_SIZE(byt_pwm_lookup)); } #define LPSS_I2C_ENABLE 0x6c @@ -218,13 +214,9 @@ static struct pwm_lookup bsw_pwm_lookup[] = { static void bsw_pwm_setup(struct lpss_private_data *pdata) { - u64 uid; - /* Only call pwm_add_table for the first PWM controller */ - if (acpi_dev_uid_to_integer(pdata->adev, &uid) || uid != 1) - return; - - pwm_add_table(bsw_pwm_lookup, ARRAY_SIZE(bsw_pwm_lookup)); + if (acpi_dev_uid_match(pdata->adev, 1)) + pwm_add_table(bsw_pwm_lookup, ARRAY_SIZE(bsw_pwm_lookup)); } static const struct property_entry lpt_spi_properties[] = { From 9e93507da2cf57068ec5d3496185fe0236844ae9 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:16 +0530 Subject: [PATCH 0307/1562] efi: dev-path-parser: use acpi_dev_uid_match() for matching _UID Now that we have _UID matching support for integer types, we can use acpi_dev_uid_match() for it. Signed-off-by: Raag Jadav Reviewed-by: Ard Biesheuvel Signed-off-by: Rafael J. Wysocki --- drivers/firmware/efi/dev-path-parser.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/efi/dev-path-parser.c b/drivers/firmware/efi/dev-path-parser.c index f80d87c199c3..937be269fee8 100644 --- a/drivers/firmware/efi/dev-path-parser.c +++ b/drivers/firmware/efi/dev-path-parser.c @@ -18,8 +18,6 @@ static long __init parse_acpi_path(const struct efi_dev_path *node, struct acpi_device *adev; struct device *phys_dev; char hid[ACPI_ID_LEN]; - u64 uid; - int ret; if (node->header.length != 12) return -EINVAL; @@ -31,10 +29,9 @@ static long __init parse_acpi_path(const struct efi_dev_path *node, node->acpi.hid >> 16); for_each_acpi_dev_match(adev, hid, NULL, -1) { - ret = acpi_dev_uid_to_integer(adev, &uid); - if (ret == 0 && node->acpi.uid == uid) + if (acpi_dev_uid_match(adev, node->acpi.uid)) break; - if (ret == -ENODATA && node->acpi.uid == 0) + if (!acpi_device_uid(adev) && node->acpi.uid == 0) break; } if (!adev) From 38dd7b72ef8046a3009ef384b711d4509de3d427 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:17 +0530 Subject: [PATCH 0308/1562] perf: arm_cspmu: drop redundant acpi_dev_uid_to_integer() Now that we have _UID matching support for integer types, we can use acpi_dev_hid_uid_match() for it. Signed-off-by: Raag Jadav Acked-by: Will Deacon Signed-off-by: Rafael J. Wysocki --- drivers/perf/arm_cspmu/arm_cspmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 2cc35dded007..50b89b989ce7 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1108,7 +1108,6 @@ static int arm_cspmu_request_irq(struct arm_cspmu *cspmu) static inline int arm_cspmu_find_cpu_container(int cpu, u32 container_uid) { - u64 acpi_uid; struct device *cpu_dev; struct acpi_device *acpi_dev; @@ -1118,8 +1117,7 @@ static inline int arm_cspmu_find_cpu_container(int cpu, u32 container_uid) acpi_dev = ACPI_COMPANION(cpu_dev); while (acpi_dev) { - if (acpi_dev_hid_uid_match(acpi_dev, ACPI_PROCESSOR_CONTAINER_HID, NULL) && - !acpi_dev_uid_to_integer(acpi_dev, &acpi_uid) && acpi_uid == container_uid) + if (acpi_dev_hid_uid_match(acpi_dev, ACPI_PROCESSOR_CONTAINER_HID, container_uid)) return 0; acpi_dev = acpi_dev_parent(acpi_dev); From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Dec 2023 17:21:14 -0500 Subject: [PATCH 0309/1562] cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated() check Currently, the cpu_is_isolated() function checks only the statically isolated CPUs specified via the "isolcpus" and "nohz_full" kernel command line options. This function is used by vmstat and memcg to reduce interference with isolated CPUs by not doing stat flushing or scheduling works on those CPUs. Workloads running on isolated CPUs within isolated cpuset partitions should receive the same treatment to reduce unnecessary interference. This patch introduces a new cpuset_cpu_is_isolated() function to be called by cpu_is_isolated() so that the set of dynamically created cpuset isolated CPUs will be included in the check. Assuming that testing a bit in a cpumask is atomic, no synchronization primitive is currently used to synchronize access to the cpuset's isolated_cpus mask. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ++++++ include/linux/sched/isolation.h | 4 +++- kernel/cgroup/cpuset.c | 11 +++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d629094fac6e..875d12598bd2 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) return false; } +static inline bool cpuset_cpu_is_isolated(int cpu) +{ + return false; +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index fe1a46f30d24..2b461129d1fa 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -2,6 +2,7 @@ #define _LINUX_SCHED_ISOLATION_H #include +#include #include #include @@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type) static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || - !housekeeping_test_cpu(cpu, HK_TYPE_TICK); + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2a16df86c55c..dfbb16aca9f4 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1518,6 +1518,17 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated) WARN_ON_ONCE(ret < 0); } +/** + * cpuset_cpu_is_isolated - Check if the given CPU is isolated + * @cpu: the CPU number to be checked + * Return: true if CPU is used in an isolated partition, false otherwise + */ +bool cpuset_cpu_is_isolated(int cpu) +{ + return cpumask_test_cpu(cpu, isolated_cpus); +} +EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); + /* * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset From 4b3805daaacb2168665c6222f261e68accb120dc Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 20 Nov 2023 19:41:43 +0800 Subject: [PATCH 0310/1562] ACPI: tables: Correct and clean up the logic of acpi_parse_entries_array() The original intention of acpi_parse_entries_array() is to return the number of all matching entries on success. This number may be greater than the value of the max_entries parameter. When this happens, the function will output a warning message, indicating that `count - max_entries` matching entries remain unprocessed and have been ignored. However, commit 4ceacd02f5a1 ("ACPI / table: Always count matched and successfully parsed entries") changed this logic to return the number of entries successfully processed by the handler. In this case, when the max_entries parameter is not zero, the number of entries successfully processed can never be greater than the value of max_entries. In other words, the expression `count > max_entries` will always evaluate to false. This means that the logic in the final if statement will never be executed. Commit 99b0efd7c886 ("ACPI / tables: do not report the number of entries ignored by acpi_parse_entries()") mentioned this issue, but it tried to fix it by removing part of the warning message. This is meaningless because the pr_warn statement will never be executed in the first place. Commit 8726d4f44150 ("ACPI / tables: fix acpi_parse_entries_array() so it traverses all subtables") introduced an errs variable, which is intended to make acpi_parse_entries_array() always traverse all of the subtables, calling as many of the callbacks as possible. However, it seems that the commit does not achieve this goal. For example, when a handler returns an error, none of the handlers will be called again in the subsequent iterations. This result appears to be no different from before the change. This patch corrects and cleans up the logic of acpi_parse_entries_array(), making it return the number of all matching entries, rather than the number of entries successfully processed by handlers. Additionally, if an error occurs when executing a handler, the function will return -EINVAL immediately. This patch should not affect existing users of acpi_parse_entries_array(). Signed-off-by: Yuntao Wang Reviewed-by: Dave Jiang Signed-off-by: Rafael J. Wysocki --- lib/fw_table.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/lib/fw_table.c b/lib/fw_table.c index 294df54e33b6..c49a09ee3853 100644 --- a/lib/fw_table.c +++ b/lib/fw_table.c @@ -85,11 +85,6 @@ acpi_get_subtable_type(char *id) return ACPI_SUBTABLE_COMMON; } -static __init_or_acpilib bool has_handler(struct acpi_subtable_proc *proc) -{ - return proc->handler || proc->handler_arg; -} - static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc, union acpi_subtable_headers *hdr, unsigned long end) @@ -133,7 +128,6 @@ acpi_parse_entries_array(char *id, unsigned long table_size, unsigned long table_end, subtable_len, entry_len; struct acpi_subtable_entry entry; int count = 0; - int errs = 0; int i; table_end = (unsigned long)table_header + table_header->length; @@ -145,25 +139,19 @@ acpi_parse_entries_array(char *id, unsigned long table_size, ((unsigned long)table_header + table_size); subtable_len = acpi_get_subtable_header_length(&entry); - while (((unsigned long)entry.hdr) + subtable_len < table_end) { - if (max_entries && count >= max_entries) - break; - + while (((unsigned long)entry.hdr) + subtable_len < table_end) { for (i = 0; i < proc_num; i++) { if (acpi_get_entry_type(&entry) != proc[i].id) continue; - if (!has_handler(&proc[i]) || - (!errs && - call_handler(&proc[i], entry.hdr, table_end))) { - errs++; - continue; - } + + if (!max_entries || count < max_entries) + if (call_handler(&proc[i], entry.hdr, table_end)) + return -EINVAL; proc[i].count++; + count++; break; } - if (i != proc_num) - count++; /* * If entry->length is 0, break from this loop to avoid @@ -180,9 +168,9 @@ acpi_parse_entries_array(char *id, unsigned long table_size, } if (max_entries && count > max_entries) { - pr_warn("[%4.4s:0x%02x] found the maximum %i entries\n", - id, proc->id, count); + pr_warn("[%4.4s:0x%02x] ignored %i entries of %i found\n", + id, proc->id, count - max_entries, count); } - return errs ? -EINVAL : count; + return count; } From 310293a2b94197f3d75e65ab22672287a7938a00 Mon Sep 17 00:00:00 2001 From: Srikar Srimath Tirumala Date: Thu, 23 Nov 2023 17:44:33 +0530 Subject: [PATCH 0311/1562] ACPI: processor: reduce CPUFREQ thermal reduction pctg for Tegra241 Current implementation of processor_thermal performs software throttling in fixed steps of "20%" which can be too coarse for some platforms. We observed some performance gain after reducing the throttle percentage. Change the CPUFREQ thermal reduction percentage and maximum thermal steps to be configurable. Also, update the default values of both for Nvidia Tegra241 (Grace) SoC. The thermal reduction percentage is reduced to "5%" and accordingly the maximum number of thermal steps are increased as they are derived from the reduction percentage. Signed-off-by: Srikar Srimath Tirumala Co-developed-by: Sumit Gupta Signed-off-by: Sumit Gupta Acked-by: Sudeep Holla Acked-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/thermal_cpufreq.c | 20 ++++++++++++ drivers/acpi/internal.h | 9 +++++ drivers/acpi/processor_thermal.c | 49 +++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 drivers/acpi/arm64/thermal_cpufreq.c diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 143debc1ba4a..726944648c9b 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_ACPI_GTDT) += gtdt.o obj-$(CONFIG_ACPI_APMT) += apmt.o obj-$(CONFIG_ARM_AMBA) += amba.o obj-y += dma.o init.o +obj-y += thermal_cpufreq.o diff --git a/drivers/acpi/arm64/thermal_cpufreq.c b/drivers/acpi/arm64/thermal_cpufreq.c new file mode 100644 index 000000000000..d524f2cd6044 --- /dev/null +++ b/drivers/acpi/arm64/thermal_cpufreq.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include "../internal.h" + +#define SMCCC_SOC_ID_T241 0x036b0241 + +int acpi_arch_thermal_cpufreq_pctg(void) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + + /* + * Check JEP106 code for NVIDIA Tegra241 chip (036b:0241) and + * reduce the CPUFREQ Thermal reduction percentage to 5%. + */ + if (soc_id == SMCCC_SOC_ID_T241) + return 5; + + return 0; +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index db666f13c2ef..14b2df0b54b8 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -85,6 +85,15 @@ bool acpi_scan_is_offline(struct acpi_device *adev, bool uevent); acpi_status acpi_sysfs_table_handler(u32 event, void *table, void *context); void acpi_scan_table_notify(void); +#ifdef CONFIG_ARM64 +int acpi_arch_thermal_cpufreq_pctg(void); +#else +static inline int acpi_arch_thermal_cpufreq_pctg(void) +{ + return 0; +} +#endif + /* -------------------------------------------------------------------------- Device Node Initialization / Removal -------------------------------------------------------------------------- */ diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index b7c6287eccca..1219adb11ab9 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -17,6 +17,8 @@ #include #include +#include "internal.h" + #ifdef CONFIG_CPU_FREQ /* If a passive cooling situation is detected, primarily CPUfreq is used, as it @@ -26,12 +28,21 @@ */ #define CPUFREQ_THERMAL_MIN_STEP 0 -#define CPUFREQ_THERMAL_MAX_STEP 3 -static DEFINE_PER_CPU(unsigned int, cpufreq_thermal_reduction_pctg); +static int cpufreq_thermal_max_step __read_mostly = 3; -#define reduction_pctg(cpu) \ - per_cpu(cpufreq_thermal_reduction_pctg, phys_package_first_cpu(cpu)) +/* + * Minimum throttle percentage for processor_thermal cooling device. + * The processor_thermal driver uses it to calculate the percentage amount by + * which cpu frequency must be reduced for each cooling state. This is also used + * to calculate the maximum number of throttling steps or cooling states. + */ +static int cpufreq_thermal_reduction_pctg __read_mostly = 20; + +static DEFINE_PER_CPU(unsigned int, cpufreq_thermal_reduction_step); + +#define reduction_step(cpu) \ + per_cpu(cpufreq_thermal_reduction_step, phys_package_first_cpu(cpu)) /* * Emulate "per package data" using per cpu data (which should really be @@ -71,7 +82,7 @@ static int cpufreq_get_max_state(unsigned int cpu) if (!cpu_has_cpufreq(cpu)) return 0; - return CPUFREQ_THERMAL_MAX_STEP; + return cpufreq_thermal_max_step; } static int cpufreq_get_cur_state(unsigned int cpu) @@ -79,7 +90,7 @@ static int cpufreq_get_cur_state(unsigned int cpu) if (!cpu_has_cpufreq(cpu)) return 0; - return reduction_pctg(cpu); + return reduction_step(cpu); } static int cpufreq_set_cur_state(unsigned int cpu, int state) @@ -92,7 +103,7 @@ static int cpufreq_set_cur_state(unsigned int cpu, int state) if (!cpu_has_cpufreq(cpu)) return 0; - reduction_pctg(cpu) = state; + reduction_step(cpu) = state; /* * Update all the CPUs in the same package because they all @@ -113,7 +124,8 @@ static int cpufreq_set_cur_state(unsigned int cpu, int state) if (!policy) return -EINVAL; - max_freq = (policy->cpuinfo.max_freq * (100 - reduction_pctg(i) * 20)) / 100; + max_freq = (policy->cpuinfo.max_freq * + (100 - reduction_step(i) * cpufreq_thermal_reduction_pctg)) / 100; cpufreq_cpu_put(policy); @@ -126,10 +138,29 @@ static int cpufreq_set_cur_state(unsigned int cpu, int state) return 0; } +static void acpi_thermal_cpufreq_config(void) +{ + int cpufreq_pctg = acpi_arch_thermal_cpufreq_pctg(); + + if (!cpufreq_pctg) + return; + + cpufreq_thermal_reduction_pctg = cpufreq_pctg; + + /* + * Derive the MAX_STEP from minimum throttle percentage so that the reduction + * percentage doesn't end up becoming negative. Also, cap the MAX_STEP so that + * the CPU performance doesn't become 0. + */ + cpufreq_thermal_max_step = (100 / cpufreq_pctg) - 2; +} + void acpi_thermal_cpufreq_init(struct cpufreq_policy *policy) { unsigned int cpu; + acpi_thermal_cpufreq_config(); + for_each_cpu(cpu, policy->related_cpus) { struct acpi_processor *pr = per_cpu(processors, cpu); int ret; @@ -190,7 +221,7 @@ static int acpi_processor_max_state(struct acpi_processor *pr) /* * There exists four states according to - * cpufreq_thermal_reduction_pctg. 0, 1, 2, 3 + * cpufreq_thermal_reduction_step. 0, 1, 2, 3 */ max_state += cpufreq_get_max_state(pr->id); if (pr->flags.throttling) From 143176a46bdd3bfbe9ba2462bf94458e80d65ebf Mon Sep 17 00:00:00 2001 From: Yuluo Qiu Date: Sun, 26 Nov 2023 21:59:13 +0800 Subject: [PATCH 0312/1562] ACPI: video: Add quirk for the Colorful X15 AT 23 Laptop The Colorful X15 AT 23 ACPI video-bus device report spurious ACPI_VIDEO_NOTIFY_CYCLE events resulting in spurious KEY_SWITCHVIDEOMODE events being reported to userspace (and causing trouble there) when an external screen plugged in. Add a quirk setting the report_key_events mask to REPORT_BRIGHTNESS_KEY_EVENTS so that the ACPI_VIDEO_NOTIFY_CYCLE events will be ignored, while still reporting brightness up/down hotkey-presses to userspace normally. Signed-off-by: Yuluo Qiu Co-developed-by: Celeste Liu Signed-off-by: Celeste Liu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 14663ba4c736..4afdda9db019 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -500,6 +500,15 @@ static const struct dmi_system_id video_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 3350"), }, }, + { + .callback = video_set_report_key_events, + .driver_data = (void *)((uintptr_t)REPORT_BRIGHTNESS_KEY_EVENTS), + .ident = "COLORFUL X15 AT 23", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "COLORFUL"), + DMI_MATCH(DMI_PRODUCT_NAME, "X15 AT 23"), + }, + }, /* * Some machines change the brightness themselves when a brightness * hotkey gets pressed, despite us telling them not to. In this case From ba3f5058db437d919f8468db50483dd9028ff688 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 28 Nov 2023 05:52:10 +0300 Subject: [PATCH 0313/1562] PNP: ACPI: fix fortify warning When compiling with gcc version 14.0.0 20231126 (experimental) and CONFIG_FORTIFY_SOURCE=y, I've noticed the following: In file included from ./include/linux/string.h:295, from ./include/linux/bitmap.h:12, from ./include/linux/cpumask.h:12, from ./arch/x86/include/asm/paravirt.h:17, from ./arch/x86/include/asm/cpuid.h:62, from ./arch/x86/include/asm/processor.h:19, from ./arch/x86/include/asm/cpufeature.h:5, from ./arch/x86/include/asm/thread_info.h:53, from ./include/linux/thread_info.h:60, from ./arch/x86/include/asm/preempt.h:9, from ./include/linux/preempt.h:79, from ./include/linux/spinlock.h:56, from ./include/linux/mmzone.h:8, from ./include/linux/gfp.h:7, from ./include/linux/slab.h:16, from ./include/linux/resource_ext.h:11, from ./include/linux/acpi.h:13, from drivers/pnp/pnpacpi/rsparser.c:11: In function 'fortify_memcpy_chk', inlined from 'pnpacpi_parse_allocated_vendor' at drivers/pnp/pnpacpi/rsparser.c:158:3, inlined from 'pnpacpi_allocated_resource' at drivers/pnp/pnpacpi/rsparser.c:249:3: ./include/linux/fortify-string.h:588:25: warning: call to '__read_overflow2_field' declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Wattribute-warning] 588 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ According to the comments in include/linux/fortify-string.h, 'memcpy()', 'memmove()' and 'memset()' must not be used beyond individual struct members to ensure that the compiler can enforce protection against buffer overflows, and, IIUC, this also applies to partial copies from the particular member ('vendor->byte_data' in this case). So it should be better (and safer) to do both copies at once (and 'byte_data' of 'struct acpi_resource_vendor_typed' seems to be a good candidate for '__counted_by(byte_length)' as well). Signed-off-by: Dmitry Antipov Reviewed-by: Kees Cook Signed-off-by: Rafael J. Wysocki --- drivers/pnp/pnpacpi/rsparser.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 4f05f610391b..c02ce0834c2c 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -151,13 +151,13 @@ static int vendor_resource_matches(struct pnp_dev *dev, static void pnpacpi_parse_allocated_vendor(struct pnp_dev *dev, struct acpi_resource_vendor_typed *vendor) { - if (vendor_resource_matches(dev, vendor, &hp_ccsr_uuid, 16)) { - u64 start, length; + struct { u64 start, length; } range; - memcpy(&start, vendor->byte_data, sizeof(start)); - memcpy(&length, vendor->byte_data + 8, sizeof(length)); - - pnp_add_mem_resource(dev, start, start + length - 1, 0); + if (vendor_resource_matches(dev, vendor, &hp_ccsr_uuid, + sizeof(range))) { + memcpy(&range, vendor->byte_data, sizeof(range)); + pnp_add_mem_resource(dev, range.start, range.start + + range.length - 1, 0); } } From 392829ede37f36efa2e0f034631594786a9c8139 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Nov 2023 14:46:51 +0100 Subject: [PATCH 0314/1562] ACPI: OSL: Rework error handling in acpi_os_execute() Reduce the number of checks and goto labels related to error handling in acpi_os_execute() and drop the status local variable, which turns out to be redundant, from it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko --- drivers/acpi/osl.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index d56dda795118..603057f6c63e 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1060,7 +1060,6 @@ int __init acpi_debugger_init(void) acpi_status acpi_os_execute(acpi_execute_type type, acpi_osd_exec_callback function, void *context) { - acpi_status status = AE_OK; struct acpi_os_dpc *dpc; struct workqueue_struct *queue; int ret; @@ -1073,9 +1072,9 @@ acpi_status acpi_os_execute(acpi_execute_type type, ret = acpi_debugger_create_thread(function, context); if (ret) { pr_err("Kernel thread creation failed\n"); - status = AE_ERROR; + return AE_ERROR; } - goto out_thread; + return AE_OK; } /* @@ -1107,12 +1106,9 @@ acpi_status acpi_os_execute(acpi_execute_type type, INIT_WORK(&dpc->work, acpi_os_execute_deferred); } else { pr_err("Unsupported os_execute type %d.\n", type); - status = AE_ERROR; + goto err; } - if (ACPI_FAILURE(status)) - goto err_workqueue; - /* * On some machines, a software-initiated SMI causes corruption unless * the SMI runs on CPU 0. An SMI can be initiated by any AML, but @@ -1123,13 +1119,14 @@ acpi_status acpi_os_execute(acpi_execute_type type, ret = queue_work_on(0, queue, &dpc->work); if (!ret) { pr_err("Unable to queue work\n"); - status = AE_ERROR; + goto err; } -err_workqueue: - if (ACPI_FAILURE(status)) - kfree(dpc); -out_thread: - return status; + + return AE_OK; + +err: + kfree(dpc); + return AE_ERROR; } EXPORT_SYMBOL(acpi_os_execute); From 3f3a2599374ede5ac47ca89981ff8dd8f304d915 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Nov 2023 14:48:22 +0100 Subject: [PATCH 0315/1562] ACPI: OSL: Rearrange workqueue selection in acpi_os_execute() Replace the 3-branch if () statement used for selecting the target workqueue in acpi_os_execute() with a switch () one that is more suitable for this purpose and carry out the work item initialization before it to avoid code duplication. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko --- drivers/acpi/osl.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 603057f6c63e..5eacf807d552 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1092,19 +1092,21 @@ acpi_status acpi_os_execute(acpi_execute_type type, dpc->function = function; dpc->context = context; + INIT_WORK(&dpc->work, acpi_os_execute_deferred); /* * To prevent lockdep from complaining unnecessarily, make sure that * there is a different static lockdep key for each workqueue by using * INIT_WORK() for each of them separately. */ - if (type == OSL_NOTIFY_HANDLER) { + switch (type) { + case OSL_NOTIFY_HANDLER: queue = kacpi_notify_wq; - INIT_WORK(&dpc->work, acpi_os_execute_deferred); - } else if (type == OSL_GPE_HANDLER) { + break; + case OSL_GPE_HANDLER: queue = kacpid_wq; - INIT_WORK(&dpc->work, acpi_os_execute_deferred); - } else { + break; + default: pr_err("Unsupported os_execute type %d.\n", type); goto err; } From e2ffcda1629012a2c1a3706432bc45fdc899a584 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Nov 2023 14:50:54 +0100 Subject: [PATCH 0316/1562] ACPI: OSL: Allow Notify () handlers to run on all CPUs Notify () handlers, like GPE handlers, are only allowed to run on CPU0 now out of the concern that they might trigger an SMM trap leading to memory corruption. Namely, in some cases, SMM code might corrupt memory if not run on CPU0. However, Notify () handlers are registered by kernel code and they are not likely to evaluate AML that would trigger an SMM trap. In fact, many of them don't even evaluate any AML at all and even if they do, that AML may as well be evaluated in other code paths. In other words, they are not special from the AML evaluation perspective, so there is no real reason to treat them in any special way. Accordingly, allow Notify () handlers, unlike GPE handlers, to be executed by all CPUs in the system. Also adjust the allocation of the "notify" workqueue to allow multiple handlers to be executed at the same time, because they need not be serialized. Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko --- drivers/acpi/osl.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 5eacf807d552..a55cb578741a 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1061,7 +1061,6 @@ acpi_status acpi_os_execute(acpi_execute_type type, acpi_osd_exec_callback function, void *context) { struct acpi_os_dpc *dpc; - struct workqueue_struct *queue; int ret; ACPI_DEBUG_PRINT((ACPI_DB_EXEC, @@ -1101,24 +1100,22 @@ acpi_status acpi_os_execute(acpi_execute_type type, */ switch (type) { case OSL_NOTIFY_HANDLER: - queue = kacpi_notify_wq; + ret = queue_work(kacpi_notify_wq, &dpc->work); break; case OSL_GPE_HANDLER: - queue = kacpid_wq; + /* + * On some machines, a software-initiated SMI causes corruption + * unless the SMI runs on CPU 0. An SMI can be initiated by + * any AML, but typically it's done in GPE-related methods that + * are run via workqueues, so we can avoid the known corruption + * cases by always queueing on CPU 0. + */ + ret = queue_work_on(0, kacpid_wq, &dpc->work); break; default: pr_err("Unsupported os_execute type %d.\n", type); goto err; } - - /* - * On some machines, a software-initiated SMI causes corruption unless - * the SMI runs on CPU 0. An SMI can be initiated by any AML, but - * typically it's done in GPE-related methods that are run via - * workqueues, so we can avoid the known corruption cases by always - * queueing on CPU 0. - */ - ret = queue_work_on(0, queue, &dpc->work); if (!ret) { pr_err("Unable to queue work\n"); goto err; @@ -1668,7 +1665,7 @@ acpi_status __init acpi_os_initialize(void) acpi_status __init acpi_os_initialize1(void) { kacpid_wq = alloc_workqueue("kacpid", 0, 1); - kacpi_notify_wq = alloc_workqueue("kacpi_notify", 0, 1); + kacpi_notify_wq = alloc_workqueue("kacpi_notify", 0, 0); kacpi_hotplug_wq = alloc_ordered_workqueue("kacpi_hotplug", 0); BUG_ON(!kacpid_wq); BUG_ON(!kacpi_notify_wq); From 72d9b9747e78979510e9aafdd32eb99c7aa30dd1 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 4 Dec 2023 13:00:37 -0500 Subject: [PATCH 0317/1562] ACPI: extlog: fix NULL pointer dereference check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gcc plugin -fanalyzer [1] tries to detect various patterns of incorrect behaviour. The tool reports: drivers/acpi/acpi_extlog.c: In function ‘extlog_exit’: drivers/acpi/acpi_extlog.c:307:12: warning: check of ‘extlog_l1_addr’ for NULL after already dereferencing it [-Wanalyzer-deref-before-check] | | 306 | ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; | | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~ | | | | | (1) pointer ‘extlog_l1_addr’ is dereferenced here | 307 | if (extlog_l1_addr) | | ~ | | | | | (2) pointer ‘extlog_l1_addr’ is checked for NULL here but it was already dereferenced at (1) | Fix the NULL pointer dereference check in extlog_exit(). Link: https://gcc.gnu.org/onlinedocs/gcc-10.1.0/gcc/Static-Analyzer-Options.html # [1] Signed-off-by: Prarit Bhargava Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_extlog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index e120a96e1eae..193147769146 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -303,9 +303,10 @@ err: static void __exit extlog_exit(void) { mce_unregister_decode_chain(&extlog_mce_dec); - ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; - if (extlog_l1_addr) + if (extlog_l1_addr) { + ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; acpi_os_unmap_iomem(extlog_l1_addr, l1_size); + } if (elog_addr) acpi_os_unmap_iomem(elog_addr, elog_size); release_mem_region(elog_base, elog_size); From be0a3600aa1ebe9d23243c91d41ab1a2d5091a9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Dec 2023 13:24:08 +0100 Subject: [PATCH 0318/1562] thermal: sysfs: Rework the handling of trip point updates Both trip_point_temp_store() and trip_point_hyst_store() use thermal_zone_set_trip() to update a given trip point, but none of them actually needs to change more than one field in struct thermal_trip representing it. However, each of them effectively calls __thermal_zone_get_trip() twice in a row for the same trip index value, once directly and once via thermal_zone_set_trip(), which is not particularly efficient, and the way in which thermal_zone_set_trip() carries out the update is not particularly straightforward. Moreover, input processing need not be done under the thermal zone lock in any of these functions. Rework trip_point_temp_store() and trip_point_hyst_store() to address the above, move the part of thermal_zone_set_trip() that is still useful to a new function called thermal_zone_trip_updated() and drop the rest of it. While at it, make trip_point_hyst_store() reject negative hysteresis values. Signed-off-by: Rafael J. Wysocki Reviewed-by: Daniel Lezcano --- drivers/thermal/thermal_core.h | 2 ++ drivers/thermal/thermal_sysfs.c | 52 +++++++++++++++++++++++---------- drivers/thermal/thermal_trip.c | 45 ++++++---------------------- include/linux/thermal.h | 4 --- 4 files changed, 47 insertions(+), 56 deletions(-) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 0a3b3ec5120b..7dfe6c8deb8e 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -124,6 +124,8 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); int thermal_zone_trip_id(struct thermal_zone_device *tz, const struct thermal_trip *trip); +void thermal_zone_trip_updated(struct thermal_zone_device *tz, + const struct thermal_trip *trip); int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); /* sysfs I/F */ diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index eef40d4f3063..06202aa50060 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -120,8 +120,13 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); - struct thermal_trip trip; + struct thermal_trip *trip; int trip_id, ret; + int temp; + + ret = kstrtoint(buf, 10, &temp); + if (ret) + return -EINVAL; if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip_id) != 1) return -EINVAL; @@ -133,15 +138,20 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, goto unlock; } - ret = __thermal_zone_get_trip(tz, trip_id, &trip); - if (ret) - goto unlock; + trip = &tz->trips[trip_id]; - ret = kstrtoint(buf, 10, &trip.temperature); - if (ret) - goto unlock; + if (temp != trip->temperature) { + if (tz->ops->set_trip_temp) { + ret = tz->ops->set_trip_temp(tz, trip_id, temp); + if (ret) + goto unlock; + } + + trip->temperature = temp; + + thermal_zone_trip_updated(tz, trip); + } - ret = thermal_zone_set_trip(tz, trip_id, &trip); unlock: mutex_unlock(&tz->lock); @@ -179,8 +189,13 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct thermal_zone_device *tz = to_thermal_zone(dev); - struct thermal_trip trip; + struct thermal_trip *trip; int trip_id, ret; + int hyst; + + ret = kstrtoint(buf, 10, &hyst); + if (ret || hyst < 0) + return -EINVAL; if (sscanf(attr->attr.name, "trip_point_%d_hyst", &trip_id) != 1) return -EINVAL; @@ -192,15 +207,20 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr, goto unlock; } - ret = __thermal_zone_get_trip(tz, trip_id, &trip); - if (ret) - goto unlock; + trip = &tz->trips[trip_id]; - ret = kstrtoint(buf, 10, &trip.hysteresis); - if (ret) - goto unlock; + if (hyst != trip->hysteresis) { + if (tz->ops->set_trip_hyst) { + ret = tz->ops->set_trip_hyst(tz, trip_id, hyst); + if (ret) + goto unlock; + } + + trip->hysteresis = hyst; + + thermal_zone_trip_updated(tz, trip); + } - ret = thermal_zone_set_trip(tz, trip_id, &trip); unlock: mutex_unlock(&tz->lock); diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index e3dd583234dd..90861dec7eb0 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -147,42 +147,6 @@ int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, } EXPORT_SYMBOL_GPL(thermal_zone_get_trip); -int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id, - const struct thermal_trip *trip) -{ - struct thermal_trip t; - int ret; - - ret = __thermal_zone_get_trip(tz, trip_id, &t); - if (ret) - return ret; - - if (t.type != trip->type) - return -EINVAL; - - if (t.temperature != trip->temperature && tz->ops->set_trip_temp) { - ret = tz->ops->set_trip_temp(tz, trip_id, trip->temperature); - if (ret) - return ret; - } - - if (t.hysteresis != trip->hysteresis && tz->ops->set_trip_hyst) { - ret = tz->ops->set_trip_hyst(tz, trip_id, trip->hysteresis); - if (ret) - return ret; - } - - if (tz->trips && (t.temperature != trip->temperature || t.hysteresis != trip->hysteresis)) - tz->trips[trip_id] = *trip; - - thermal_notify_tz_trip_change(tz->id, trip_id, trip->type, - trip->temperature, trip->hysteresis); - - __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); - - return 0; -} - int thermal_zone_trip_id(struct thermal_zone_device *tz, const struct thermal_trip *trip) { @@ -192,3 +156,12 @@ int thermal_zone_trip_id(struct thermal_zone_device *tz, */ return trip - tz->trips; } + +void thermal_zone_trip_updated(struct thermal_zone_device *tz, + const struct thermal_trip *trip) +{ + thermal_notify_tz_trip_change(tz->id, thermal_zone_trip_id(tz, trip), + trip->type, trip->temperature, + trip->hysteresis); + __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); +} diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1f9ee869f9f9..0ea99f50d57c 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -282,10 +282,6 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); - -int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id, - const struct thermal_trip *trip); - int for_each_thermal_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); From 18dfb0e4c3c3cd5654182833bcf3dfdcef754e6e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Dec 2023 13:26:59 +0100 Subject: [PATCH 0319/1562] thermal: sysfs: Rework the reading of trip point attributes Rework the _show() callback functions for the trip point temperature, hysteresis and type attributes to avoid copying the values of struct thermal_trip fields that they do not use and make them carry out the same validation checks as the corresponding _store() callback functions. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Daniel Lezcano --- drivers/thermal/thermal_sysfs.c | 52 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index 06202aa50060..9e3d8fa01eea 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -83,25 +83,24 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - struct thermal_trip trip; - int trip_id, result; + enum thermal_trip_type type; + int trip_id; if (sscanf(attr->attr.name, "trip_point_%d_type", &trip_id) != 1) return -EINVAL; mutex_lock(&tz->lock); - if (device_is_registered(dev)) - result = __thermal_zone_get_trip(tz, trip_id, &trip); - else - result = -ENODEV; + if (!device_is_registered(dev)) { + mutex_unlock(&tz->lock); + return -ENODEV; + } + + type = tz->trips[trip_id].type; mutex_unlock(&tz->lock); - if (result) - return result; - - switch (trip.type) { + switch (type) { case THERMAL_TRIP_CRITICAL: return sprintf(buf, "critical\n"); case THERMAL_TRIP_HOT: @@ -163,25 +162,23 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - struct thermal_trip trip; - int trip_id, ret; + int trip_id, temp; if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip_id) != 1) return -EINVAL; mutex_lock(&tz->lock); - if (device_is_registered(dev)) - ret = __thermal_zone_get_trip(tz, trip_id, &trip); - else - ret = -ENODEV; + if (!device_is_registered(dev)) { + mutex_unlock(&tz->lock); + return -ENODEV; + } + + temp = tz->trips[trip_id].temperature; mutex_unlock(&tz->lock); - if (ret) - return ret; - - return sprintf(buf, "%d\n", trip.temperature); + return sprintf(buf, "%d\n", temp); } static ssize_t @@ -232,22 +229,23 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - struct thermal_trip trip; - int trip_id, ret; + int trip_id, hyst; if (sscanf(attr->attr.name, "trip_point_%d_hyst", &trip_id) != 1) return -EINVAL; mutex_lock(&tz->lock); - if (device_is_registered(dev)) - ret = __thermal_zone_get_trip(tz, trip_id, &trip); - else - ret = -ENODEV; + if (!device_is_registered(dev)) { + mutex_unlock(&tz->lock); + return -ENODEV; + } + + hyst = tz->trips[trip_id].hysteresis; mutex_unlock(&tz->lock); - return ret ? ret : sprintf(buf, "%d\n", trip.hysteresis); + return sprintf(buf, "%d\n", hyst); } static ssize_t From 07bcbdf020c9fd3c14bec51c50225a2a02707b94 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 23 Nov 2023 09:48:09 -0800 Subject: [PATCH 0320/1562] xfs: don't leak recovered attri intent items If recovery finds an xattr log intent item calling for the removal of an attribute and the file doesn't even have an attr fork, we know that the removal is trivially complete. However, we can't just exit the recovery function without doing something about the recovered log intent item -- it's still on the AIL, and not logging an attrd item means it stays there forever. This has likely not been seen in practice because few people use LARP and the runtime code won't log the attri for a no-attrfork removexattr operation. But let's fix this anyway. Also we shouldn't really be testing the attr fork presence until we've taken the ILOCK, though this doesn't matter much in recovery, which is single threaded. Fixes: fdaf1bb3cafc ("xfs: ATTR_REPLACE algorithm with LARP enabled needs rework") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..11e88a76a33c 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -329,6 +329,13 @@ xfs_xattri_finish_update( goto out; } + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + error = xfs_attr_set_iter(attr); if (!error && attr->xattri_dela_state != XFS_DAS_DONE) error = -EAGAIN; @@ -608,8 +615,6 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; default: From 03f7767c9f6120ac933378fdec3bfd78bf07bc11 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:23:23 -0800 Subject: [PATCH 0321/1562] xfs: use xfs_defer_pending objects to recover intent items One thing I never quite got around to doing is porting the log intent item recovery code to reconstruct the deferred pending work state. As a result, each intent item open codes xfs_defer_finish_one in its recovery method, because that's what the EFI code did before xfs_defer.c even existed. This is a gross thing to have left unfixed -- if an EFI cannot proceed due to busy extents, we end up creating separate new EFIs for each unfinished work item, which is a change in behavior from what runtime would have done. Worse yet, Long Li pointed out that there's a UAF in the recovery code. The ->commit_pass2 function adds the intent item to the AIL and drops the refcount. The one remaining refcount is now owned by the recovery mechanism (aka the log intent items in the AIL) with the intent of giving the refcount to the intent done item in the ->iop_recover function. However, if something fails later in recovery, xlog_recover_finish will walk the recovered intent items in the AIL and release them. If the CIL hasn't been pushed before that point (which is possible since we don't force the log until later) then the intent done release will try to free its associated intent, which has already been freed. This patch starts to address this mess by having the ->commit_pass2 functions recreate the xfs_defer_pending state. The next few patches will fix the recovery functions. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 105 ++++++++++++++++++++++-------- fs/xfs/libxfs/xfs_defer.h | 5 ++ fs/xfs/libxfs/xfs_log_recover.h | 3 + fs/xfs/xfs_attr_item.c | 10 +-- fs/xfs/xfs_bmap_item.c | 9 +-- fs/xfs/xfs_extfree_item.c | 9 +-- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_log_recover.c | 111 ++++++++++++++++---------------- fs/xfs/xfs_refcount_item.c | 9 +-- fs/xfs/xfs_rmap_item.c | 9 +-- 11 files changed, 157 insertions(+), 115 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index f71679ce23b9..363da37a8e7f 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,23 +245,53 @@ xfs_defer_create_intents( return ret; } -STATIC void +static inline void xfs_defer_pending_abort( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ @@ -271,7 +301,7 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); - xfs_defer_pending_abort(tp->t_mountp, dop_pending); + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -389,27 +419,13 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); - } + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); } /* @@ -665,6 +681,39 @@ xfs_defer_add( dfp->dfp_count++; } +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. + */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + enum xfs_defer_ops_type dfp_type, + struct list_head *r_dfops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_type = dfp_type; + dfp->dfp_intent = lip; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, r_dfops); +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across @@ -769,7 +818,7 @@ xfs_defer_ops_capture_abort( { unsigned short i; - xfs_defer_pending_abort(mp, &dfc->dfc_dfops); + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8788ad5f6a73..5dce938ba3d5 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -125,6 +125,11 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..271a4ce7375c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -153,4 +153,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, unsigned int dfp_type); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 11e88a76a33c..a32716b8cbbd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -772,14 +772,8 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + XFS_DEFER_OPS_TYPE_ATTR); xfs_attri_log_nameval_put(nv); return 0; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..6cbae4fdf43f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -681,12 +681,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + XFS_DEFER_OPS_TYPE_BMAP); return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..cf0ddeb70580 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -820,12 +820,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + XFS_DEFER_OPS_TYPE_FREE); return 0; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee206facf0dc..a1650fc81382 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a1e18b24971a..b9d2152a2bad 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + unsigned int dfp_type) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2533,29 +2550,22 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; + const struct xfs_item_ops *ops = lip->li_ops; + + ASSERT(xlog_item_is_intent(lip)); /* * We should never see a redo item with a LSN higher than @@ -2573,19 +2583,22 @@ xlog_recover_process_intents( * The recovery function can free the log item, so we must not * access lip after it returns. */ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, ops->iop_recover); break; } - } - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + /* + * XXX: @lip could have been freed, so detach the log item from + * the pending item before freeing the pending item. This does + * not fix the existing UAF bug that occurs if ->iop_recover + * fails after creating the intent done item. + */ + dfp->dfp_intent = NULL; + xfs_defer_cancel_recovery(log->l_mp, dfp); + } if (error) goto err; @@ -2606,27 +2619,15 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; + struct xfs_defer_pending *dfp, *n; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + xfs_defer_cancel_recovery(log->l_mp, dfp); } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..b88cb2e98227 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -696,12 +696,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + XFS_DEFER_OPS_TYPE_REFCOUNT); return 0; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..c30d4a4a14b2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -702,12 +702,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + XFS_DEFER_OPS_TYPE_RMAP); return 0; } From a050acdfa8003a44eae4558fddafc7afb1aef458 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:38:10 -0800 Subject: [PATCH 0322/1562] xfs: pass the xfs_defer_pending object to iop_recover Now that log intent item recovery recreates the xfs_defer_pending state, we should pass that into the ->iop_recover routines so that the intent item can finish the recreation work. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 3 ++- fs/xfs/xfs_bmap_item.c | 3 ++- fs/xfs/xfs_extfree_item.c | 3 ++- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_refcount_item.c | 3 ++- fs/xfs/xfs_rmap_item.c | 3 ++- fs/xfs/xfs_trans.h | 4 +++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index a32716b8cbbd..6119a7a480a0 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -545,9 +545,10 @@ xfs_attri_validate( */ STATIC int xfs_attri_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6cbae4fdf43f..3ef55de370b5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -486,11 +486,12 @@ xfs_bui_validate( */ STATIC int xfs_bui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cf0ddeb70580..a8245c5ffe49 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -657,10 +657,11 @@ xfs_efi_validate_ext( */ STATIC int xfs_efi_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b9d2152a2bad..ff768217f2c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2583,7 +2583,7 @@ xlog_recover_process_intents( * The recovery function can free the log item, so we must not * access lip after it returns. */ - error = ops->iop_recover(lip, &capture_list); + error = ops->iop_recover(dfp, &capture_list); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, ops->iop_recover); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index b88cb2e98227..3456201aa3e6 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -474,10 +474,11 @@ xfs_cui_validate_phys( */ STATIC int xfs_cui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c30d4a4a14b2..dfd5a3e4b1fb 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -504,10 +504,11 @@ xfs_rui_validate_map( */ STATIC int xfs_rui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..4e38357237c3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -66,6 +66,8 @@ struct xfs_log_item { { (1u << XFS_LI_DIRTY), "DIRTY" }, \ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } +struct xfs_defer_pending; + struct xfs_item_ops { unsigned flags; void (*iop_size)(struct xfs_log_item *, int *, int *); @@ -78,7 +80,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, + int (*iop_recover)(struct xfs_defer_pending *dfp, struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, From deb4cd8ba87f17b12c72b3827820d9c703e9fd95 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:47:10 -0800 Subject: [PATCH 0323/1562] xfs: transfer recovered intent item ownership in ->iop_recover Now that we pass the xfs_defer_pending object into the intent item recovery functions, we know exactly when ownership of the sole refcount passes from the recovery context to the intent done item. At that point, we need to null out dfp_intent so that the recovery mechanism won't release it. This should fix the UAF problem reported by Long Li. Note that we still want to recreate the full deferred work state. That will be addressed in the next patches. Fixes: 2e76f188fd90 ("xfs: cancel intents immediately if process_intents fails") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_recover.h | 2 ++ fs/xfs/xfs_attr_item.c | 1 + fs/xfs/xfs_bmap_item.c | 2 ++ fs/xfs/xfs_extfree_item.c | 2 ++ fs/xfs/xfs_log_recover.c | 19 ++++++++++++------- fs/xfs/xfs_refcount_item.c | 1 + fs/xfs/xfs_rmap_item.c | 2 ++ 7 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 271a4ce7375c..13583df9f239 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -155,5 +155,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); +void xlog_recover_transfer_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 6119a7a480a0..82775e9537df 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -632,6 +632,7 @@ xfs_attri_item_recover( args->trans = tp; done_item = xfs_trans_get_attrd(tp, attrip); + xlog_recover_transfer_intent(tp, dfp); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3ef55de370b5..b6d63b8bdad5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -524,6 +524,8 @@ xfs_bui_item_recover( goto err_rele; budp = xfs_trans_get_bud(tp, buip); + xlog_recover_transfer_intent(tp, dfp); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a8245c5ffe49..c9908fb33765 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -689,7 +689,9 @@ xfs_efi_item_recover( error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < efip->efi_format.efi_nextents; i++) { struct xfs_extent_free_item fake = { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ff768217f2c7..cc14cd1c2282 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2590,13 +2590,6 @@ xlog_recover_process_intents( break; } - /* - * XXX: @lip could have been freed, so detach the log item from - * the pending item before freeing the pending item. This does - * not fix the existing UAF bug that occurs if ->iop_recover - * fails after creating the intent done item. - */ - dfp->dfp_intent = NULL; xfs_defer_cancel_recovery(log->l_mp, dfp); } if (error) @@ -2630,6 +2623,18 @@ xlog_recover_cancel_intents( } } +/* + * Transfer ownership of the recovered log intent item to the recovery + * transaction. + */ +void +xlog_recover_transfer_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + dfp->dfp_intent = NULL; +} + /* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 3456201aa3e6..f1b259223802 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,6 +523,7 @@ xfs_cui_item_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { struct xfs_refcount_intent fake = { }; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dfd5a3e4b1fb..5e8a02d2b045 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -537,7 +537,9 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; + rudp = xfs_trans_get_rud(tp, ruip); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < ruip->rui_format.rui_nextents; i++) { struct xfs_rmap_intent fake = { }; From e70fb328d5277297ea2d9169a3a046de6412d777 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 11:13:03 -0800 Subject: [PATCH 0324/1562] xfs: recreate work items when recovering intent items Recreate work items for each xfs_defer_pending object when we are recovering intent items. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 3 +- fs/xfs/libxfs/xfs_defer.h | 9 +++ fs/xfs/xfs_attr_item.c | 90 ++++++++++++++++------------- fs/xfs/xfs_bmap_item.c | 55 +++++++++++------- fs/xfs/xfs_extfree_item.c | 49 +++++++++------- fs/xfs/xfs_refcount_item.c | 60 ++++++++++---------- fs/xfs/xfs_rmap_item.c | 112 ++++++++++++++++++++----------------- 7 files changed, 215 insertions(+), 163 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 363da37a8e7f..8fb523e4f669 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -676,9 +676,8 @@ xfs_defer_add( list_add_tail(&dfp->dfp_list, &tp->t_dfops); } - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; } /* diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 5dce938ba3d5..bef5823f61fb 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -130,6 +130,15 @@ void xfs_defer_start_recovery(struct xfs_log_item *lip, void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 82775e9537df..c4441eacf51c 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -539,42 +539,17 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. - */ -STATIC int -xfs_attri_item_recover( +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, struct xfs_defer_pending *dfp, - struct list_head *capture_list) + struct xfs_attri_log_format *attrp, + struct xfs_inode *ip, + struct xfs_attri_log_nameval *nv) { - struct xfs_log_item *lip = dfp->dfp_intent; - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. - */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; - - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); - if (error) - return error; attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -618,19 +593,58 @@ xfs_attri_item_recover( case XFS_ATTRI_OP_FLAGS_REMOVE: attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = xfs_attri_recover_work(mp, dfp, attrp, ip, nv); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); xlog_recover_transfer_intent(tp, dfp); @@ -661,8 +675,6 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index b6d63b8bdad5..b65999bf0ea3 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -480,6 +480,28 @@ xfs_bui_validate( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. @@ -489,7 +511,6 @@ xfs_bui_item_recover( struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); @@ -498,6 +519,7 @@ xfs_bui_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *fake; int iext_delta; int error = 0; @@ -508,9 +530,7 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + fake = xfs_bui_recover_work(mp, dfp, map); error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) @@ -529,36 +549,31 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (fake->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, fake->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + fake->bi_owner = ip; - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + xfs_bmap_update_get_group(mp, fake); + error = xfs_trans_log_finish_bmap_update(tp, budp, fake); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); + xfs_bmap_update_put_group(fake); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); + if (fake->bi_bmap.br_blockcount > 0) { + ASSERT(fake->bi_type == XFS_BMAP_UNMAP); + xfs_bmap_unmap_extent(tp, ip, &fake->bi_bmap); } /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c9908fb33765..41108a0b60c9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -651,6 +651,24 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. @@ -666,6 +684,7 @@ xfs_efi_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; + struct xfs_extent_free_item *fake; int i; int error = 0; bool requeue_only = false; @@ -683,6 +702,8 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -693,22 +714,11 @@ xfs_efi_item_recover( efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - + list_for_each_entry(fake, &dfp->dfp_work, xefi_list) { if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); + xfs_extent_free_get_group(mp, fake); + error = xfs_trans_free_extent(tp, efdp, fake); + xfs_extent_free_put_group(fake); } /* @@ -717,10 +727,10 @@ xfs_efi_item_recover( * run again later with a new transaction context. */ if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, + error = xfs_free_extent_later(tp, fake->xefi_startblock, + fake->xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); + fake->xefi_agresv); if (!error) { requeue_only = true; continue; @@ -729,7 +739,8 @@ xfs_efi_item_recover( if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); + &efip->efi_format, + sizeof(efip->efi_format)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f1b259223802..4ffc34e6f0a0 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -468,6 +468,23 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. @@ -484,7 +501,7 @@ xfs_cui_item_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; + struct xfs_refcount_intent *fake; bool requeue_only = false; int i; int error = 0; @@ -502,6 +519,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -525,35 +544,12 @@ xfs_cui_item_recover( cudp = xfs_trans_get_cud(tp, cuip); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - + list_for_each_entry(fake, &dfp->dfp_work, ri_list) { if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); + xfs_refcount_update_get_group(mp, fake); error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); + fake, &rcur); + xfs_refcount_update_put_group(fake); } if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -563,13 +559,13 @@ xfs_cui_item_recover( goto abort_error; /* Requeue what we didn't finish. */ - if (fake.ri_blockcount > 0) { + if (fake->ri_blockcount > 0) { struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, + .br_startblock = fake->ri_startblock, + .br_blockcount = fake->ri_blockcount, }; - switch (fake.ri_type) { + switch (fake->ri_type) { case XFS_REFCOUNT_INCREASE: xfs_refcount_increase_extent(tp, &irec); break; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5e8a02d2b045..9fb3ae4bfd59 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -498,6 +498,58 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. @@ -514,6 +566,7 @@ xfs_rui_item_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_rmap_intent *fake; int i; int error = 0; @@ -530,6 +583,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -541,60 +596,15 @@ xfs_rui_item_recover( rudp = xfs_trans_get_rud(tp, ruip); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, + list_for_each_entry(fake, &dfp->dfp_work, ri_list) { + xfs_rmap_update_get_group(mp, fake); + error = xfs_trans_log_finish_rmap_update(tp, rudp, fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); + &ruip->rui_format, + sizeof(ruip->rui_format)); + xfs_rmap_update_put_group(fake); if (error) goto abort_error; From a51489e140d302c7afae763eacf882a23513f7e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 09:57:42 -0800 Subject: [PATCH 0325/1562] xfs: dump the recovered xattri log item if corruption happens If xfs_attri_item_recover receives a corruption error when it tries to finish a recovered log intent item, it should dump the log item for debugging, just like all the other log intent items. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c4441eacf51c..c7c308d2f897 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -666,6 +666,10 @@ xfs_attri_item_recover( xfs_irele(ip); return 0; } + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; From 172538beba82e7b65d3d7c84cb558f287381cd7a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:27:10 -0800 Subject: [PATCH 0326/1562] xfs: don't set XFS_TRANS_HAS_INTENT_DONE when there's no ATTRD log item XFS_TRANS_HAS_INTENT_DONE is a flag to the CIL that we've added a log intent done item to the transaction. This enables an optimization wherein we avoid writing out log intent and log intent done items if they would have ended up in the same checkpoint. This reduces writes to the ondisk log and speeds up recovery as a result. However, callers can use the defer ops machinery to modify xattrs without using the log items. In this situation, there won't be an intent done item, so we do not need to set the flag. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index bd23c9594a0d..d19a385f9289 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -347,13 +347,15 @@ out: * 1.) releases the ATTRI and frees the ATTRD * 2.) shuts down the filesystem */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; + args->trans->t_flags |= XFS_TRANS_DIRTY; /* * attr intent/done items are null when logged attributes are disabled */ - if (attrdp) + if (attrdp) { + args->trans->t_flags |= XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + } return error; } From e5f1a5146ec35f3ed5d7f5ac7807a10c0062b6b8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 11:25:45 -0800 Subject: [PATCH 0327/1562] xfs: use xfs_defer_finish_one to finish recovered work items Get rid of the open-coded calls to xfs_defer_finish_one. This also means that the recovery transaction takes care of cleaning up the dfp, and we have solved (I hope) all the ownership issues in recovery. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 2 +- fs/xfs/libxfs/xfs_defer.h | 1 + fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_attr_item.c | 20 +---------- fs/xfs/xfs_bmap_item.c | 24 ++++--------- fs/xfs/xfs_extfree_item.c | 45 +++++------------------- fs/xfs/xfs_log_recover.c | 22 +++++++----- fs/xfs/xfs_refcount_item.c | 61 +++++---------------------------- fs/xfs/xfs_rmap_item.c | 29 +++++----------- 9 files changed, 49 insertions(+), 157 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 8fb523e4f669..eb262ea06122 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -484,7 +484,7 @@ xfs_defer_relog( * Log an intent-done item for the first pending intent, and finish the work * items. */ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index bef5823f61fb..c1a648e99174 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -41,6 +41,7 @@ void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 13583df9f239..52162a17fc5e 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -155,7 +155,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); -void xlog_recover_transfer_intent(struct xfs_trans *tp, +int xlog_recover_finish_intent(struct xfs_trans *tp, struct xfs_defer_pending *dfp); #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c7c308d2f897..eaf8a877c2cc 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -620,7 +620,6 @@ xfs_attri_item_recover( struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; int total; - struct xfs_attrd_log_item *done_item = NULL; /* * First check the validity of the attr described by the ATTRI. If any @@ -645,27 +644,10 @@ xfs_attri_item_recover( return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); - xlog_recover_transfer_intent(tp, dfp); - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. - */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &attrip->attri_format, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index b65999bf0ea3..89f2d9e89607 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -497,6 +497,7 @@ xfs_bui_recover_work( bi->bi_bmap.br_blockcount = map->me_len; bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_bmap_update_get_group(mp, bi); xfs_defer_add_item(dfp, &bi->bi_list); return bi; @@ -518,8 +519,7 @@ xfs_bui_item_recover( struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; - struct xfs_bmap_intent *fake; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -530,7 +530,7 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake = xfs_bui_recover_work(mp, dfp, map); + work = xfs_bui_recover_work(mp, dfp, map); error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) @@ -543,39 +543,29 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); - xlog_recover_transfer_intent(tp, dfp); - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake->bi_type == XFS_BMAP_MAP) + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake->bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake->bi_owner = ip; + work->bi_owner = ip; - xfs_bmap_update_get_group(mp, fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &buip->bui_format, sizeof(buip->bui_format)); - xfs_bmap_update_put_group(fake); if (error) goto err_cancel; - if (fake->bi_bmap.br_blockcount > 0) { - ASSERT(fake->bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake->bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 41108a0b60c9..6a434ade486c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -665,6 +665,7 @@ xfs_efi_recover_work( xefi->xefi_blockcount = extp->ext_len; xefi->xefi_agresv = XFS_AG_RESV_NONE; xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); xfs_defer_add_item(dfp, &xefi->xefi_list); } @@ -682,12 +683,9 @@ xfs_efi_item_recover( struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; - struct xfs_extent_free_item *fake; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -711,40 +709,13 @@ xfs_efi_item_recover( if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xlog_recover_transfer_intent(tp, dfp); - - list_for_each_entry(fake, &dfp->dfp_work, xefi_list) { - if (!requeue_only) { - xfs_extent_free_get_group(mp, fake); - error = xfs_trans_free_extent(tp, efdp, fake); - xfs_extent_free_put_group(fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. - */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake->xefi_startblock, - fake->xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake->xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &efip->efi_format, - sizeof(efip->efi_format)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index cc14cd1c2282..6fab490959d4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2581,7 +2581,8 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access lip after it returns. It must dispose of @dfp if it + * returns 0. */ error = ops->iop_recover(dfp, &capture_list); if (error) { @@ -2589,8 +2590,6 @@ xlog_recover_process_intents( ops->iop_recover); break; } - - xfs_defer_cancel_recovery(log->l_mp, dfp); } if (error) goto err; @@ -2624,15 +2623,22 @@ xlog_recover_cancel_intents( } /* - * Transfer ownership of the recovered log intent item to the recovery - * transaction. + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. */ -void -xlog_recover_transfer_intent( +int +xlog_recover_finish_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - dfp->dfp_intent = NULL; + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 4ffc34e6f0a0..f561ca73c784 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -481,6 +481,7 @@ xfs_cui_recover_work( ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); xfs_defer_add_item(dfp, &ri->ri_list); } @@ -497,12 +498,8 @@ xfs_cui_item_recover( struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_refcount_intent *fake; - bool requeue_only = false; int i; int error = 0; @@ -541,59 +538,17 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - xlog_recover_transfer_intent(tp, dfp); + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - list_for_each_entry(fake, &dfp->dfp_work, ri_list) { - if (!requeue_only) { - xfs_refcount_update_get_group(mp, fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - fake, &rcur); - xfs_refcount_update_put_group(fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. */ - if (fake->ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake->ri_startblock, - .br_blockcount = fake->ri_blockcount, - }; - - switch (fake->ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } - - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 9fb3ae4bfd59..23e736179894 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -546,6 +546,7 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); xfs_defer_add_item(dfp, &ri->ri_list); } @@ -562,11 +563,8 @@ xfs_rui_item_recover( struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_rmap_intent *fake; int i; int error = 0; @@ -593,28 +591,17 @@ xfs_rui_item_recover( if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - xlog_recover_transfer_intent(tp, dfp); + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - list_for_each_entry(fake, &dfp->dfp_work, ri_list) { - xfs_rmap_update_get_group(mp, fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - xfs_rmap_update_put_group(fake); - if (error) - goto abort_error; - - } - - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } From 3dd75c8db1c1675a26d3e228bab349c1fc065867 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:45:28 -0800 Subject: [PATCH 0328/1562] xfs: hoist intent done flag setting to ->finish_item callsite Each log intent item's ->finish_item call chain inevitably includes some code to set the dirty flag of the transaction. If there's an associated log intent done item, it also sets the item's dirty flag and the transaction's INTENT_DONE flag. This is repeated throughout the codebase. Reduce the LOC by moving all that to xfs_defer_finish_one. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 28 +++++++++++++++++++++++++++- fs/xfs/xfs_attr_item.c | 30 ++++-------------------------- fs/xfs/xfs_bmap_item.c | 16 +--------------- fs/xfs/xfs_extfree_item.c | 20 -------------------- fs/xfs/xfs_refcount_item.c | 16 +--------------- fs/xfs/xfs_rmap_item.c | 16 +--------------- 6 files changed, 34 insertions(+), 92 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index dd565e4e3daf..6214abedf394 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -191,6 +191,32 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; +/* Create a log intent done item for a log intent item. */ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; @@ -496,7 +522,7 @@ xfs_defer_finish_one( trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index d19a385f9289..e7acbb736bee 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -324,39 +324,17 @@ xfs_xattri_finish_update( struct xfs_da_args *args = attr->xattri_da_args; int error; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) + return -EIO; /* If an attr removal is trivially complete, we're done. */ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && - !xfs_inode_hasattr(args->dp)) { - error = 0; - goto out; - } + !xfs_inode_hasattr(args->dp)) + return 0; error = xfs_attr_set_iter(attr); if (!error && attr->xattri_dela_state != XFS_DAS_DONE) error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) { - args->trans->t_flags |= XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - } - return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index bd8f6fe22b40..913315cb5123 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -249,21 +249,7 @@ xfs_trans_log_finish_bmap_update( struct xfs_bud_log_item *budp, struct xfs_bmap_intent *bi) { - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; + return xfs_bmap_finish_one(tp, bi); } /* Sort bmap intents by inode. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 49e96ffd64e0..e8e02f816cbe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -396,16 +396,6 @@ xfs_trans_free_extent( xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - /* * If we need a new transaction to make progress, the caller will log a * new EFI with the current contents. It will also log an EFD to cancel @@ -601,16 +591,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 48f1a38b272e..2628b1e3969c 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -256,21 +256,7 @@ xfs_trans_log_finish_refcount_update( struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; + return xfs_refcount_finish_one(tp, ri, pcur); } /* Sort refcount intents by AG. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 23684bc2ab85..8f216a13a7f2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -297,21 +297,7 @@ xfs_trans_log_finish_rmap_update( struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; + return xfs_rmap_finish_one(tp, ri, pcur); } /* Sort rmap intents by AG. */ From db7ccc0bac2add5a41b66578e376b49328fc99d0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 13:39:25 -0800 Subject: [PATCH 0329/1562] xfs: move ->iop_recover to xfs_defer_op_type Finish off the series by moving the intent item recovery function pointer to the xfs_defer_op_type struct, since this is really a deferred work function now. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 17 +++++++++++++ fs/xfs/libxfs/xfs_defer.h | 4 +++ fs/xfs/libxfs/xfs_log_recover.h | 2 ++ fs/xfs/xfs_attr_item.c | 21 +++++++++------- fs/xfs/xfs_bmap_item.c | 39 ++++++++++++++++-------------- fs/xfs/xfs_extfree_item.c | 43 +++++++++++++++++---------------- fs/xfs/xfs_log_recover.c | 19 ++++++--------- fs/xfs/xfs_refcount_item.c | 24 +++++++++--------- fs/xfs/xfs_rmap_item.c | 24 +++++++++--------- fs/xfs/xfs_trans.h | 4 --- 10 files changed, 109 insertions(+), 88 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eb262ea06122..dd565e4e3daf 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -713,6 +713,23 @@ xfs_defer_cancel_recovery( xfs_defer_pending_cancel_work(mp, dfp); } +/* Replay the deferred work item created from a recovered log intent item. */ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + int error; + + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, error, + ops->recover_work); + return error; +} + /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index c1a648e99174..ef86a7f9b059 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -57,6 +57,8 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); unsigned int max_items; }; @@ -130,6 +132,8 @@ void xfs_defer_start_recovery(struct xfs_log_item *lip, enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); static inline void xfs_defer_add_item( diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 52162a17fc5e..c8e5d912895b 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -153,6 +153,8 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); int xlog_recover_finish_intent(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index eaf8a877c2cc..bd23c9594a0d 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -544,12 +544,17 @@ xfs_attri_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct xfs_attri_log_format *attrp, - struct xfs_inode *ip, + struct xfs_inode **ipp, struct xfs_attri_log_nameval *nv) { struct xfs_attr_intent *attr; struct xfs_da_args *args; int local; + int error; + + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); + if (error) + return ERR_PTR(error); attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -567,7 +572,7 @@ xfs_attri_recover_work( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -604,7 +609,7 @@ xfs_attri_recover_work( * delete the attr that it describes. */ STATIC int -xfs_attri_item_recover( +xfs_attr_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -630,11 +635,9 @@ xfs_attri_item_recover( !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); - if (error) - return error; - - attr = xfs_attri_recover_work(mp, dfp, attrp, ip, nv); + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); args = attr->xattri_da_args; xfs_init_attr_trans(args, &resv, &total); @@ -820,6 +823,7 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .create_done = xfs_attr_create_done, .finish_item = xfs_attr_finish_item, .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, }; /* @@ -856,7 +860,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, .iop_relog = xfs_attri_item_relog, }; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 89f2d9e89607..bd8f6fe22b40 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -437,15 +437,6 @@ xfs_bmap_update_cancel_item( kmem_cache_free(xfs_bmap_intent_cache, bi); } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( @@ -484,9 +475,15 @@ static inline struct xfs_bmap_intent * xfs_bui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, struct xfs_map_extent *map) { struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? @@ -497,6 +494,7 @@ xfs_bui_recover_work( bi->bi_bmap.br_blockcount = map->me_len; bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; xfs_bmap_update_get_group(mp, bi); xfs_defer_add_item(dfp, &bi->bi_list); @@ -508,7 +506,7 @@ xfs_bui_recover_work( * We need to update some inode's bmbt. */ STATIC int -xfs_bui_item_recover( +xfs_bmap_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -530,11 +528,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - work = xfs_bui_recover_work(mp, dfp, map); - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. */ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -557,8 +553,6 @@ xfs_bui_item_recover( if (error) goto err_cancel; - work->bi_owner = ip; - error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -587,6 +581,16 @@ err_rele: return error; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, +}; + STATIC bool xfs_bui_item_match( struct xfs_log_item *lip, @@ -627,7 +631,6 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, .iop_relog = xfs_bui_item_relog, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6a434ade486c..49e96ffd64e0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -567,15 +567,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -632,16 +623,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -675,7 +656,7 @@ xfs_efi_recover_work( * the log. We need to free the extents that it describes. */ STATIC int -xfs_efi_item_recover( +xfs_extent_free_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -724,6 +705,27 @@ abort_error: return error; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, +}; + STATIC bool xfs_efi_item_match( struct xfs_log_item *lip, @@ -766,7 +768,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, .iop_relog = xfs_efi_item_relog, }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6fab490959d4..c18692af2c65 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2562,17 +2562,14 @@ xlog_recover_process_intents( #endif list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { - struct xfs_log_item *lip = dfp->dfp_intent; - const struct xfs_item_ops *ops = lip->li_ops; - - ASSERT(xlog_item_is_intent(lip)); + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2581,15 +2578,13 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. It must dispose of @dfp if it - * returns 0. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. */ - error = ops->iop_recover(dfp, &capture_list); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } if (error) goto err; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f561ca73c784..48f1a38b272e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -433,16 +433,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( @@ -491,7 +481,7 @@ xfs_cui_recover_work( * We need to update the refcountbt. */ STATIC int -xfs_cui_item_recover( +xfs_refcount_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -553,6 +543,17 @@ abort_error: return error; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, +}; + STATIC bool xfs_cui_item_match( struct xfs_log_item *lip, @@ -593,7 +594,6 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, .iop_relog = xfs_cui_item_relog, }; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 23e736179894..23684bc2ab85 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -452,16 +452,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -556,7 +546,7 @@ xfs_rui_recover_work( * We need to update the rmapbt. */ STATIC int -xfs_rui_item_recover( +xfs_rmap_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -606,6 +596,17 @@ abort_error: return error; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, +}; + STATIC bool xfs_rui_item_match( struct xfs_log_item *lip, @@ -646,7 +647,6 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, .iop_relog = xfs_rui_item_relog, }; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 4e38357237c3..5fb018ad9fc0 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -66,8 +66,6 @@ struct xfs_log_item { { (1u << XFS_LI_DIRTY), "DIRTY" }, \ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } -struct xfs_defer_pending; - struct xfs_item_ops { unsigned flags; void (*iop_size)(struct xfs_log_item *, int *, int *); @@ -80,8 +78,6 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_defer_pending *dfp, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); From e6e5299fcbf0b18cad45cd58f99787549c790857 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:23:19 -0800 Subject: [PATCH 0330/1562] xfs: collapse the ->finish_item helpers Each log item's ->finish_item function sets up a small amount of state and calls another function to do the work. Collapse that other function into ->finish_item to reduce the call stack height. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 60 +++++++++---------------- fs/xfs/xfs_bmap_item.c | 18 +------- fs/xfs/xfs_extfree_item.c | 90 ++++++++++++++------------------------ fs/xfs/xfs_refcount_item.c | 18 +------- fs/xfs/xfs_rmap_item.c | 18 +------- 5 files changed, 58 insertions(+), 146 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e7acbb736bee..96438cd38633 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -310,34 +310,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) - return -EIO; - - /* If an attr removal is trivially complete, we're done. */ - if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && - !xfs_inode_hasattr(args->dp)) - return 0; - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -434,23 +406,33 @@ xfs_attr_finish_item( struct xfs_btree_cur **state) { struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_da_args *args; int error; attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 913315cb5123..79d19b5b0e5e 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -238,20 +238,6 @@ xfs_trans_get_bud( return budp; } -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - return xfs_bmap_finish_one(tp, bi); -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -378,7 +364,7 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Process a deferred bmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -391,7 +377,7 @@ xfs_bmap_update_finish_item( bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e8e02f816cbe..581a70acd1ac 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -364,59 +364,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. - */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. - */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -517,19 +464,48 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; int error; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. */ - if (error == -EAGAIN) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2628b1e3969c..7273f538db2e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -244,21 +244,6 @@ xfs_trans_get_cud( return cudp; } -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - return xfs_refcount_finish_one(tp, ri, pcur); -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -383,10 +368,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 8f216a13a7f2..d54fd925b746 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -285,21 +285,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - return xfs_rmap_finish_one(tp, ri, pcur); -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -409,8 +394,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); From f3fd7f6fce1cc9b8eb59705b27f823330207b7c9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:58:37 -0800 Subject: [PATCH 0331/1562] xfs: hoist ->create_intent boilerplate to its callsite Hoist the dirty flag setting code out of each ->create_intent implementation up to the callsite to reduce boilerplate further. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 2 ++ fs/xfs/xfs_attr_item.c | 3 --- fs/xfs/xfs_bmap_item.c | 3 --- fs/xfs/xfs_extfree_item.c | 3 --- fs/xfs/xfs_refcount_item.c | 3 --- fs/xfs/xfs_rmap_item.c | 3 --- 6 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 6214abedf394..2871c773a122 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,6 +240,8 @@ xfs_defer_create_intent( if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 96438cd38633..fc199256fc8e 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -319,9 +319,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 79d19b5b0e5e..24cf70154a54 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -286,9 +286,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 581a70acd1ac..d07cdc3eb809 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -390,9 +390,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7273f538db2e..f604b7e3b77e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -289,9 +289,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index d54fd925b746..05841548691d 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -311,9 +311,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from From bd3a88f6b71c7509566b44b7021581191cc11ae3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 11:44:56 -0800 Subject: [PATCH 0332/1562] xfs: use xfs_defer_create_done for the relogging operation Now that we have a helper to handle creating a log intent done item and updating all the necessary state flags, use it to reduce boilerplate in the ->iop_relog implementations. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 6 +++++- fs/xfs/xfs_attr_item.c | 6 +----- fs/xfs/xfs_bmap_item.c | 6 +----- fs/xfs/xfs_extfree_item.c | 6 ++---- fs/xfs/xfs_refcount_item.c | 6 +----- fs/xfs/xfs_rmap_item.c | 6 +----- fs/xfs/xfs_trans.h | 4 +++- 7 files changed, 14 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 2871c773a122..63b9960a96e1 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -500,7 +500,11 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + + xfs_defer_create_done(*tpp, dfp); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, + dfp->dfp_done, *tpp); + dfp->dfp_done = NULL; } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index fc199256fc8e..e9813fa64461 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -630,9 +630,9 @@ out_unlock: static struct xfs_log_item * xfs_attri_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -641,10 +641,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 24cf70154a54..ba385c06de5d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -572,9 +572,9 @@ xfs_bui_item_match( static struct xfs_log_item * xfs_bui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -582,10 +582,6 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d07cdc3eb809..807398479187 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -691,9 +691,10 @@ xfs_efi_item_match( static struct xfs_log_item * xfs_efi_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -701,11 +702,8 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f604b7e3b77e..142839a8e7b1 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -533,9 +533,9 @@ xfs_cui_item_match( static struct xfs_log_item * xfs_cui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -543,10 +543,6 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 05841548691d..e2730b3e0d96 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -586,9 +586,9 @@ xfs_rui_item_match( static struct xfs_log_item * xfs_rui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -596,10 +596,6 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 5fb018ad9fc0..25646e2b12f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -80,6 +80,7 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -248,9 +249,10 @@ extern struct kmem_cache *xfs_trans_cache; static inline struct xfs_log_item * xfs_trans_item_relog( struct xfs_log_item *lip, + struct xfs_log_item *done_lip, struct xfs_trans *tp) { - return lip->li_ops->iop_relog(lip, tp); + return lip->li_ops->iop_relog(lip, done_lip, tp); } struct xfs_dquot; From 3e0958be2156d90ef908a1a547b4e27a3ec38da9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 11:47:45 -0800 Subject: [PATCH 0333/1562] xfs: clean out XFS_LI_DIRTY setting boilerplate from ->iop_relog Hoist this dirty flag setting to the ->iop_relog callsite to reduce boilerplate. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 9 +++++++-- fs/xfs/xfs_attr_item.c | 1 - fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 63b9960a96e1..aa19ede91a57 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -474,6 +474,8 @@ xfs_defer_relog( ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { + struct xfs_log_item *lip; + /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep @@ -502,9 +504,12 @@ xfs_defer_relog( XFS_STATS_INC((*tpp)->t_mountp, defer_relog); xfs_defer_create_done(*tpp, dfp); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, - dfp->dfp_done, *tpp); + lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, + *tpp); + if (lip) + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e9813fa64461..5d86a4b8b457 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -655,7 +655,6 @@ xfs_attri_item_relog( new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); return &new_attrip->attri_item; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ba385c06de5d..ef72061d7cec 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -586,7 +586,7 @@ xfs_bui_item_relog( memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 807398479187..e2e86f2edb3c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -709,7 +709,7 @@ xfs_efi_item_relog( memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 142839a8e7b1..01d16e795068 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -547,7 +547,7 @@ xfs_cui_item_relog( memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index e2730b3e0d96..96b2dc832d62 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -600,7 +600,7 @@ xfs_rui_item_relog( memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } From b28852a5bd08654634e4e32eb072fba14c5fae26 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:06:08 -0800 Subject: [PATCH 0334/1562] xfs: hoist xfs_trans_add_item calls to defer ops functions Remove even more repeated boilerplate. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 7 ++++++- fs/xfs/xfs_attr_item.c | 4 ---- fs/xfs/xfs_bmap_item.c | 3 --- fs/xfs/xfs_extfree_item.c | 3 --- fs/xfs/xfs_refcount_item.c | 3 --- fs/xfs/xfs_rmap_item.c | 3 --- 6 files changed, 6 insertions(+), 17 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index aa19ede91a57..95f15a4b2126 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -213,6 +214,7 @@ xfs_defer_create_done( return; tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = lip; } @@ -241,6 +243,7 @@ xfs_defer_create_intent( return PTR_ERR(lip); tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; @@ -506,8 +509,10 @@ xfs_defer_relog( xfs_defer_create_done(*tpp, dfp); lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, *tpp); - if (lip) + if (lip) { + xfs_trans_add_item(*tpp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); + } dfp->dfp_done = NULL; dfp->dfp_intent = lip; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5d86a4b8b457..c815811d937a 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -375,7 +375,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -654,8 +653,6 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - return &new_attrip->attri_item; } @@ -753,7 +750,6 @@ xfs_trans_get_attrd(struct xfs_trans *tp, attrdp->attrd_attrip = attrip; attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - xfs_trans_add_item(tp, &attrdp->attrd_item); return attrdp; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ef72061d7cec..0be7a1224a81 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -234,7 +234,6 @@ xfs_trans_get_bud( budp->bud_buip = buip; budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - xfs_trans_add_item(tp, &budp->bud_item); return budp; } @@ -315,7 +314,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -585,7 +583,6 @@ xfs_bui_item_relog( buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); return &buip->bui_item; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e2e86f2edb3c..44bbf620e0cf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -332,7 +332,6 @@ xfs_trans_get_efd( efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - xfs_trans_add_item(tp, &efdp->efd_item); return efdp; } @@ -415,7 +414,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -708,7 +706,6 @@ xfs_efi_item_relog( efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); return &efip->efi_item; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 01d16e795068..a66bb6aa2e5d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -240,7 +240,6 @@ xfs_trans_get_cud( cudp->cud_cuip = cuip; cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - xfs_trans_add_item(tp, &cudp->cud_item); return cudp; } @@ -315,7 +314,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -546,7 +544,6 @@ xfs_cui_item_relog( cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); return &cuip->cui_item; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 96b2dc832d62..d668eb4d099e 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -238,7 +238,6 @@ xfs_trans_get_rud( rudp->rud_ruip = ruip; rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - xfs_trans_add_item(tp, &rudp->rud_item); return rudp; } @@ -340,7 +339,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -599,7 +597,6 @@ xfs_rui_item_relog( ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); return &ruip->rui_item; } From 8a9aa763e17c5490d3526cbf4c9484d76ecbbe39 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:17:37 -0800 Subject: [PATCH 0335/1562] xfs: collapse the ->create_done functions Move the meat of the ->create_done function helpers into ->create_done to reduce the amount of boilerplate. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 37 ++++++++++---------------- fs/xfs/xfs_bmap_item.c | 29 ++++++++------------- fs/xfs/xfs_extfree_item.c | 53 ++++++++++++++------------------------ fs/xfs/xfs_refcount_item.c | 27 +++++++------------ fs/xfs/xfs_rmap_item.c | 27 +++++++------------ 5 files changed, 64 insertions(+), 109 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c815811d937a..27553388da99 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -732,16 +730,20 @@ xlog_recover_attri_commit_pass2( return 0; } -/* - * This routine is called to allocate an "attr free done" log item. - */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) { - struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; - ASSERT(tp != NULL); + if (!intent) + return NULL; + + attrip = ATTRI_ITEM(intent); attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); @@ -750,20 +752,7 @@ xfs_trans_get_attrd(struct xfs_trans *tp, attrdp->attrd_attrip = attrip; attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; + return &attrdp->attrd_item; } const struct xfs_defer_op_type xfs_attr_defer_type = { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 0be7a1224a81..f3421e615e1c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,22 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - return budp; -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -321,14 +305,23 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. */ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44bbf620e0cf..518569c64e9c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -303,38 +303,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_intent = xfs_efd_item_intent, }; -/* - * Allocate an "extent free done" log item that will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. - */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - return efdp; -} - /* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. @@ -428,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. */ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index a66bb6aa2e5d..d218a9ed4d82 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,22 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - return cudp; -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -328,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index d668eb4d099e..96e0c2b0d059 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -225,22 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -353,7 +337,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. */ From a6a38f309afc4a7ede01242b603f36c433997780 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Dec 2023 09:17:40 -0800 Subject: [PATCH 0336/1562] xfs: make rextslog computation consistent with mkfs There's a weird discrepancy in xfsprogs dating back to the creation of the Linux port -- if there are zero rt extents, mkfs will set sb_rextents and sb_rextslog both to zero: sbp->sb_rextslog = (uint8_t)(rtextents ? libxfs_highbit32((unsigned int)rtextents) : 0); However, that's not the check that xfs_repair uses for nonzero rtblocks: if (sb->sb_rextslog != libxfs_highbit32((unsigned int)sb->sb_rextents)) The difference here is that xfs_highbit32 returns -1 if its argument is zero. Unfortunately, this means that in the weird corner case of a realtime volume shorter than 1 rt extent, xfs_repair will immediately flag a freshly formatted filesystem as corrupt. Because mkfs has been writing ondisk artifacts like this for decades, we have to accept that as "correct". TBH, zero rextslog for zero rtextents makes more sense to me anyway. Regrettably, the superblock verifier checks created in commit copied xfs_repair even though mkfs has been writing out such filesystems for ages. Fix the superblock verifier to accept what mkfs spits out; the userspace version of this patch will have to fix xfs_repair as well. Note that the new helper leaves the zeroday bug where the upper 32 bits of sb_rextents is ripped off and fed to highbit32. This leads to a seriously undersized rt summary file, which immediately breaks mkfs: $ hugedisk.sh foo /dev/sdc $(( 0x100000080 * 4096))B $ /sbin/mkfs.xfs -f /dev/sda -m rmapbt=0,reflink=0 -r rtdev=/dev/mapper/foo meta-data=/dev/sda isize=512 agcount=4, agsize=1298176 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=0 = reflink=0 bigtime=1 inobtcount=1 nrext64=1 data = bsize=4096 blocks=5192704, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1 log =internal log bsize=4096 blocks=16384, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =/dev/mapper/foo extsz=4096 blocks=4294967424, rtextents=4294967424 Discarding blocks...Done. mkfs.xfs: Error initializing the realtime space [117 - Structure needs cleaning] The next patch will drop support for rt volumes with fewer than 1 or more than 2^32-1 rt extents, since they've clearly been broken forever. Fixes: f8e566c0f5e1f ("xfs: validate the realtime geometry in xfs_validate_sb_common") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 12 ++++++++++++ fs/xfs/libxfs/xfs_rtbitmap.h | 3 +++ fs/xfs/libxfs/xfs_sb.c | 3 ++- fs/xfs/xfs_rtalloc.c | 4 ++-- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index c269d704314d..1c9fed76a356 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1130,6 +1130,18 @@ xfs_rtbitmap_blockcount( return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); } +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The use of highbit32 on a 64-bit quantity is a historic artifact that + * prohibits correct use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + return rtextents ? xfs_highbit32(rtextents) : 0; +} + /* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index c0637057d69c..6e5bae324cc3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -351,6 +351,8 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -369,6 +371,7 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_compute_rextslog(rtx) (0) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1f74d0cd1618..df12bf82ed18 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -509,7 +510,7 @@ xfs_validate_sb_common( NBBY * sbp->sb_blocksize); if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 88c48de5c9c8..7c5a50163d2d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -964,7 +964,7 @@ xfs_growfs_rt( nrextents = nrblocks; do_div(nrextents, in->extsize); nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_highbit32(nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); @@ -1031,7 +1031,7 @@ xfs_growfs_rt( nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); From cf8f0e6c1429be7652869059ea44696b72d5b726 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 3 Dec 2023 09:19:44 -0800 Subject: [PATCH 0337/1562] xfs: fix 32-bit truncation in xfs_compute_rextslog It's quite reasonable that some customer somewhere will want to configure a realtime volume with more than 2^32 extents. If they try to do this, the highbit32() call will truncate the upper bits of the xfs_rtbxlen_t and produce the wrong value for rextslog. This in turn causes the rsumlevels to be wrong, which results in a realtime summary file that is the wrong length. Fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1c9fed76a356..30a2844f62e3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1132,14 +1132,16 @@ xfs_rtbitmap_blockcount( /* * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The use of highbit32 on a 64-bit quantity is a historic artifact that - * prohibits correct use of rt volumes with more than 2^32 extents. + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. */ uint8_t xfs_compute_rextslog( xfs_rtbxlen_t rtextents) { - return rtextents ? xfs_highbit32(rtextents) : 0; + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); } /* From 94da54d582e6866a9613514b59f326456e88446d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 4 Dec 2023 12:13:26 -0800 Subject: [PATCH 0338/1562] xfs: document what LARP means Christoph requested a blurb somewhere explaining exactly what LARP means. I don't know of a good place other than the source code (debug knobs aren't covered in Documentation/), so here it is. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..871f16a4a5d8 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -229,6 +229,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. + */ static ssize_t larp_store( struct kobject *kobject, From a49c708f9a445457f6a5905732081871234f61c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:31:30 -0800 Subject: [PATCH 0339/1562] xfs: move ->iop_relog to struct xfs_defer_op_type The only log items that need relogging are the ones created for deferred work operations, and the only part of the code base that relogs log items is the deferred work machinery. Move the function pointers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 31 +++++++++++------- fs/xfs/libxfs/xfs_defer.h | 3 ++ fs/xfs/xfs_attr_item.c | 8 ++--- fs/xfs/xfs_bmap_item.c | 44 ++++++++++++------------- fs/xfs/xfs_extfree_item.c | 67 +++++++++++++++++++------------------- fs/xfs/xfs_refcount_item.c | 46 +++++++++++++------------- fs/xfs/xfs_rmap_item.c | 46 +++++++++++++------------- fs/xfs/xfs_trans.h | 12 ------- 8 files changed, 129 insertions(+), 128 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 95f15a4b2126..54a6be06e6cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -459,6 +459,25 @@ xfs_defer_cancel_list( xfs_defer_pending_cancel_work(mp, dfp); } +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + xfs_defer_create_done(tp, dfp); + + lip = ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; +} + /* * Prevent a log intent item from pinning the tail of the log by logging a * done item to release the intent item; and then log a new intent item. @@ -477,8 +496,6 @@ xfs_defer_relog( ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { - struct xfs_log_item *lip; - /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep @@ -506,15 +523,7 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - xfs_defer_create_done(*tpp, dfp); - lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, - *tpp); - if (lip) { - xfs_trans_add_item(*tpp, lip); - set_bit(XFS_LI_DIRTY, &lip->li_flags); - } - dfp->dfp_done = NULL; - dfp->dfp_intent = lip; + xfs_defer_relog_intent(*tpp, dfp); } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index ef86a7f9b059..78d6dcd1af2c 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,9 @@ struct xfs_defer_op_type { void (*cancel_item)(struct list_head *item); int (*recover_work)(struct xfs_defer_pending *dfp, struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); unsigned int max_items; }; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 27553388da99..988d395a48ad 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -625,10 +625,10 @@ out_unlock: /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; @@ -763,6 +763,7 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .finish_item = xfs_attr_finish_item, .cancel_item = xfs_attr_cancel_item, .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, }; /* @@ -800,7 +801,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f3421e615e1c..bc48d733634a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -541,30 +541,12 @@ err_rele: return error; } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, - .recover_work = xfs_bmap_recover_work, -}; - -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_bui_log_item *buip; struct xfs_map_extent *map; @@ -580,6 +562,25 @@ xfs_bui_item_relog( return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, @@ -587,7 +588,6 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 518569c64e9c..3ca23ab8d92a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -643,41 +643,12 @@ abort_error: return error; } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, - .recover_work = xfs_extent_free_recover_work, -}; - -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, - .recover_work = xfs_extent_free_recover_work, -}; - -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; @@ -697,6 +668,37 @@ xfs_efi_item_relog( return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, @@ -704,7 +706,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d218a9ed4d82..9974be81cb2b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -501,31 +501,12 @@ abort_error: return error; } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, - .recover_work = xfs_refcount_recover_work, -}; - -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; @@ -541,6 +522,26 @@ xfs_cui_item_relog( return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, @@ -548,7 +549,6 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 96e0c2b0d059..488c4a2a80a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -554,31 +554,12 @@ abort_error: return error; } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, - .recover_work = xfs_rmap_recover_work, -}; - -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; @@ -594,6 +575,26 @@ xfs_rui_item_relog( return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, @@ -601,7 +602,6 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 25646e2b12f4..2cb1e143fc49 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -79,9 +79,6 @@ struct xfs_item_ops { uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -246,15 +243,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_log_item *done_lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, done_lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, From e14293803f4e84eb23a417b462b56251033b5a66 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Dec 2023 09:24:18 -0800 Subject: [PATCH 0340/1562] xfs: don't allow overly small or large realtime volumes Don't allow realtime volumes that are less than one rt extent long. This has been broken across 4 LTS kernels with nobody noticing, so let's just disable it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.h | 13 +++++++++++++ fs/xfs/libxfs/xfs_sb.c | 3 ++- fs/xfs/xfs_rtalloc.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 6e5bae324cc3..1c84b52de3d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -353,6 +353,18 @@ int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -372,6 +384,7 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) # define xfs_compute_rextslog(rtx) (0) +# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index df12bf82ed18..4a9e8588f4c9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -509,7 +509,8 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 7c5a50163d2d..8feb58c6241c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -963,6 +963,8 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; From 9c07bca793b4ff9f0b7871e2a928a1b28b8fa4e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 4 Dec 2023 12:21:06 -0800 Subject: [PATCH 0341/1562] xfs: elide ->create_done calls for unlogged deferred work Extended attribute updates use the deferred work machinery to manage state across a chain of smaller transactions. All previous deferred work users have employed log intent items and log done items to manage restarting of interrupted operations, which means that ->create_intent sets dfp_intent to a log intent item and ->create_done uses that item to create a log intent done item. However, xattrs have used the INCOMPLETE flag to deal with the lack of recovery support for an interrupted transaction chain. Log items are optional if the xattr update caller didn't set XFS_DA_OP_LOGGED to require a restartable sequence. In other words, ->create_intent can return NULL to say that there's no log intent item. If that's the case, no log intent done item should be created. Clean up xfs_defer_create_done not to do this, so that the ->create_done functions don't have to check for non-null dfp_intent themselves. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 4 ++++ fs/xfs/xfs_attr_item.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 54a6be06e6cd..06e890b44c52 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -201,6 +201,10 @@ xfs_defer_create_done( const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 988d395a48ad..39f2c5a46179 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -740,9 +740,6 @@ xfs_attr_create_done( struct xfs_attri_log_item *attrip; struct xfs_attrd_log_item *attrdp; - if (!intent) - return NULL; - attrip = ATTRI_ITEM(intent); attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); From 3f113c2739b1b068854c7ffed635c2bd790d1492 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:54 -0800 Subject: [PATCH 0342/1562] xfs: make xchk_iget safer in the presence of corrupt inode btrees When scrub is trying to iget an inode, ensure that it won't end up deadlocked on a cycle in the inode btree by using an empty transaction to store all the buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 6 ++++-- fs/xfs/scrub/common.h | 25 +++++++++++++++++++++++++ fs/xfs/scrub/inode.c | 4 ++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..23944fcc1a6c 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -733,6 +733,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -882,8 +884,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..c83cf9e5b55f 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -151,12 +151,37 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. + */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); +/* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + /* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 889f556bc98f..b7a93380a1ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -95,8 +95,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) From 6b126139401a2284402d7c38fe3168d5a26da41d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:55 -0800 Subject: [PATCH 0343/1562] xfs: don't append work items to logged xfs_defer_pending objects When someone tries to add a deferred work item to xfs_defer_add, it will try to attach the work item to the most recently added xfs_defer_pending object attached to the transaction. However, it doesn't check if the pending object has a log intent item attached to it. This is incorrect behavior because we cannot add more work to an object that has already been committed to the ondisk log. Therefore, change the behavior not to append to pending items with a non null dfp_intent. In practice this has not been an issue because the only way xfs_defer_add gets called after log intent items have been committed is from the defer ops ->finish_item functions themselves, and the @dop_pending isolation in xfs_defer_finish_noroll protects the pending items that have already been logged. However, the next patch will add the ability to pause a deferred extent free object during online btree rebuilding, and any new extfree work items need to have their own pending event. While we're at it, hoist the predicate to its own static inline function for readability. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 61 ++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 06e890b44c52..7f9b1c5f2588 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -695,6 +695,51 @@ xfs_defer_cancel( xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + enum xfs_defer_ops_type type, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_type != type) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + /* Add an item for later deferred processing. */ void xfs_defer_add( @@ -708,19 +753,9 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { + dfp = xfs_defer_find_last(tp, type, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) { + /* Create a new pending item at the end of the intake list. */ dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); dfp->dfp_type = type; From 4dffb2cbb4839fd6f9bbac0b3fd06cc9015cbb9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:56 -0800 Subject: [PATCH 0344/1562] xfs: allow pausing of pending deferred work items Traditionally, all pending deferred work attached to a transaction is finished when one of the xfs_defer_finish* functions is called. However, online repair wants to be able to allocate space for a new data structure, format a new metadata structure into the allocated space, and commit that into the filesystem. As a hedge against system crashes during repairs, we also want to log some EFI items for the allocated space speculatively, and cancel them if we elect to commit the new data structure. Therefore, introduce the idea of pausing a pending deferred work item. Log intent items are still created for paused items and relogged as necessary. However, paused items are pushed onto a side list before we start calling ->finish_item, and the whole list is reattach to the transaction afterwards. New work items are never attached to paused pending items. Modify xfs_defer_cancel to clean up pending deferred work items holding a log intent item but not a log intent done item, since that is now possible. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 97 ++++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_defer.h | 17 ++++++- fs/xfs/xfs_trace.h | 13 +++++- 3 files changed, 111 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 7f9b1c5f2588..c4480dec29ec 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -487,7 +487,7 @@ xfs_defer_relog_intent( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -529,10 +529,6 @@ xfs_defer_relog( xfs_defer_relog_intent(*tpp, dfp); } - - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; } /* @@ -588,6 +584,24 @@ out: return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -603,6 +617,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -621,6 +636,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -633,22 +650,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -661,6 +689,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -680,7 +711,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. */ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -692,6 +726,7 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } @@ -733,6 +768,10 @@ xfs_defer_can_append( if (dfp->dfp_intent) return false; + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) return false; @@ -741,7 +780,7 @@ xfs_defer_can_append( } /* Add an item for later deferred processing. */ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, enum xfs_defer_ops_type type, @@ -768,6 +807,7 @@ xfs_defer_add( xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); + return dfp; } /* @@ -1093,3 +1133,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. + */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 78d6dcd1af2c..b0284154f4e0 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -34,11 +34,24 @@ struct xfs_defer_pending { struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ + unsigned int dfp_flags; enum xfs_defer_ops_type dfp_type; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. + */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, + enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..514095b6ba2b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2551,6 +2551,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), @@ -2558,13 +2559,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->type = dfp->dfp_type; __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2678,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2692,6 +2698,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( @@ -2700,13 +2707,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) From 4c88fef3af4a51c2cdba6a28237e98da4873e8dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:57 -0800 Subject: [PATCH 0345/1562] xfs: remove __xfs_free_extent_later xfs_free_extent_later is a trivial helper, so remove it to reduce the amount of thinking required to understand the deferred freeing interface. This will make it easier to introduce automatic reaping of speculative allocations in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_alloc.h | 14 +------------- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 5 +++-- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 6 +++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/scrub/reap.c | 2 +- fs/xfs/xfs_reflink.c | 2 +- 11 files changed, 16 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..f62ff125a50a 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 100ab5931b31..c35224ad9428 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2523,7 +2523,7 @@ xfs_defer_agfl_block( * The list is maintained sorted (by block number). */ int -__xfs_free_extent_later( +xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..6b95d1d8a853 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -256,18 +256,6 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} - - extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be62acffad6c..68be1dd4f0f2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -5218,7 +5218,7 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..8360256cff16 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -272,7 +272,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..d61d03e5b853 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1854,7 +1854,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1900,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..42a5e1f227a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..3702b4a07110 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1153,7 +1153,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1215,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1985,7 +1985,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..3fa795e2488d 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..78c9f2085db4 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -410,7 +410,7 @@ xreap_agextent_iter( * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..d5ca8bcae65b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; From e3042be36c343207b7af249a09f50b4e37e9fda4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:57 -0800 Subject: [PATCH 0346/1562] xfs: automatic freeing of freshly allocated unwritten space As mentioned in the previous commit, online repair wants to allocate space to write out a new metadata structure, and it also wants to hedge against system crashes during repairs by logging (and later cancelling) EFIs to free the space if we crash before committing the new data structure. Therefore, create a trio of functions to schedule automatic reaping of freshly allocated unwritten space. xfs_alloc_schedule_autoreap creates a paused EFI representing the space we just allocated. Once the allocations are made and the autoreaps scheduled, we can start writing to disk. If the writes succeed, xfs_alloc_cancel_autoreap marks the EFI work items as stale and unpauses the pending deferred work item. Assuming that's done in the same transaction that commits the new structure into the filesystem, we guarantee that either the new object is fully visible, or that all the space gets reclaimed. If the writes succeed but only part of an extent was used, repair must call the same _cancel_autoreap function to kill the first EFI and then log a new EFI to free the unused space. The first EFI is already committed, so it cannot be changed. For full extents that aren't used, xfs_alloc_commit_autoreap will unpause the EFI, which results in the space being freed during the next _defer_finish cycle. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 104 ++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_alloc.h | 12 +++++ fs/xfs/xfs_extfree_item.c | 9 ++-- 3 files changed, 117 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c35224ad9428..4940f9377f21 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2522,14 +2522,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -int -xfs_free_extent_later( +static int +xfs_defer_extent_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2577,10 +2578,105 @@ xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); return 0; } +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. + */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); + return 0; +} + +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. + * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6b95d1d8a853..851cafbd6449 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -255,6 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ + +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; + +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3ca23ab8d92a..3e3469504271 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -453,7 +453,7 @@ xfs_extent_free_finish_item( struct xfs_extent *extp; uint next_extent; xfs_agblock_t agbno; - int error; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); @@ -473,9 +473,10 @@ xfs_extent_free_finish_item( * the existing EFI, and so we need to copy all the unprocessed extents * in this EFI to the EFD so this works correctly. */ - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; From 4c8ecd1cfdd01fb727121035014d9f654a30bdf2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:58 -0800 Subject: [PATCH 0347/1562] xfs: remove unused fields from struct xbtree_ifakeroot Remove these unused fields since nobody uses them. They should have been removed years ago in a different cleanup series from Christoph Hellwig. Fixes: daf83964a3681 ("xfs: move the per-fork nextents fields into struct xfs_ifork") Fixes: f7e67b20ecbbc ("xfs: move the fork format fields into struct xfs_ifork") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree_staging.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..5f638f711246 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -37,12 +37,6 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. */ From be408417630427984a1fddd069f30b245793234c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:59 -0800 Subject: [PATCH 0348/1562] xfs: implement block reservation accounting for btrees we're staging Create a new xrep_newbt structure to encapsulate a fake root for creating a staged btree cursor as well as to track all the blocks that we need to reserve in order to build that btree. As for the particular choice of lowspace thresholds and btree block slack factors -- at this point one could say that the thresholds in online repair come from bulkload_estimate_ag_slack in xfs_repair[1]. But that's not the entire story, since the offline btree rebuilding code in xfs_repair was merged as a retroport of the online btree code in this patchset! Before xfs_btree_staging.[ch] came along, xfs_repair determined the slack factor (aka the number of slots to leave unfilled in each new btree block) via open-coded logic in repair/phase5.c[2]. At that point the slack factors were arbitrary quantities per btree. The rmapbt automatically left 10 slots free; everything else left zero. That had a noticeable effect on performance straight after mounting because adding records to /any/ btree would result in splits. A few years ago when this patch was first written, Dave and I decided that repair should generate btree blocks that were 75% full unless space was tight, in which case it should try to fill the blocks to nearly full. We defined tight as ~10% free to avoid repair failures but settled on 3/32 (~9%) to avoid div64. IOWs, we mostly pulled the thresholds out of thin air. We've been QAing with those geometry numbers ever since. ;) Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/bulkload.c?h=v6.5.0#n114 Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/phase5.c?h=v4.19.0#n1349 Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/scrub/newbt.c | 495 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/newbt.h | 62 ++++++ fs/xfs/scrub/trace.h | 37 ++++ 4 files changed, 595 insertions(+) create mode 100644 fs/xfs/scrub/newbt.c create mode 100644 fs/xfs/scrub/newbt.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..1537d66e5ab0 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -181,6 +181,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + newbt.o \ reap.o \ repair.o \ ) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..5d1d75d2b1ad --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* Let the btree code compute the default slack values. */ + bload->leaf_slack = -1; + bload->node_slack = -1; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. */ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. */ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, free the entire space extent. + */ + goto free; + } + + /* + * We used space and committed the btree. Remove the written blocks + * from the reservation and possibly log a new EFI to free any unused + * reservation space. + */ + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + +free: + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. + */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + return 0; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..ca53271f3a4c --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. */ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4a8bc6f3c8f2..aa7683075319 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1332,6 +1332,43 @@ TRACE_EVENT(xrep_ialloc_insert, __entry->freemask) ) +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From 6bb9ea8ecd2c58a66324cb799838e7d49d78a877 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:41:00 -0800 Subject: [PATCH 0349/1562] xfs: log EFIs for all btree blocks being used to stage a btree We need to log EFIs for every extent that we allocate for the purpose of staging a new btree so that if we fail then the blocks will be freed during log recovery. Use the autoreaping mechanism provided by the previous patch to attach paused freeing work to the scrub transaction. We can then mark the EFIs stale if we decide to commit the new btree, or we can unpause the EFIs if we decide to abort the repair. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 34 ++++++++++++++++++++++++++-------- fs/xfs/scrub/newbt.h | 3 +++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 5d1d75d2b1ad..992cf34a13e7 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -139,6 +139,7 @@ xrep_newbt_add_blocks( { struct xfs_mount *mp = xnr->sc->mp; struct xrep_newbt_resv *resv; + int error; resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); if (!resv) @@ -150,8 +151,18 @@ xrep_newbt_add_blocks( resv->used = 0; resv->pag = xfs_perag_hold(pag); + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + list_add_tail(&resv->list, &xnr->resv_list); return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; } /* Don't let our allocation hint take us beyond this AG */ @@ -330,16 +341,21 @@ xrep_newbt_free_extent( if (!btree_committed || resv->used == 0) { /* * If we're not committing a new btree or we didn't use the - * space reservation, free the entire space extent. + * space reservation, let the existing EFI free the entire + * space extent. */ - goto free; + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; } /* - * We used space and committed the btree. Remove the written blocks - * from the reservation and possibly log a new EFI to free any unused - * reservation space. + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); free_agbno += resv->used; free_aglen -= resv->used; @@ -351,7 +367,6 @@ xrep_newbt_free_extent( ASSERT(xnr->resv != XFS_AG_RESV_AGFL); -free: /* * Use EFIs to free the reservations. This reduces the chance * that we leak blocks if the system goes down. @@ -411,9 +426,10 @@ junkit: /* * If we still have reservations attached to @newbt, cleanup must have * failed and the filesystem is about to go down. Clean up the incore - * reservations. + * reservations and try to commit to freeing the space we used. */ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); list_del(&resv->list); xfs_perag_put(resv->pag); kfree(resv); @@ -491,5 +507,7 @@ xrep_newbt_claim_block( agbno)); else ptr->s = cpu_to_be32(agbno); - return 0; + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); } diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index ca53271f3a4c..d2baffa17b1a 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -12,6 +12,9 @@ struct xrep_newbt_resv { struct xfs_perag *pag; + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + /* AG block of the extent we reserved. */ xfs_agblock_t agbno; From 3f3cec031099c37513727efc978a12b6346e326d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:41:00 -0800 Subject: [PATCH 0350/1562] xfs: force small EFIs for reaping btree extents Introduce the concept of a defer ops barrier to separate consecutively queued pending work items of the same type. With a barrier in place, the two work items will be tracked separately, and receive separate log intent items. The goal here is to prevent reaping of old metadata blocks from creating unnecessarily huge EFIs that could then run the risk of overflowing the scrub transaction. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 107 ++++++++++++++++++++++++++++++++++---- fs/xfs/libxfs/xfs_defer.h | 3 ++ fs/xfs/scrub/reap.c | 5 ++ 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c4480dec29ec..ecc2f7ec6991 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -182,6 +182,58 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} + +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} + +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, +}; static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, @@ -190,6 +242,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, + [XFS_DEFER_OPS_TYPE_BARRIER] = &xfs_barrier_defer_type, }; /* Create a log intent done item for a log intent item. */ @@ -779,6 +832,23 @@ xfs_defer_can_append( return true; } +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct xfs_trans *tp, + enum xfs_defer_ops_type type) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_type = type; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &tp->t_dfops); + + return dfp; +} + /* Add an item for later deferred processing. */ struct xfs_defer_pending * xfs_defer_add( @@ -793,23 +863,38 @@ xfs_defer_add( BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); dfp = xfs_defer_find_last(tp, type, ops); - if (!dfp || !xfs_defer_can_append(dfp, ops)) { - /* Create a new pending item at the end of the intake list. */ - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; - dfp->dfp_intent = NULL; - dfp->dfp_done = NULL; - dfp->dfp_count = 0; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); - } + if (!dfp || !xfs_defer_can_append(dfp, ops)) + dfp = xfs_defer_alloc(tp, type); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); return dfp; } +/* + * Add a defer ops barrier to force two otherwise adjacent deferred work items + * to be tracked separately and have separate log items. + */ +void +xfs_defer_add_barrier( + struct xfs_trans *tp) +{ + struct xfs_defer_pending *dfp; + const enum xfs_defer_ops_type type = XFS_DEFER_OPS_TYPE_BARRIER; + const struct xfs_defer_op_type *ops = defer_op_types[type]; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* If the last defer op added was a barrier, we're done. */ + dfp = xfs_defer_find_last(tp, type, ops); + if (dfp) + return; + + xfs_defer_alloc(tp, type); + + trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); +} + /* * Create a pending deferred work item to replay the recovered intent item * and add it to the list. diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index b0284154f4e0..5b1990ef3e5d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -20,6 +20,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, XFS_DEFER_OPS_TYPE_ATTR, + XFS_DEFER_OPS_TYPE_BARRIER, XFS_DEFER_OPS_TYPE_MAX, }; @@ -163,4 +164,6 @@ xfs_defer_add_item( int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); +void xfs_defer_add_barrier(struct xfs_trans *tp); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 78c9f2085db4..ee26fcb500b7 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -31,6 +31,7 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -409,6 +410,8 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); @@ -416,6 +419,8 @@ xreap_agextent_iter( return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } From fa422b353d212373fb2b2857a5ea5a6fa4876f9c Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Mon, 23 Oct 2023 15:20:46 +0800 Subject: [PATCH 0351/1562] mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind Now, if we suddenly remove a PMEM device(by calling unbind) which contains FSDAX while programs are still accessing data in this device, e.g.: ``` $FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 & # $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 & echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind ``` it could come into an unacceptable state: 1. device has gone but mount point still exists, and umount will fail with "target is busy" 2. programs will hang and cannot be killed 3. may crash with NULL pointer dereference To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let it know that we are going to remove the whole device, and make sure all related processes could be notified so that they could end up gracefully. This patch is inspired by Dan's "mm, dax, pmem: Introduce dev_pagemap_failure()"[1]. With the help of dax_holder and ->notify_failure() mechanism, the pmem driver is able to ask filesystem on it to unmap all files in use, and notify processes who are using those files. Call trace: trigger unbind -> unbind_store() -> ... (skip) -> devres_release_all() -> kill_dax() -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE) -> xfs_dax_notify_failure() `-> freeze_super() // freeze (kernel call) `-> do xfs rmap ` -> mf_dax_kill_procs() ` -> collect_procs_fsdax() // all associated processes ` -> unmap_and_kill() ` -> invalidate_inode_pages2_range() // drop file's cache `-> thaw_super() // thaw (both kernel & user call) Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove event. Use the exclusive freeze/thaw[2] to lock the filesystem to prevent new dax mapping from being created. Do not shutdown filesystem directly if configuration is not supported, or if failure range includes metadata area. Make sure all files and processes(not only the current progress) are handled correctly. Also drop the cache of associated files before pmem is removed. [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/ [2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/ Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Dan Williams Signed-off-by: Chandan Babu R --- drivers/dax/super.c | 3 +- fs/xfs/xfs_notify_failure.c | 108 ++++++++++++++++++++++++++++++++++-- include/linux/mm.h | 1 + mm/memory-failure.c | 21 +++++-- 4 files changed, 122 insertions(+), 11 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0da9232ea175..f4b635526345 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -326,7 +326,8 @@ void kill_dax(struct dax_device *dax_dev) return; if (dax_dev->holder_data != NULL) - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + dax_holder_notify_failure(dax_dev, 0, U64_MAX, + MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include #include +#include struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,14 +99,60 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. */ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + invalidate_inode_pages2_range(mapping, pgoff, + pgoff + pgcnt - 1); + xfs_irele(ip); return error; } +static int +xfs_dax_notify_failure_freeze( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "already frozen by kernel, err=%d", error); + + return error; +} + +static void +xfs_dax_notify_failure_thaw( + struct xfs_mount *mp, + bool kernel_frozen) +{ + struct super_block *sb = mp->m_super; + int error; + + if (kernel_frozen) { + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "still frozen after notify failure, err=%d", + error); + } + + /* + * Also thaw userspace call anyway because the device is about to be + * removed immediately. + */ + thaw_super(sb, FREEZE_HOLDER_USERSPACE); +} + static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, @@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure( struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; int error = 0; + bool kernel_frozen = false; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + if (mf_flags & MF_MEM_PRE_REMOVE) { + xfs_info(mp, "Device is about to be removed!"); + /* + * Freeze fs to prevent new mappings from being created. + * - Keep going on if others already hold the kernel forzen. + * - Keep going on if other errors too because this device is + * starting to fail. + * - If kernel frozen state is hold successfully here, thaw it + * here as well at the end. + */ + kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; + } + error = xfs_trans_alloc_empty(mp, &tp); if (error) - return error; + goto out; for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; @@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); - if (error || notify.want_shutdown) { + + /* + * Shutdown fs from a force umount in pre-remove case which won't fail, + * so errors can be ignored. Otherwise, shutdown the filesystem with + * CORRUPT flag if error occured or notify.want_shutdown was set during + * RMAP querying. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + else if (error || notify.want_shutdown) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); if (!error) error = -EFSCORRUPTED; } + +out: + /* Thaw the fs if it has been frozen before. */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_dax_notify_failure_thaw(mp, kernel_frozen); + return error; } @@ -197,6 +279,14 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { + /* + * In the pre-remove case the failure notification is attempting + * to trigger a force unmount. The expectation is that the + * device is still present, but its removal is in progress and + * can not be cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; xfs_err(mp, "ondisk log corrupt, shutting down fs!"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); return -EFSCORRUPTED; @@ -210,6 +300,12 @@ xfs_dax_notify_failure( ddev_start = mp->m_ddev_targp->bt_dax_part_off; ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + /* Notify failure on the whole device. */ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..caf13e94260e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3904,6 +3904,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 660c21859118..cff3bda60691 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -679,7 +679,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, */ static void collect_procs_fsdax(struct page *page, struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill) + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -687,8 +687,15 @@ static void collect_procs_fsdax(struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); for_each_process(tsk) { - struct task_struct *t = task_early_kill(tsk, true); + struct task_struct *t = tsk; + /* + * Search for all tasks while MF_MEM_PRE_REMOVE is set, because + * the current may not be the one accessing the fsdax page. + * Otherwise, search for the current task. + */ + if (!pre_remove) + t = task_early_kill(tsk, true); if (!t) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1795,6 +1802,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, dax_entry_t cookie; struct page *page; size_t end = index + count; + bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE; mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; @@ -1806,9 +1814,14 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, if (!page) goto unlock; - SetPageHWPoison(page); + if (!pre_remove) + SetPageHWPoison(page); - collect_procs_fsdax(page, mapping, index, &to_kill); + /* + * The pre_remove case is revoking access, the memory is still + * good and could theoretically be put back into service. + */ + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: From 011f129fee4bd064a3db30ca1a0139548a619482 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 29 Nov 2023 19:39:47 +0700 Subject: [PATCH 0352/1562] Documentation: xfs: consolidate XFS docs into its own subdirectory XFS docs are currently in upper-level Documentation/filesystems. Although these are currently 4 docs, they are already outstanding as a group and can be moved to its own subdirectory. Consolidate them into Documentation/filesystems/xfs/. Signed-off-by: Bagas Sanjaya Reviewed-by: Bill O'Donnell Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- Documentation/filesystems/index.rst | 5 +---- Documentation/filesystems/xfs/index.rst | 14 ++++++++++++++ .../{ => xfs}/xfs-delayed-logging-design.rst | 0 .../{ => xfs}/xfs-maintainer-entry-profile.rst | 0 .../{ => xfs}/xfs-online-fsck-design.rst | 2 +- .../{ => xfs}/xfs-self-describing-metadata.rst | 0 .../maintainer/maintainer-entry-profile.rst | 2 +- MAINTAINERS | 4 ++-- 8 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 Documentation/filesystems/xfs/index.rst rename Documentation/filesystems/{ => xfs}/xfs-delayed-logging-design.rst (100%) rename Documentation/filesystems/{ => xfs}/xfs-maintainer-entry-profile.rst (100%) rename Documentation/filesystems/{ => xfs}/xfs-online-fsck-design.rst (99%) rename Documentation/filesystems/{ => xfs}/xfs-self-describing-metadata.rst (100%) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 09cade7eaefc..e18bc5ae3b35 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -121,8 +121,5 @@ Documentation for filesystem implementations. udf virtiofs vfat - xfs-delayed-logging-design - xfs-maintainer-entry-profile - xfs-self-describing-metadata - xfs-online-fsck-design + xfs/index zonefs diff --git a/Documentation/filesystems/xfs/index.rst b/Documentation/filesystems/xfs/index.rst new file mode 100644 index 000000000000..ab66c57a5d18 --- /dev/null +++ b/Documentation/filesystems/xfs/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +XFS Filesystem Documentation +============================ + +.. toctree:: + :maxdepth: 2 + :numbered: + + xfs-delayed-logging-design + xfs-maintainer-entry-profile + xfs-self-describing-metadata + xfs-online-fsck-design diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst similarity index 100% rename from Documentation/filesystems/xfs-delayed-logging-design.rst rename to Documentation/filesystems/xfs/xfs-delayed-logging-design.rst diff --git a/Documentation/filesystems/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst similarity index 100% rename from Documentation/filesystems/xfs-maintainer-entry-profile.rst rename to Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst diff --git a/Documentation/filesystems/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst similarity index 99% rename from Documentation/filesystems/xfs-online-fsck-design.rst rename to Documentation/filesystems/xfs/xfs-online-fsck-design.rst index a0678101a7d0..352516feef6f 100644 --- a/Documentation/filesystems/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -962,7 +962,7 @@ disk, but these buffer verifiers cannot provide any consistency checking between metadata structures. For more information, please see the documentation for -Documentation/filesystems/xfs-self-describing-metadata.rst +Documentation/filesystems/xfs/xfs-self-describing-metadata.rst Reverse Mapping --------------- diff --git a/Documentation/filesystems/xfs-self-describing-metadata.rst b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst similarity index 100% rename from Documentation/filesystems/xfs-self-describing-metadata.rst rename to Documentation/filesystems/xfs/xfs-self-describing-metadata.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 7ad4bfc2cc03..18cee1edaecb 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -105,4 +105,4 @@ to do something different in the near future. ../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance ../nvme/feature-and-quirk-policy - ../filesystems/xfs-maintainer-entry-profile + ../filesystems/xfs/xfs-maintainer-entry-profile diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b73..76896b511c6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23894,10 +23894,10 @@ S: Supported W: http://xfs.org/ C: irc://irc.oftc.net/xfs T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git -P: Documentation/filesystems/xfs-maintainer-entry-profile.rst +P: Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-* +F: Documentation/filesystems/xfs/* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h From 64f08b152a3bc171f4651271bfe973ad3d085ab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:54 +0100 Subject: [PATCH 0353/1562] xfs: clean up the XFS_IOC_{GS}ET_RESBLKS handler The XFS_IOC_GET_RESBLKS and XFS_IOC_SET_RESBLKS already share a fair amount of code, and will share even more soon. Move the logic for both of them out of the main xfs_file_ioctl function into a xfs_ioctl_getset_resblocks helper to share the code and prepare for additional changes. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 87 +++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6c3919687ea6..7edc1d892f0c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1872,6 +1872,46 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + in = fsop.resblks; + error = xfs_reserve_blocks(mp, &in, &fsop); + mnt_drop_write_file(filp); + if (error) + return error; + } else { + error = xfs_reserve_blocks(mp, NULL, &fsop); + if (error) + return error; + } + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2018,50 +2058,9 @@ xfs_file_ioctl( return 0; } - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; From c2c2620de7577db66a859b934715e98e4501e4f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:55 +0100 Subject: [PATCH 0354/1562] xfs: clean up the XFS_IOC_FSCOUNTS handler Split XFS_IOC_FSCOUNTS out of the main xfs_file_ioctl function, and merge the xfs_fs_counts helper into the ioctl handler. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 16 ---------------- fs/xfs/xfs_fsops.h | 1 - fs/xfs/xfs_ioctl.c | 29 ++++++++++++++++++++--------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..01681783e2c3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -343,22 +343,6 @@ xfs_growfs_log( return error; } -/* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - /* * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS * diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..45f0cb6e8059 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,6 @@ extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7edc1d892f0c..8244210f6786 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1912,6 +1912,24 @@ xfs_ioctl_getset_resblocks( return 0; } +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2048,15 +2066,8 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); case XFS_IOC_SET_RESBLKS: case XFS_IOC_GET_RESBLKS: From 646ddf0c4df5181a7057ecccd29e535baaf034b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:56 +0100 Subject: [PATCH 0355/1562] xfs: clean up the xfs_reserve_blocks interface xfs_reserve_blocks has a very odd interface that can only be explained by it directly deriving from the IRIX fcntl handler back in the day. Split reporting out the reserved blocks out of xfs_reserve_blocks into the only caller that cares. This means that the value reported from XFS_IOC_SET_RESBLKS isn't atomically sampled in the same critical section as when it was set anymore, but as the values could change right after setting them anyway that does not matter. It does provide atomic sampling of both values for XFS_IOC_GET_RESBLKS now, though. Also pass a normal scalar integer value for the requested value instead of the pointless pointer. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 34 +++------------------------------- fs/xfs/xfs_fsops.h | 3 +-- fs/xfs/xfs_ioctl.c | 13 ++++++------- fs/xfs/xfs_mount.c | 8 ++------ fs/xfs/xfs_super.c | 6 ++---- 5 files changed, 14 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 01681783e2c3..4f5da19142f2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -344,43 +344,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -450,11 +427,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 45f0cb6e8059..7536f8a92746 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,8 +8,7 @@ extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8244210f6786..f02b6e558af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1881,7 +1881,6 @@ xfs_ioctl_getset_resblocks( struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; struct xfs_fsop_resblks fsop = { }; int error; - uint64_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1896,17 +1895,17 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - in = fsop.resblks; - error = xfs_reserve_blocks(mp, &in, &fsop); + error = xfs_reserve_blocks(mp, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; - } else { - error = xfs_reserve_blocks(mp, NULL, &fsop); - if (error) - return error; } + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + if (copy_to_user(arg, &fsop, sizeof(fsop))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..aabb25dc3efa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -637,7 +637,6 @@ xfs_mountfs( struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); - uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -974,8 +973,7 @@ xfs_mountfs( * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - resblks = xfs_default_resblks(mp); - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); @@ -1053,7 +1051,6 @@ void xfs_unmountfs( struct xfs_mount *mp) { - uint64_t resblks; int error; /* @@ -1090,8 +1087,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - resblks = 0; - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 764304595e8b..d0009430a627 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -906,10 +906,8 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - uint64_t resblks = 0; - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, 0); } STATIC void @@ -923,7 +921,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) } else resblks = xfs_default_resblks(mp); - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, resblks); } /* From 08e54ca42d6a0d88709a1be38eb95843142b5101 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:57 +0100 Subject: [PATCH 0356/1562] xfs: clean up xfs_fsops.h Use struct types instead of typedefs so that the header can be included with pulling in the headers that define the typedefs, and remove the pointless externs. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 7536f8a92746..44457b0a0593 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,12 +6,12 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); -extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); +int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); -extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); +int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); -extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ From e6af9c98cbf0164a619d95572136bfb54d482dd6 Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Tue, 5 Dec 2023 13:58:58 +0800 Subject: [PATCH 0357/1562] xfs: ensure logflagsp is initialized in xfs_bmap_del_extent_real In the case of returning -ENOSPC, ensure logflagsp is initialized by 0. Otherwise the caller __xfs_bunmapi will set uninitialized illegal tmp_logflags value into xfs log, which might cause unpredictable error in the log recovery procedure. Also, remove the flags variable and set the *logflagsp directly, so that the code should be more robust in the long run. Fixes: 1b24b633aafe ("xfs: move some more code into xfs_bmap_del_extent_real") Signed-off-by: Jiachen Zhang Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 73 +++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 68be1dd4f0f2..ca6614f4eac5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5010,7 +5010,6 @@ xfs_bmap_del_extent_real( xfs_fileoff_t del_endoff; /* first offset past del */ int do_fx; /* free extent at end of routine */ int error; /* error return value */ - int flags = 0;/* inode logging flags */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5023,6 +5022,8 @@ xfs_bmap_del_extent_real( uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; + *logflagsp = 0; + mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); @@ -5035,7 +5036,6 @@ xfs_bmap_del_extent_real( ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; - error = 0; /* * If it's the case where the directory code is running with no block @@ -5051,13 +5051,13 @@ xfs_bmap_del_extent_real( del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; - flags = XFS_ILOG_CORE; + *logflagsp = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { if (!(bflags & XFS_BMAPI_REMAP)) { error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); if (error) - goto done; + return error; } do_fx = 0; @@ -5072,11 +5072,9 @@ xfs_bmap_del_extent_real( if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (got.br_startoff == del->br_startoff) @@ -5093,17 +5091,15 @@ xfs_bmap_del_extent_real( xfs_iext_prev(ifp, icur); ifp->if_nextents--; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; break; case BMAP_LEFT_FILLING: /* @@ -5114,12 +5110,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case BMAP_RIGHT_FILLING: /* @@ -5128,12 +5124,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case 0: /* @@ -5150,18 +5146,18 @@ xfs_bmap_del_extent_real( new.br_state = got.br_state; new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; error = xfs_btree_increment(cur, 0, &i); if (error) - goto done; + return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) - goto done; + return error; /* * If get no-space back from btree insert, it tried a * split, and we have a zero block reservation. Fix up @@ -5174,33 +5170,28 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Update the btree record back * to the original value. */ error = xfs_bmbt_update(cur, &old); if (error) - goto done; + return error; /* * Reset the extent record back * to the original value. */ xfs_iext_update_extent(ip, state, icur, &old); - flags = 0; - error = -ENOSPC; - goto done; - } - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + *logflagsp = 0; + return -ENOSPC; } + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); @@ -5224,7 +5215,7 @@ xfs_bmap_del_extent_real( ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); if (error) - goto done; + return error; } } @@ -5239,9 +5230,7 @@ xfs_bmap_del_extent_real( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); -done: - *logflagsp = flags; - return error; + return 0; } /* From 5759aa4f956034b289b0ae2c99daddfc775442e1 Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Tue, 5 Dec 2023 13:58:59 +0800 Subject: [PATCH 0358/1562] xfs: update dir3 leaf block metadata after swap xfs_da3_swap_lastblock() copy the last block content to the dead block, but do not update the metadata in it. We need update some metadata for some kinds of type block, such as dir3 leafn block records its blkno, we shall update it to the dead block blkno. Otherwise, before write the xfs_buf to disk, the verify_write() will fail in blk_hdr->blkno != xfs_buf->b_bn, then xfs will be shutdown. We will get this warning: XFS (dm-0): Metadata corruption detected at xfs_dir3_leaf_verify+0xa8/0xe0 [xfs], xfs_dir3_leafn block 0x178 XFS (dm-0): Unmount and run xfs_repair XFS (dm-0): First 128 bytes of corrupted metadata buffer: 00000000e80f1917: 00 80 00 0b 00 80 00 07 3d ff 00 00 00 00 00 00 ........=....... 000000009604c005: 00 00 00 00 00 00 01 a0 00 00 00 00 00 00 00 00 ................ 000000006b6fb2bf: e4 44 e3 97 b5 64 44 41 8b 84 60 0e 50 43 d9 bf .D...dDA..`.PC.. 00000000678978a2: 00 00 00 00 00 00 00 83 01 73 00 93 00 00 00 00 .........s...... 00000000b28b247c: 99 29 1d 38 00 00 00 00 99 29 1d 40 00 00 00 00 .).8.....).@.... 000000002b2a662c: 99 29 1d 48 00 00 00 00 99 49 11 00 00 00 00 00 .).H.....I...... 00000000ea2ffbb8: 99 49 11 08 00 00 45 25 99 49 11 10 00 00 48 fe .I....E%.I....H. 0000000069e86440: 99 49 11 18 00 00 4c 6b 99 49 11 20 00 00 4d 97 .I....Lk.I. ..M. XFS (dm-0): xfs_do_force_shutdown(0x8) called from line 1423 of file fs/xfs/xfs_buf.c. Return address = 00000000c0ff63c1 XFS (dm-0): Corruption of in-memory data detected. Shutting down filesystem XFS (dm-0): Please umount the filesystem and rectify the problem(s) >From the log above, we know xfs_buf->b_no is 0x178, but the block's hdr record its blkno is 0x1a0. Fixes: 24df33b45ecf ("xfs: add CRC checking to dir2 leaf blocks") Signed-off-by: Zhang Tianci Suggested-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_da_btree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..282c7cf032f4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2316,10 +2316,17 @@ xfs_da3_swap_lastblock( return error; /* * Copy the last block into the dead buffer and log it. + * On CRC-enabled file systems, also update the stamped in blkno. */ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *da3 = dead_buf->b_addr; + + da3->blkno = cpu_to_be64(xfs_buf_daddr(dead_buf)); + } xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ From fd45ddb9dd606b3eaddf26e13f64340636955986 Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Tue, 5 Dec 2023 13:59:00 +0800 Subject: [PATCH 0359/1562] xfs: extract xfs_da_buf_copy() helper function This patch does not modify logic. xfs_da_buf_copy() will copy one block from src xfs_buf to dst xfs_buf, and update the block metadata in dst directly. Signed-off-by: Zhang Tianci Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 12 ++---- fs/xfs/libxfs/xfs_da_btree.c | 74 ++++++++++++++--------------------- fs/xfs/libxfs/xfs_da_btree.h | 2 + 3 files changed, 36 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..654e17e6610d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1244,14 +1244,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. + */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 282c7cf032f4..5457188bb4de 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -421,6 +421,25 @@ xfs_da3_node_read_mapped( return xfs_da3_node_set_type(tp, *bpp); } +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. + */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -690,12 +709,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +720,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. */ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1219,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -2316,14 +2308,8 @@ xfs_da3_swap_lastblock( return error; /* * Copy the last block into the dead buffer and log it. - * On CRC-enabled file systems, also update the stamped in blkno. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *da3 = dead_buf->b_addr; - - da3->blkno = cpu_to_be64(xfs_buf_daddr(dead_buf)); - } + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, From c12c50393c1f6f7d7e45c7f55da9c013c0cc0522 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 21:07:18 +0100 Subject: [PATCH 0360/1562] xfs: use static_assert to check struct sizes and offsets Use the compiler-provided static_assert built-in from C11 instead of the kernel-specific BUILD_BUG_ON_MSG for the structure size and offset checks in xfs_ondisk. This not only gives slightly nicer error messages in case things go south, but can also be trivially used as-is in userspace. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ondisk.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 21a7e350b4c5..d9c988c5ad69 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init From 18793e050504288345eb455a471677b57117bcc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 21:07:19 +0100 Subject: [PATCH 0361/1562] xfs: move xfs_ondisk.h to libxfs/ Move xfs_ondisk.h to libxfs so that we can do the struct sanity checks in userspace libxfs as well. This should allow us to retire the somewhat fragile xfs/122 test on xfstests. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino Signed-off-by: Chandan Babu R --- fs/xfs/{ => libxfs}/xfs_ondisk.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fs/xfs/{ => libxfs}/xfs_ondisk.h (100%) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h similarity index 100% rename from fs/xfs/xfs_ondisk.h rename to fs/xfs/libxfs/xfs_ondisk.h From 6f3dd2c31d7d703a814c59f60daf95c57fa6a4c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 7 Aug 2023 20:50:44 +0200 Subject: [PATCH 0362/1562] mm/slub: fix bulk alloc and free stats The SLUB sysfs stats enabled CONFIG_SLUB_STATS have two deficiencies identified wrt bulk alloc/free operations: - Bulk allocations from cpu freelist are not counted. Add the ALLOC_FASTPATH counter there. - Bulk fastpath freeing will count a list of multiple objects with a single FREE_FASTPATH inc. Add a stat_add() variant to count them all. Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3f8b95757106..d7b0ca6012e0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -396,6 +396,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +static inline +void stat_add(const struct kmem_cache *s, enum stat_item si, int v) +{ +#ifdef CONFIG_SLUB_STATS + raw_cpu_add(s->cpu_slab->stat[si], v); +#endif +} + /* * The slab lists for all objects. */ @@ -4268,7 +4276,7 @@ redo: local_unlock(&s->cpu_slab->lock); } - stat(s, FREE_FASTPATH); + stat_add(s, FREE_FASTPATH, cnt); } #else /* CONFIG_SLUB_TINY */ static void do_slab_free(struct kmem_cache *s, @@ -4545,6 +4553,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, c->freelist = get_freepointer(s, object); p[i] = object; maybe_wipe_obj_freeptr(s, p[i]); + stat(s, ALLOC_FASTPATH); } c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); From 520a688a2edfddba97968bf9e133b9a3d7c78059 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 2 Nov 2023 16:34:39 +0100 Subject: [PATCH 0363/1562] mm/slub: introduce __kmem_cache_free_bulk() without free hooks Currently, when __kmem_cache_alloc_bulk() fails, it frees back the objects that were allocated before the failure, using kmem_cache_free_bulk(). Because kmem_cache_free_bulk() calls the free hooks (KASAN etc.) and those expect objects that were processed by the post alloc hooks, slab_post_alloc_hook() is called before kmem_cache_free_bulk(). This is wasteful, although not a big concern in practice for the rare error path. But in order to efficiently handle percpu array batch refill and free in the near future, we will also need a variant of kmem_cache_free_bulk() that avoids the free hooks. So introduce it now and use it for the failure path. In case of failure we however still need to perform memcg uncharge so handle that in a new memcg_slab_alloc_error_hook(). Thanks to Chengming Zhou for noticing the missing uncharge. As a consequence, __kmem_cache_alloc_bulk() no longer needs the objcg parameter, remove it. Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d7b0ca6012e0..0a9e4bd0dd68 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2003,6 +2003,14 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, objcgs); } + +static inline +void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, + struct obj_cgroup *objcg) +{ + if (objcg) + obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); +} #else /* CONFIG_MEMCG_KMEM */ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { @@ -2032,6 +2040,12 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } + +static inline +void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, + struct obj_cgroup *objcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -4478,6 +4492,27 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, return same; } +/* + * Internal bulk free of objects that were not initialised by the post alloc + * hooks and thus should not be processed by the free hooks + */ +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + _RET_IP_); + } while (likely(size)); +} + /* Note that interrupts must be enabled when calling this function. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { @@ -4498,8 +4533,9 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); #ifndef CONFIG_SLUB_TINY -static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, struct obj_cgroup *objcg) +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; unsigned long irqflags; @@ -4563,14 +4599,13 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); + __kmem_cache_free_bulk(s, i, p); return 0; } #else /* CONFIG_SLUB_TINY */ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, struct obj_cgroup *objcg) + size_t size, void **p) { int i; @@ -4593,8 +4628,7 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, return i; error: - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); + __kmem_cache_free_bulk(s, i, p); return 0; } #endif /* CONFIG_SLUB_TINY */ @@ -4614,15 +4648,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + i = __kmem_cache_alloc_bulk(s, flags, size, p); /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. */ - if (i != 0) + if (likely(i != 0)) { slab_post_alloc_hook(s, objcg, flags, size, p, slab_want_init_on_alloc(flags, s), s->object_size); + } else { + memcg_slab_alloc_error_hook(s, size, objcg); + } + return i; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); From 284f17ac13fe34ae9eecbe57bb91553374d9b855 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 3 Nov 2023 20:24:51 +0100 Subject: [PATCH 0364/1562] mm/slub: handle bulk and single object freeing separately Currently we have a single function slab_free() handling both single object freeing and bulk freeing with necessary hooks, the latter case requiring slab_free_freelist_hook(). It should be however better to distinguish the two use cases for the following reasons: - code simpler to follow for the single object case - better code generation - although inlining should eliminate the slab_free_freelist_hook() for single object freeing in case no debugging options are enabled, it seems it's not perfect. When e.g. KASAN is enabled, we're imposing additional unnecessary overhead for single object freeing. - preparation to add percpu array caches in near future Therefore, simplify slab_free() for the single object case by dropping unnecessary parameters and calling only slab_free_hook() instead of slab_free_freelist_hook(). Rename the bulk variant to slab_free_bulk() and adjust callers accordingly. While at it, flip (and document) slab_free_hook() return value so that it returns true when the freeing can proceed, which matches the logic of slab_free_freelist_hook() and is not confusingly the opposite. Additionally we can simplify a bit by changing the tail parameter of do_slab_free() when freeing a single object - instead of NULL we can set it equal to head. bloat-o-meter shows small code reduction with a .config that has KASAN etc disabled: add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-118 (-118) Function old new delta kmem_cache_alloc_bulk 1203 1196 -7 kmem_cache_free 861 835 -26 __kmem_cache_free 741 704 -37 kmem_cache_free_bulk 911 863 -48 Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 59 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0a9e4bd0dd68..af8c8fc9e799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2051,9 +2051,12 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. + * + * Returns true if freeing of the object can proceed, false if its reuse + * was delayed by KASAN quarantine. */ -static __always_inline bool slab_free_hook(struct kmem_cache *s, - void *x, bool init) +static __always_inline +bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2086,7 +2089,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, s->size - s->inuse - rsize); } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -2096,7 +2099,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; - void *old_tail = *tail ? *tail : *head; + void *old_tail = *tail; if (is_kfence_address(next)) { slab_free_hook(s, next, false); @@ -2112,8 +2115,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(!slab_free_hook(s, object, - slab_want_init_on_free(s)))) { + if (likely(slab_free_hook(s, object, + slab_want_init_on_free(s)))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2128,9 +2131,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, } } while (object != old_tail); - if (*head == *tail) - *tail = NULL; - return *head != NULL; } @@ -4241,7 +4241,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { - void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -4260,14 +4259,14 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail_obj, cnt, addr); + __slab_free(s, slab, head, tail, cnt, addr); return; } if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); - set_freepointer(s, tail_obj, freelist); + set_freepointer(s, tail, freelist); if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { note_cmpxchg_failure("slab_free", s, tid); @@ -4284,7 +4283,7 @@ redo: tid = c->tid; freelist = c->freelist; - set_freepointer(s, tail_obj, freelist); + set_freepointer(s, tail, freelist); c->freelist = head; c->tid = next_tid(tid); @@ -4297,15 +4296,27 @@ static void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { - void *tail_obj = tail ? : head; - - __slab_free(s, slab, head, tail_obj, cnt, addr); + __slab_free(s, slab, head, tail, cnt, addr); } #endif /* CONFIG_SLUB_TINY */ -static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, - void *head, void *tail, void **p, int cnt, - unsigned long addr) +static __fastpath_inline +void slab_free(struct kmem_cache *s, struct slab *slab, void *object, + unsigned long addr) +{ + bool init; + + memcg_slab_free_hook(s, slab, &object, 1); + + init = !is_kfence_address(object) && slab_want_init_on_free(s); + + if (likely(slab_free_hook(s, object, init))) + do_slab_free(s, slab, object, object, 1, addr); +} + +static __fastpath_inline +void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, + void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); /* @@ -4319,7 +4330,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); + do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); } #endif @@ -4363,7 +4374,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; trace_kmem_cache_free(_RET_IP_, x, s); - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); + slab_free(s, virt_to_slab(x), x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -4409,7 +4420,7 @@ void kfree(const void *object) slab = folio_slab(folio); s = slab->slab_cache; - slab_free(s, slab, x, NULL, &x, 1, _RET_IP_); + slab_free(s, slab, x, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -4526,8 +4537,8 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; - slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, - _RET_IP_); + slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size], + df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); From 4ae08845db4c1f759b8382bc7527ab8249230e7f Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Sat, 25 Nov 2023 14:51:28 +0530 Subject: [PATCH 0365/1562] mfd: tps6594: Use spi_get_chipselect() API to access spi->chip_select In preparation for adding multiple CS support for a device, set/get functions were introduces accessing spi->chip_select in 'commit 303feb3cc06a ("spi: Add APIs in spi core to set/get spi->chip_select and spi->cs_gpiod")'. Replace spi->chip_select with spi_get_chipselect() API. Signed-off-by: Amit Kumar Mahapatra Link: https://lore.kernel.org/r/20231125092137.2948-2-amit.kumar-mahapatra@amd.com Signed-off-by: Lee Jones --- drivers/mfd/tps6594-spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/tps6594-spi.c b/drivers/mfd/tps6594-spi.c index f4b4f37f957f..24b72847e3f5 100644 --- a/drivers/mfd/tps6594-spi.c +++ b/drivers/mfd/tps6594-spi.c @@ -98,7 +98,7 @@ static int tps6594_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, tps); tps->dev = dev; - tps->reg = spi->chip_select; + tps->reg = spi_get_chipselect(spi, 0); tps->irq = spi->irq; tps->regmap = devm_regmap_init(dev, NULL, spi, &tps6594_spi_regmap_config); From f05e2f61fe88092e0d341ea27644a84e3386358d Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Sat, 25 Nov 2023 14:51:29 +0530 Subject: [PATCH 0366/1562] ALSA: hda/cs35l56: Use set/get APIs to access spi->chip_select In preparation for adding multiple CS support for a device, set/get functions were introduces accessing spi->chip_select in 'commit 303feb3cc06a ("spi: Add APIs in spi core to set/get spi->chip_select and spi->cs_gpiod")'. Replace spi->chip_select with spi_get_chipselect() API. Signed-off-by: Amit Kumar Mahapatra Link: https://lore.kernel.org/r/20231125092137.2948-3-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- sound/pci/hda/cs35l56_hda_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l56_hda_spi.c b/sound/pci/hda/cs35l56_hda_spi.c index 756aec342eab..1c5cb3b1e2c3 100644 --- a/sound/pci/hda/cs35l56_hda_spi.c +++ b/sound/pci/hda/cs35l56_hda_spi.c @@ -29,7 +29,7 @@ static int cs35l56_hda_spi_probe(struct spi_device *spi) return ret; } - ret = cs35l56_hda_common_probe(cs35l56, spi->chip_select); + ret = cs35l56_hda_common_probe(cs35l56, spi_get_chipselect(spi, 0)); if (ret) return ret; ret = cs35l56_irq_request(&cs35l56->base, spi->irq); From 4d8ff6b0991d5e86b17b235fc46ec62e9195cb9b Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Sat, 25 Nov 2023 14:51:30 +0530 Subject: [PATCH 0367/1562] spi: Add multi-cs memories support in SPI core AMD-Xilinx GQSPI controller has two advanced mode that allows the controller to consider two flashes as one single device. One of these two mode is the parallel mode in which each byte of data is stored in both devices, the even bits in the lower flash & the odd bits in the upper flash. The byte split is automatically handled by the QSPI controller. The other mode is the stacked mode in which both the flashes share the same SPI bus but each of the device contain half of the data. In this mode, the controller does not follow CS requests but instead internally wires the two CS levels with the value of the most significant address bit. For supporting both these modes SPI core need to be updated for providing multiple CS for a single SPI device. For adding multi CS support the SPI device need to be aware of all the CS values. So, the "chip_select" member in the spi_device structure is now an array that holds all the CS values. spi_device structure now has a "cs_index_mask" member. This acts as an index to the chip_select array. If nth bit of spi->cs_index_mask is set then the driver would assert spi->chip_select[n]. In parallel mode all the chip selects are asserted/de-asserted simultaneously and each byte of data is stored in both devices, the even bits in one, the odd bits in the other. The split is automatically handled by the GQSPI controller. The GQSPI controller supports a maximum of two flashes connected in parallel mode. A SPI_CONTROLLER_MULTI_CS flag bit is added in the spi controller flags, through ctlr->flags the spi core will make sure that the controller is capable of handling multiple chip selects at once. For supporting multiple CS via GPIO the cs_gpiod member of the spi_device structure is now an array that holds the gpio descriptor for each chipselect. CS GPIO is not tested on our hardware, but it has been tested by @Stefan https://lore.kernel.org/all/005001da1efc$619ad5a0$24d080e0$@opensource.cirrus.com/ Signed-off-by: Amit Kumar Mahapatra Tested-by: Stefan Binding Link: https://lore.kernel.org/r/20231125092137.2948-4-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 257 +++++++++++++++++++++++++++++++++------- include/linux/spi/spi.h | 51 ++++++-- 2 files changed, 257 insertions(+), 51 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8ead7acb99f3..45b6898cf0ee 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -612,10 +612,21 @@ static int spi_dev_check(struct device *dev, void *data) { struct spi_device *spi = to_spi_device(dev); struct spi_device *new_spi = data; + int idx, nw_idx; + u8 cs, cs_nw; - if (spi->controller == new_spi->controller && - spi_get_chipselect(spi, 0) == spi_get_chipselect(new_spi, 0)) - return -EBUSY; + if (spi->controller == new_spi->controller) { + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + cs = spi_get_chipselect(spi, idx); + for (nw_idx = 0; nw_idx < SPI_CS_CNT_MAX; nw_idx++) { + cs_nw = spi_get_chipselect(new_spi, nw_idx); + if (cs != 0xFF && cs_nw != 0xFF && cs == cs_nw) { + dev_err(dev, "chipselect %d already in use\n", cs_nw); + return -EBUSY; + } + } + } + } return 0; } @@ -629,13 +640,32 @@ static int __spi_add_device(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; struct device *dev = ctlr->dev.parent; - int status; + int status, idx, nw_idx; + u8 cs, nw_cs; - /* Chipselects are numbered 0..max; validate. */ - if (spi_get_chipselect(spi, 0) >= ctlr->num_chipselect) { - dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, 0), - ctlr->num_chipselect); - return -EINVAL; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + /* Chipselects are numbered 0..max; validate. */ + cs = spi_get_chipselect(spi, idx); + if (cs != 0xFF && cs >= ctlr->num_chipselect) { + dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, idx), + ctlr->num_chipselect); + return -EINVAL; + } + } + + /* + * Make sure that multiple logical CS doesn't map to the same physical CS. + * For example, spi->chip_select[0] != spi->chip_select[1] and so on. + */ + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + cs = spi_get_chipselect(spi, idx); + for (nw_idx = idx + 1; nw_idx < SPI_CS_CNT_MAX; nw_idx++) { + nw_cs = spi_get_chipselect(spi, nw_idx); + if (cs != 0xFF && nw_cs != 0xFF && cs == nw_cs) { + dev_err(dev, "chipselect %d already in use\n", nw_cs); + return -EBUSY; + } + } } /* Set the bus ID string */ @@ -647,11 +677,8 @@ static int __spi_add_device(struct spi_device *spi) * its configuration. */ status = bus_for_each_dev(&spi_bus_type, NULL, spi, spi_dev_check); - if (status) { - dev_err(dev, "chipselect %d already in use\n", - spi_get_chipselect(spi, 0)); + if (status) return status; - } /* Controller may unregister concurrently */ if (IS_ENABLED(CONFIG_SPI_DYNAMIC) && @@ -659,8 +686,15 @@ static int __spi_add_device(struct spi_device *spi) return -ENODEV; } - if (ctlr->cs_gpiods) - spi_set_csgpiod(spi, 0, ctlr->cs_gpiods[spi_get_chipselect(spi, 0)]); + if (ctlr->cs_gpiods) { + u8 cs; + + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + cs = spi_get_chipselect(spi, idx); + if (cs != 0xFF) + spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); + } + } /* * Drivers may modify this initial i/o setup, but will @@ -701,6 +735,9 @@ int spi_add_device(struct spi_device *spi) struct spi_controller *ctlr = spi->controller; int status; + /* Set the bus ID string */ + spi_dev_set_name(spi); + mutex_lock(&ctlr->add_lock); status = __spi_add_device(spi); mutex_unlock(&ctlr->add_lock); @@ -727,6 +764,7 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr, { struct spi_device *proxy; int status; + u8 idx; /* * NOTE: caller did any chip->bus_num checks necessary. @@ -742,6 +780,18 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr, WARN_ON(strlen(chip->modalias) >= sizeof(proxy->modalias)); + /* + * Zero(0) is a valid physical CS value and can be located at any + * logical CS in the spi->chip_select[]. If all the physical CS + * are initialized to 0 then It would be difficult to differentiate + * between a valid physical CS 0 & an unused logical CS whose physical + * CS can be 0. As a solution to this issue initialize all the CS to 0xFF. + * Now all the unused logical CS will have 0xFF physical CS value & can be + * ignore while performing physical CS validity checks. + */ + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + spi_set_chipselect(proxy, idx, 0xFF); + spi_set_chipselect(proxy, 0, chip->chip_select); proxy->max_speed_hz = chip->max_speed_hz; proxy->mode = chip->mode; @@ -750,6 +800,15 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr, proxy->dev.platform_data = (void *) chip->platform_data; proxy->controller_data = chip->controller_data; proxy->controller_state = NULL; + /* + * spi->chip_select[i] gives the corresponding physical CS for logical CS i + * logical CS number is represented by setting the ith bit in spi->cs_index_mask + * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and + * spi->chip_select[0] will give the physical CS. + * By default spi->chip_select[0] will hold the physical CS number so, set + * spi->cs_index_mask as 0x01. + */ + proxy->cs_index_mask = 0x01; if (chip->swnode) { status = device_add_software_node(&proxy->dev, chip->swnode); @@ -942,32 +1001,51 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes } /*-------------------------------------------------------------------------*/ +static inline bool spi_is_last_cs(struct spi_device *spi) +{ + u8 idx; + bool last = false; + + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if ((spi->cs_index_mask >> idx) & 0x01) { + if (spi->controller->last_cs[idx] == spi_get_chipselect(spi, idx)) + last = true; + } + } + return last; +} + static void spi_set_cs(struct spi_device *spi, bool enable, bool force) { bool activate = enable; + u8 idx; /* * Avoid calling into the driver (or doing delays) if the chip select * isn't actually changing from the last time this was called. */ - if (!force && ((enable && spi->controller->last_cs == spi_get_chipselect(spi, 0)) || - (!enable && spi->controller->last_cs != spi_get_chipselect(spi, 0))) && + if (!force && ((enable && spi->controller->last_cs_index_mask == spi->cs_index_mask && + spi_is_last_cs(spi)) || + (!enable && spi->controller->last_cs_index_mask == spi->cs_index_mask && + !spi_is_last_cs(spi))) && (spi->controller->last_cs_mode_high == (spi->mode & SPI_CS_HIGH))) return; trace_spi_set_cs(spi, activate); - spi->controller->last_cs = enable ? spi_get_chipselect(spi, 0) : -1; + spi->controller->last_cs_index_mask = spi->cs_index_mask; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : -1; spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; - if ((spi_get_csgpiod(spi, 0) || !spi->controller->set_cs_timing) && !activate) - spi_delay_exec(&spi->cs_hold, NULL); - if (spi->mode & SPI_CS_HIGH) enable = !enable; - if (spi_get_csgpiod(spi, 0)) { + if (spi_is_csgpiod(spi)) { + if (!spi->controller->set_cs_timing && !activate) + spi_delay_exec(&spi->cs_hold, NULL); + if (!(spi->mode & SPI_NO_CS)) { /* * Historically ACPI has no means of the GPIO polarity and @@ -979,26 +1057,38 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) * ambiguity. That's why we use enable, that takes SPI_CS_HIGH * into account. */ - if (has_acpi_companion(&spi->dev)) - gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), !enable); - else - /* Polarity handled by GPIO library */ - gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), activate); + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (((spi->cs_index_mask >> idx) & 0x01) && + spi_get_csgpiod(spi, idx)) { + if (has_acpi_companion(&spi->dev)) + gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx), + !enable); + else + /* Polarity handled by GPIO library */ + gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx), + activate); + + if (activate) + spi_delay_exec(&spi->cs_setup, NULL); + else + spi_delay_exec(&spi->cs_inactive, NULL); + } + } } /* Some SPI masters need both GPIO CS & slave_select */ if ((spi->controller->flags & SPI_CONTROLLER_GPIO_SS) && spi->controller->set_cs) spi->controller->set_cs(spi, !enable); + + if (!spi->controller->set_cs_timing) { + if (activate) + spi_delay_exec(&spi->cs_setup, NULL); + else + spi_delay_exec(&spi->cs_inactive, NULL); + } } else if (spi->controller->set_cs) { spi->controller->set_cs(spi, !enable); } - - if (spi_get_csgpiod(spi, 0) || !spi->controller->set_cs_timing) { - if (activate) - spi_delay_exec(&spi->cs_setup, NULL); - else - spi_delay_exec(&spi->cs_inactive, NULL); - } } #ifdef CONFIG_HAS_DMA @@ -2222,8 +2312,8 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc, static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { - u32 value; - int rc; + u32 value, cs[SPI_CS_CNT_MAX]; + int rc, idx; /* Mode (clock phase/polarity/etc.) */ if (of_property_read_bool(nc, "spi-cpha")) @@ -2295,14 +2385,53 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, return 0; } + if (ctlr->num_chipselect > SPI_CS_CNT_MAX) { + dev_err(&ctlr->dev, "No. of CS is more than max. no. of supported CS\n"); + return -EINVAL; + } + + /* + * Zero(0) is a valid physical CS value and can be located at any + * logical CS in the spi->chip_select[]. If all the physical CS + * are initialized to 0 then It would be difficult to differentiate + * between a valid physical CS 0 & an unused logical CS whose physical + * CS can be 0. As a solution to this issue initialize all the CS to 0xFF. + * Now all the unused logical CS will have 0xFF physical CS value & can be + * ignore while performing physical CS validity checks. + */ + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + spi_set_chipselect(spi, idx, 0xFF); + /* Device address */ - rc = of_property_read_u32(nc, "reg", &value); - if (rc) { + rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, + SPI_CS_CNT_MAX); + if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); return rc; } - spi_set_chipselect(spi, 0, value); + if (rc > ctlr->num_chipselect) { + dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n", + nc, rc); + return rc; + } + if ((of_property_read_bool(nc, "parallel-memories")) && + (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) { + dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); + return -EINVAL; + } + for (idx = 0; idx < rc; idx++) + spi_set_chipselect(spi, idx, cs[idx]); + + /* + * spi->chip_select[i] gives the corresponding physical CS for logical CS i + * logical CS number is represented by setting the ith bit in spi->cs_index_mask + * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and + * spi->chip_select[0] will give the physical CS. + * By default spi->chip_select[0] will hold the physical CS number so, set + * spi->cs_index_mask as 0x01. + */ + spi->cs_index_mask = 0x01; /* Device speed */ if (!of_property_read_u32(nc, "spi-max-frequency", &value)) @@ -2408,6 +2537,7 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi, struct spi_controller *ctlr = spi->controller; struct spi_device *ancillary; int rc = 0; + u8 idx; /* Alloc an spi_device */ ancillary = spi_alloc_device(ctlr); @@ -2418,12 +2548,33 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi, strscpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias)); + /* + * Zero(0) is a valid physical CS value and can be located at any + * logical CS in the spi->chip_select[]. If all the physical CS + * are initialized to 0 then It would be difficult to differentiate + * between a valid physical CS 0 & an unused logical CS whose physical + * CS can be 0. As a solution to this issue initialize all the CS to 0xFF. + * Now all the unused logical CS will have 0xFF physical CS value & can be + * ignore while performing physical CS validity checks. + */ + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + spi_set_chipselect(ancillary, idx, 0xFF); + /* Use provided chip-select for ancillary device */ spi_set_chipselect(ancillary, 0, chip_select); /* Take over SPI mode/speed from SPI main device */ ancillary->max_speed_hz = spi->max_speed_hz; ancillary->mode = spi->mode; + /* + * spi->chip_select[i] gives the corresponding physical CS for logical CS i + * logical CS number is represented by setting the ith bit in spi->cs_index_mask + * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and + * spi->chip_select[0] will give the physical CS. + * By default spi->chip_select[0] will hold the physical CS number so, set + * spi->cs_index_mask as 0x01. + */ + ancillary->cs_index_mask = 0x01; WARN_ON(!mutex_is_locked(&ctlr->add_lock)); @@ -2626,6 +2777,7 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_spi_lookup lookup = {}; struct spi_device *spi; int ret; + u8 idx; if (!ctlr && index == -1) return ERR_PTR(-EINVAL); @@ -2661,12 +2813,33 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, return ERR_PTR(-ENOMEM); } + /* + * Zero(0) is a valid physical CS value and can be located at any + * logical CS in the spi->chip_select[]. If all the physical CS + * are initialized to 0 then It would be difficult to differentiate + * between a valid physical CS 0 & an unused logical CS whose physical + * CS can be 0. As a solution to this issue initialize all the CS to 0xFF. + * Now all the unused logical CS will have 0xFF physical CS value & can be + * ignore while performing physical CS validity checks. + */ + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + spi_set_chipselect(spi, idx, 0xFF); + ACPI_COMPANION_SET(&spi->dev, adev); spi->max_speed_hz = lookup.max_speed_hz; spi->mode |= lookup.mode; spi->irq = lookup.irq; spi->bits_per_word = lookup.bits_per_word; spi_set_chipselect(spi, 0, lookup.chip_select); + /* + * spi->chip_select[i] gives the corresponding physical CS for logical CS i + * logical CS number is represented by setting the ith bit in spi->cs_index_mask + * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and + * spi->chip_select[0] will give the physical CS. + * By default spi->chip_select[0] will hold the physical CS number so, set + * spi->cs_index_mask as 0x01. + */ + spi->cs_index_mask = 0x01; return spi; } @@ -3100,6 +3273,7 @@ int spi_register_controller(struct spi_controller *ctlr) struct boardinfo *bi; int first_dynamic; int status; + int idx; if (!dev) return -ENODEV; @@ -3164,7 +3338,8 @@ int spi_register_controller(struct spi_controller *ctlr) } /* Setting last_cs to -1 means no chip selected */ - ctlr->last_cs = -1; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + ctlr->last_cs[idx] = -1; status = device_add(&ctlr->dev); if (status < 0) @@ -3889,7 +4064,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) * cs_change is set for each transfer. */ if ((spi->mode & SPI_CS_WORD) && (!(ctlr->mode_bits & SPI_CS_WORD) || - spi_get_csgpiod(spi, 0))) { + spi_is_csgpiod(spi))) { size_t maxsize = BITS_TO_BYTES(spi->bits_per_word); int ret; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5..50622054b6af 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -20,6 +20,9 @@ #include +/* Max no. of CS supported per spi device */ +#define SPI_CS_CNT_MAX 4 + struct dma_chan; struct software_node; struct ptp_system_timestamp; @@ -132,7 +135,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Chipselect, distinguishing chips handled by @controller. + * @chip_select: Array of physical chipselect, spi->chipselect[i] gives + * the corresponding physical CS for logical CS i. * @mode: The spi mode defines how data is clocked out and in. * This may be changed by the device's driver. * The "active low" default for chipselect mode can be overridden @@ -157,8 +161,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: GPIO descriptor of the chipselect line (optional, NULL when - * not using a GPIO line) + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -167,6 +171,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. * @pcpu_statistics: statistics for the spi_device + * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * * A @spi_device is used to interchange data between an SPI slave * (usually a discrete chip) and CPU memory. @@ -182,7 +187,7 @@ struct spi_device { struct spi_controller *controller; struct spi_controller *master; /* Compatibility layer */ u32 max_speed_hz; - u8 chip_select; + u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -213,7 +218,7 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod; /* Chip select GPIO descriptor */ + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ struct spi_delay word_delay; /* Inter-word delay */ /* CS delays */ struct spi_delay cs_setup; @@ -223,6 +228,13 @@ struct spi_device { /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; + /* Bit mask of the chipselect(s) that the driver need to use from + * the chipselect array.When the controller is capable to handle + * multiple chip selects & memories are connected in parallel + * then more than one bit need to be set in cs_index_mask. + */ + u32 cs_index_mask : SPI_CS_CNT_MAX; + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: @@ -279,22 +291,33 @@ static inline void *spi_get_drvdata(const struct spi_device *spi) static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx) { - return spi->chip_select; + return spi->chip_select[idx]; } static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect) { - spi->chip_select = chipselect; + spi->chip_select[idx] = chipselect; } static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx) { - return spi->cs_gpiod; + return spi->cs_gpiod[idx]; } static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod) { - spi->cs_gpiod = csgpiod; + spi->cs_gpiod[idx] = csgpiod; +} + +static inline bool spi_is_csgpiod(struct spi_device *spi) +{ + u8 idx; + + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi_get_csgpiod(spi, idx)) + return true; + } + return false; } /** @@ -399,6 +422,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use + * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert + * more than one chip select at once. * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. @@ -567,6 +592,11 @@ struct spi_controller { #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ #define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select slave */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ + /* + * The spi-controller has multi chip select capability and can + * assert/de-assert more than one chip select at once. + */ +#define SPI_CONTROLLER_MULTI_CS BIT(7) /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; @@ -677,7 +707,8 @@ struct spi_controller { bool rt; bool auto_runtime_pm; bool cur_msg_mapped; - char last_cs; + char last_cs[SPI_CS_CNT_MAX]; + char last_cs_index_mask; bool last_cs_mode_high; bool fallback; struct completion xfer_completion; From 3c1e09d533dba45af8b4681f005c9b5807f9b182 Mon Sep 17 00:00:00 2001 From: Munehisa Kamata Date: Thu, 7 Dec 2023 01:33:56 +0000 Subject: [PATCH 0368/1562] selinux: remove the wrong comment about multithreaded process handling Since commit d9250dea3f89 ("SELinux: add boundary support and thread context assignment"), SELinux has been supporting assigning per-thread security context under a constraint and the comment was updated accordingly. However, seems like commit d84f4f992cbd ("CRED: Inaugurate COW credentials") accidentally brought the old comment back that doesn't match what the code does. Considering the ease of understanding the code, this patch just removes the wrong comment. Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Munehisa Kamata Signed-off-by: Paul Moore --- security/selinux/hooks.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 855589b64641..863ff67e7849 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6459,7 +6459,6 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) if (sid == 0) goto abort_change; - /* Only allow single threaded processes to change context */ if (!current_is_single_threaded()) { error = security_bounded_transition(tsec->sid, sid); if (error) From ec4e9d630a64df500641892f4e259e8149594a99 Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 23 Nov 2023 09:25:54 +0000 Subject: [PATCH 0369/1562] calipso: fix memory leak in netlbl_calipso_add_pass() If IPv6 support is disabled at boot (ipv6.disable=1), the calipso_init() -> netlbl_calipso_ops_register() function isn't called, and the netlbl_calipso_ops_get() function always returns NULL. In this case, the netlbl_calipso_add_pass() function allocates memory for the doi_def variable but doesn't free it with the calipso_doi_free(). BUG: memory leak unreferenced object 0xffff888011d68180 (size 64): comm "syz-executor.1", pid 10746, jiffies 4295410986 (age 17.928s) hex dump (first 32 bytes): 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<...>] kmalloc include/linux/slab.h:552 [inline] [<...>] netlbl_calipso_add_pass net/netlabel/netlabel_calipso.c:76 [inline] [<...>] netlbl_calipso_add+0x22e/0x4f0 net/netlabel/netlabel_calipso.c:111 [<...>] genl_family_rcv_msg_doit+0x22f/0x330 net/netlink/genetlink.c:739 [<...>] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] [<...>] genl_rcv_msg+0x341/0x5a0 net/netlink/genetlink.c:800 [<...>] netlink_rcv_skb+0x14d/0x440 net/netlink/af_netlink.c:2515 [<...>] genl_rcv+0x29/0x40 net/netlink/genetlink.c:811 [<...>] netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] [<...>] netlink_unicast+0x54b/0x800 net/netlink/af_netlink.c:1339 [<...>] netlink_sendmsg+0x90a/0xdf0 net/netlink/af_netlink.c:1934 [<...>] sock_sendmsg_nosec net/socket.c:651 [inline] [<...>] sock_sendmsg+0x157/0x190 net/socket.c:671 [<...>] ____sys_sendmsg+0x712/0x870 net/socket.c:2342 [<...>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2396 [<...>] __sys_sendmsg+0xea/0x1b0 net/socket.c:2429 [<...>] do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 [<...>] entry_SYSCALL_64_after_hwframe+0x61/0xc6 Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller Fixes: cb72d38211ea ("netlabel: Initial support for the CALIPSO netlink protocol.") Signed-off-by: Gavrilov Ilia [PM: merged via the LSM tree at Jakub Kicinski request] Signed-off-by: Paul Moore --- net/netlabel/netlabel_calipso.c | 49 +++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index f1d5b8465217..a07c2216d28b 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -54,6 +54,28 @@ static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, }; +static const struct netlbl_calipso_ops *calipso_ops; + +/** + * netlbl_calipso_ops_register - Register the CALIPSO operations + * @ops: ops to register + * + * Description: + * Register the CALIPSO packet engine operations. + * + */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) +{ + return xchg(&calipso_ops, ops); +} +EXPORT_SYMBOL(netlbl_calipso_ops_register); + +static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) +{ + return READ_ONCE(calipso_ops); +} + /* NetLabel Command Handlers */ /** @@ -96,15 +118,18 @@ static int netlbl_calipso_add_pass(struct genl_info *info, * */ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) - { int ret_val = -EINVAL; struct netlbl_audit audit_info; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (!info->attrs[NLBL_CALIPSO_A_DOI] || !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; + if (!ops) + return -EOPNOTSUPP; + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: @@ -363,28 +388,6 @@ int __init netlbl_calipso_genl_init(void) return genl_register_family(&netlbl_calipso_gnl_family); } -static const struct netlbl_calipso_ops *calipso_ops; - -/** - * netlbl_calipso_ops_register - Register the CALIPSO operations - * @ops: ops to register - * - * Description: - * Register the CALIPSO packet engine operations. - * - */ -const struct netlbl_calipso_ops * -netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) -{ - return xchg(&calipso_ops, ops); -} -EXPORT_SYMBOL(netlbl_calipso_ops_register); - -static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) -{ - return READ_ONCE(calipso_ops); -} - /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure From a7fb0423c201ba12815877a0b5a68a6a1710b23a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 7 Dec 2023 08:46:14 -0500 Subject: [PATCH 0370/1562] cgroup: Move rcu_head up near the top of cgroup_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() for freeing the cgroup_root. The current implementation of kvfree_rcu(), however, has the limitation that the offset of the rcu_head structure within the larger data structure must be less than 4096 or the compilation will fail. See the macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h for more information. By putting rcu_head below the large cgroup structure, any change to the cgroup structure that makes it larger run the risk of causing build failure under certain configurations. Commit 77070eeb8821 ("cgroup: Avoid false cacheline sharing of read mostly rstat_cpu") happens to be the last straw that breaks it. Fix this problem by moving the rcu_head structure up before the cgroup structure. Fixes: d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ Signed-off-by: Waiman Long Acked-by: Yafang Shao Reviewed-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5a97ea95b564..ea48c861cd36 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -562,6 +562,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; + /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the @@ -575,10 +579,6 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; - /* A list running through the active hierarchies */ - struct list_head root_list; - struct rcu_head rcu; - /* Hierarchy-specific flags */ unsigned int flags; From c3aeaf2f0ec8af93189488bda3928a1ac7752388 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Dec 2023 19:02:54 +0200 Subject: [PATCH 0371/1562] spi: pxa2xx: Use inclusive language Replace master/slave by host/peripheral language in the documentation. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231208170436.3309648-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- Documentation/spi/pxa2xx.rst | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/spi/pxa2xx.rst b/Documentation/spi/pxa2xx.rst index 04f2a3856c40..143f1df83f79 100644 --- a/Documentation/spi/pxa2xx.rst +++ b/Documentation/spi/pxa2xx.rst @@ -3,13 +3,13 @@ PXA2xx SPI on SSP driver HOWTO ============================== This a mini HOWTO on the pxa2xx_spi driver. The driver turns a PXA2xx -synchronous serial port into an SPI master controller +synchronous serial port into an SPI host controller (see Documentation/spi/spi-summary.rst). The driver has the following features - Support for any PXA2xx and compatible SSP. - SSP PIO and SSP DMA data transfers. - External and Internal (SSPFRM) chip selects. -- Per slave device (chip) configuration. +- Per peripheral device (chip) configuration. - Full suspend, freeze, resume support. The driver is built around a &struct spi_message FIFO serviced by kernel @@ -17,10 +17,10 @@ thread. The kernel thread, spi_pump_messages(), drives message FIFO and is responsible for queuing SPI transactions and setting up and launching the DMA or interrupt driven transfers. -Declaring PXA2xx Master Controllers ------------------------------------ -Typically, for a legacy platform, an SPI master is defined in the -arch/.../mach-*/board-*.c as a "platform device". The master configuration +Declaring PXA2xx host controllers +--------------------------------- +Typically, for a legacy platform, an SPI host controller is defined in the +arch/.../mach-*/board-*.c as a "platform device". The host controller configuration is passed to the driver via a table found in include/linux/spi/pxa2xx_spi.h:: struct pxa2xx_spi_controller { @@ -30,7 +30,7 @@ is passed to the driver via a table found in include/linux/spi/pxa2xx_spi.h:: }; The "pxa2xx_spi_controller.num_chipselect" field is used to determine the number of -slave device (chips) attached to this SPI master. +peripheral devices (chips) attached to this SPI host controller. The "pxa2xx_spi_controller.enable_dma" field informs the driver that SSP DMA should be used. This caused the driver to acquire two DMA channels: Rx channel and @@ -40,8 +40,8 @@ See the "PXA2xx Developer Manual" section "DMA Controller". For the new platforms the description of the controller and peripheral devices comes from Device Tree or ACPI. -NSSP MASTER SAMPLE ------------------- +NSSP HOST SAMPLE +---------------- Below is a sample configuration using the PXA255 NSSP for a legacy platform:: static struct resource pxa_spi_nssp_resources[] = { @@ -57,7 +57,7 @@ Below is a sample configuration using the PXA255 NSSP for a legacy platform:: }, }; - static struct pxa2xx_spi_controller pxa_nssp_master_info = { + static struct pxa2xx_spi_controller pxa_nssp_controller_info = { .num_chipselect = 1, /* Matches the number of chips attached to NSSP */ .enable_dma = 1, /* Enables NSSP DMA */ }; @@ -68,7 +68,7 @@ Below is a sample configuration using the PXA255 NSSP for a legacy platform:: .resource = pxa_spi_nssp_resources, .num_resources = ARRAY_SIZE(pxa_spi_nssp_resources), .dev = { - .platform_data = &pxa_nssp_master_info, /* Passed to driver */ + .platform_data = &pxa_nssp_controller_info, /* Passed to driver */ }, }; @@ -81,17 +81,17 @@ Below is a sample configuration using the PXA255 NSSP for a legacy platform:: (void)platform_add_device(devices, ARRAY_SIZE(devices)); } -Declaring Slave Devices ------------------------ -Typically, for a legacy platform, each SPI slave (chip) is defined in the +Declaring peripheral devices +---------------------------- +Typically, for a legacy platform, each SPI peripheral device (chip) is defined in the arch/.../mach-*/board-*.c using the "spi_board_info" structure found in "linux/spi/spi.h". See "Documentation/spi/spi-summary.rst" for additional information. -Each slave device attached to the PXA must provide slave specific configuration +Each peripheral device (chip) attached to the PXA2xx must provide specific chip configuration information via the structure "pxa2xx_spi_chip" found in -"include/linux/spi/pxa2xx_spi.h". The pxa2xx_spi master controller driver -will uses the configuration whenever the driver communicates with the slave +"include/linux/spi/pxa2xx_spi.h". The PXA2xx host controller driver will use +the configuration whenever the driver communicates with the peripheral device. All fields are optional. :: @@ -123,7 +123,7 @@ dma_burst_size == 0. The "pxa2xx_spi_chip.timeout" fields is used to efficiently handle trailing bytes in the SSP receiver FIFO. The correct value for this field is dependent on the SPI bus speed ("spi_board_info.max_speed_hz") and the specific -slave device. Please note that the PXA2xx SSP 1 does not support trailing byte +peripheral device. Please note that the PXA2xx SSP 1 does not support trailing byte timeouts and must busy-wait any trailing bytes. NOTE: the SPI driver cannot control the chip select if SSPFRM is used, so the @@ -132,8 +132,8 @@ asserted around the complete message. Use SSPFRM as a GPIO (through a descriptor to accommodate these chips. -NSSP SLAVE SAMPLE ------------------ +NSSP PERIPHERAL SAMPLE +---------------------- For a legacy platform or in some other cases, the pxa2xx_spi_chip structure is passed to the pxa2xx_spi driver in the "spi_board_info.controller_data" field. Below is a sample configuration using the PXA255 NSSP. @@ -161,16 +161,16 @@ field. Below is a sample configuration using the PXA255 NSSP. .bus_num = 2, /* Framework bus number */ .chip_select = 0, /* Framework chip select */ .platform_data = NULL; /* No spi_driver specific config */ - .controller_data = &cs8415a_chip_info, /* Master chip config */ - .irq = STREETRACER_APCI_IRQ, /* Slave device interrupt */ + .controller_data = &cs8415a_chip_info, /* Host controller config */ + .irq = STREETRACER_APCI_IRQ, /* Peripheral device interrupt */ }, { .modalias = "cs8405a", /* Name of spi_driver for this device */ .max_speed_hz = 3686400, /* Run SSP as fast a possible */ .bus_num = 2, /* Framework bus number */ .chip_select = 1, /* Framework chip select */ - .controller_data = &cs8405a_chip_info, /* Master chip config */ - .irq = STREETRACER_APCI_IRQ, /* Slave device interrupt */ + .controller_data = &cs8405a_chip_info, /* Host controller config */ + .irq = STREETRACER_APCI_IRQ, /* Peripheral device interrupt */ }, }; From 8bc2a3634b87e2235535b5527f83ff529df68b56 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Dec 2023 19:02:55 +0200 Subject: [PATCH 0372/1562] spi: pxa2xx: Update DMA mapping and using logic in the documentation Update DMA mapping and using logic in the documentation to follow what the code does. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231208170436.3309648-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- Documentation/spi/pxa2xx.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Documentation/spi/pxa2xx.rst b/Documentation/spi/pxa2xx.rst index 143f1df83f79..19479b801826 100644 --- a/Documentation/spi/pxa2xx.rst +++ b/Documentation/spi/pxa2xx.rst @@ -193,17 +193,14 @@ mode supports both coherent and stream based DMA mappings. The following logic is used to determine the type of I/O to be used on a per "spi_transfer" basis:: - if !enable_dma then - always use PIO transfers + if spi_message.len > 65536 then + if spi_message.is_dma_mapped or rx_dma_buf != 0 or tx_dma_buf != 0 then + reject premapped transfers - if spi_message.len > 8191 then print "rate limited" warning use PIO transfers - if spi_message.is_dma_mapped and rx_dma_buf != 0 and tx_dma_buf != 0 then - use coherent DMA mode - - if rx_buf and tx_buf are aligned on 8 byte boundary then + if enable_dma and the size is in the range [DMA burst size..65536] then use streaming DMA mode otherwise From 52c9a884c6388171f4c6cdafd9add042a7abec53 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Dec 2023 10:59:27 -0800 Subject: [PATCH 0373/1562] spi: mpc52xx: explicitly include linux/platform_device.h Since linux/of_platform.h had included linux/platform_device.h and since that inclusion was removed, this driver now needs to include the latter header file explicitly to prevent build errors: drivers/spi/spi-mpc52xx.c: In function 'mpc52xx_spi_probe': drivers/spi/spi-mpc52xx.c:396:20: error: invalid use of undefined type 'struct platform_device' and more like that. Fixes: 0d18bcdebb2f ("of: Stop circularly including of_device.h and of_platform.h") Signed-off-by: Randy Dunlap Cc: Rob Herring Cc: Mark Brown Cc: linux-spi@vger.kernel.org Link: https://lore.kernel.org/r/20231208185927.14124-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- drivers/spi/spi-mpc52xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-mpc52xx.c b/drivers/spi/spi-mpc52xx.c index 4a6c984b6bff..d5ac60c135c2 100644 --- a/drivers/spi/spi-mpc52xx.c +++ b/drivers/spi/spi-mpc52xx.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include From 33318c0e6ba64876050def6432f80387c89d0fe6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Dec 2023 16:19:01 -0800 Subject: [PATCH 0374/1562] fscrypt.rst: update definition of struct fscrypt_context_v2 Get the copy of the fscrypt_context_v2 definition in the documentation in sync with the actual definition, which was changed recently by commit 5b1188847180 ("fscrypt: support crypto data unit size less than filesystem block size"). Link: https://lore.kernel.org/r/20231206001901.14371-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fscrypt.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 1b84f818e574..8d38b47b7b83 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -1382,7 +1382,8 @@ directory.) These structs are defined as follows:: u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; - u8 __reserved[4]; + u8 log2_data_unit_size; + u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; From 0fc24a6549f9b6efc538b67a098ab577b1f9a00e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 Dec 2023 16:21:27 -0800 Subject: [PATCH 0375/1562] fscrypt: update comment for do_remove_key() Adjust a comment that was missed during commit 15baf55481de ("fscrypt: track master key presence separately from secret"). Link: https://lore.kernel.org/r/20231206002127.14790-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/keyring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index f34a9b0b9e92..0edf0b58daa7 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -1002,9 +1002,9 @@ static int try_to_lock_encrypted_files(struct super_block *sb, * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * - * To "remove the key itself", first we wipe the actual master key secret, so - * that no more inodes can be unlocked with it. Then we try to evict all cached - * inodes that had been unlocked with the key. + * To "remove the key itself", first we transition the key to the "incompletely + * removed" state, so that no more inodes can be unlocked with it. Then we try + * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" From 23e9f0138963ceef2a252d887534923a0502b2da Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 3 Nov 2023 11:14:50 +0800 Subject: [PATCH 0376/1562] mm/vmstat: move pgdemote_* to per-node stats Demotion will migrate pages across nodes. Previously, only the global demotion statistics were accounted for. Changed them to per-node statistics, making it easier to observe where demotion occurs on each node. This will help to identify which nodes are under pressure. This patch also make pgdemote_* behind CONFIG_NUMA_BALANCING, since demotion is not available for !CONFIG_NUMA_BALANCING With this patch, here is a sample where node0 node1 are DRAM, node3 is PMEM: Global stats: $ grep demote /proc/vmstat pgdemote_kswapd 254288 pgdemote_direct 113497 pgdemote_khugepaged 0 Per-node stats: $ grep demote /sys/devices/system/node/node0/vmstat # demotion source pgdemote_kswapd 68454 pgdemote_direct 83431 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node1/vmstat # demotion source pgdemote_kswapd 185834 pgdemote_direct 30066 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node3/vmstat # demotion target pgdemote_kswapd 0 pgdemote_direct 0 pgdemote_khugepaged 0 Link: https://lkml.kernel.org/r/20231103031450.1456523-1-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: "Huang, Ying" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++++ include/linux/vm_event_item.h | 3 --- mm/vmscan.c | 12 ++++++++---- mm/vmstat.c | 6 +++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3c25226beeed..14faffa4354f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -206,6 +206,10 @@ enum node_stat_item { #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /* PGDEMOTE_*: pages demoted */ + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, #endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa1240040..d1b847502f09 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, - PGDEMOTE_KSWAPD, - PGDEMOTE_DIRECT, - PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, diff --git a/mm/vmscan.c b/mm/vmscan.c index 506f8220c5fe..5dc581cac225 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -409,12 +409,14 @@ void drop_slab(void) static int reclaimer_offset(void) { +#ifdef CONFIG_NUMA_BALANCING BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); - BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != - PGSCAN_DIRECT - PGSCAN_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); +#endif + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGSCAN_DIRECT - PGSCAN_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); @@ -976,8 +978,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - - __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); +#ifdef CONFIG_NUMA_BALANCING + mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(), + nr_succeeded); +#endif return nr_succeeded; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 359460deb377..afa5a38fcc9c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1248,6 +1248,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", "pgpromote_candidate", + "pgdemote_kswapd", + "pgdemote_direct", + "pgdemote_khugepaged", #endif /* enum writeback_stat_item counters */ @@ -1279,9 +1282,6 @@ const char * const vmstat_text[] = { "pgsteal_kswapd", "pgsteal_direct", "pgsteal_khugepaged", - "pgdemote_kswapd", - "pgdemote_direct", - "pgdemote_khugepaged", "pgscan_kswapd", "pgscan_direct", "pgscan_khugepaged", From 4f2267b58a22d972be98edef8e6b3c7a67c9fb91 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:36 +0800 Subject: [PATCH 0377/1562] maple_tree: add mt_free_one() and mt_attr() helpers Patch series "Introduce __mt_dup() to improve the performance of fork()", v7. This series introduces __mt_dup() to improve the performance of fork(). During the duplication process of mmap, all VMAs are traversed and inserted one by one into the new maple tree, causing the maple tree to be rebalanced multiple times. Balancing the maple tree is a costly operation. To duplicate VMAs more efficiently, mtree_dup() and __mt_dup() are introduced for the maple tree. They can efficiently duplicate a maple tree. Here are some algorithmic details about {mtree,__mt}_dup(). We perform a DFS pre-order traversal of all nodes in the source maple tree. During this process, we fully copy the nodes from the source tree to the new tree. This involves memory allocation, and when encountering a new node, if it is a non-leaf node, all its child nodes are allocated at once. This idea was originally from Liam R. Howlett's Maple Tree Work email, and I added some of my own ideas to implement it. Some previous discussions can be found in [1]. For a more detailed analysis of the algorithm, please refer to the logs for patch [3/10] and patch [10/10]. There is a "spawn" in byte-unixbench[2], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% Thanks to Liam and Matthew for the review. This patch (of 10): Add two helpers: 1. mt_free_one(), used to free a maple node. 2. mt_attr(), used to obtain the attributes of maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20231027033845.90608-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bb24d84a4922..ca7039633844 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -165,6 +165,11 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } +static inline void mt_free_one(struct maple_node *node) +{ + kmem_cache_free(maple_node_cache, node); +} + static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -205,6 +210,11 @@ static unsigned int mas_mt_height(struct ma_state *mas) return mt_height(mas->tree); } +static inline unsigned int mt_attr(struct maple_tree *mt) +{ + return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; +} + static inline enum maple_type mte_node_type(const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & @@ -5573,7 +5583,7 @@ void mas_destroy(struct ma_state *mas) mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } - kmem_cache_free(maple_node_cache, node); + mt_free_one(ma_mnode_ptr(node)); total--; } From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: [PATCH 0378/1562] maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ++++ tools/include/linux/spinlock.h | 1 + 2 files changed, 5 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index d01e850b570f..f91dbc7fe091 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -256,6 +256,8 @@ struct maple_tree { struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_lock_nested(mas, subclass) \ + spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* @@ -406,6 +408,8 @@ struct ma_wr_state { }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_lock_nested(mas, subclass) \ + spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..a6cdf25b6b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -11,6 +11,7 @@ #define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock(x) pthread_mutex_lock(x) +#define spin_lock_nested(x, subclass) pthread_mutex_lock(x) #define spin_unlock(x) pthread_mutex_unlock(x) #define spin_lock_bh(x) pthread_mutex_lock(x) #define spin_unlock_bh(x) pthread_mutex_unlock(x) From fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:38 +0800 Subject: [PATCH 0379/1562] maple_tree: introduce interfaces __mt_dup() and mtree_dup() Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal. It uses memcopy() to copy nodes in the source tree and allocate new child nodes in non-leaf nodes. The new node is exactly the same as the source node except for all the addresses stored in it. It will be faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n). The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally. Analysis of the average time complexity of this algorithm: For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree. Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series, and it has log(n) terms with base 16. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges. This algorithm consists of two operations: 1. Traversing all nodes in DFS order. 2. For each node, making a copy and performing necessary modifications to create a new node. For the first part, DFS traversal will visit each edge twice. Let T(ascend) represent the cost of taking one step downwards, and T(descend) represent the cost of taking one step upwards. And both of them are constants (although mas_ascend() may not be, as it contains a loop, but here we ignore it and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)). For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc). Adding both parts together, the total time spent by the algorithm can be represented as: ((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend)) Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc) Let C2 = T(dup_alloc) Let C3 = T(ascend) + T(descend) Finally, the expression can be simplified as: ((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3). This is a linear function, so the average time complexity is O(n). Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 + lib/maple_tree.c | 274 +++++++++++++++++++++++++++++++++++++ 2 files changed, 277 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index f91dbc7fe091..a452dd8a1e5c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -329,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); + void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ca7039633844..718a222cc090 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4,6 +4,8 @@ * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett * Matthew Wilcox + * Copyright (c) 2023 ByteDance + * Author: Peng Zhang */ /* @@ -6475,6 +6477,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index) } EXPORT_SYMBOL(mtree_erase); +/* + * mas_dup_free() - Free an incomplete duplication of a tree. + * @mas: The maple state of a incomplete tree. + * + * The parameter @mas->node passed in indicates that the allocation failed on + * this node. This function frees all nodes starting from @mas->node in the + * reverse order of mas_dup_build(). There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. */ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. mas_dup_free() will free the incomplete duplication of a tree. + * + * Note that the attributes of the two trees need to be exactly the same, and the + * new tree needs to be empty, otherwise -EINVAL will be set in @mas. + */ +static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node; + struct maple_pnode *parent = NULL; + struct maple_enode *root; + enum maple_type type; + + if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || + unlikely(!mtree_empty(new_mas->tree))) { + mas_set_err(mas, -EINVAL); + return; + } + + root = mas_start(mas); + if (mas_is_ptr(mas) || mas_is_none(mas)) + goto set_new_tree; + + node = mt_alloc_one(gfp); + if (!node) { + new_mas->node = MAS_NONE; + mas_set_err(mas, -ENOMEM); + return; + } + + type = mte_node_type(mas->node); + root = mt_mk_node(node, type); + new_mas->node = root; + new_mas->min = 0; + new_mas->max = ULONG_MAX; + root = mte_mk_root(root); + while (1) { + mas_copy_node(mas, new_mas, parent); + if (!mte_is_leaf(mas->node)) { + /* Only allocate child nodes for non-leaf nodes. */ + mas_dup_alloc(mas, new_mas, gfp); + if (unlikely(mas_is_err(mas))) + return; + } else { + /* + * This is the last leaf node and duplication is + * completed. + */ + if (mas->max == ULONG_MAX) + goto done; + + /* This is not the last leaf node and needs to go up. */ + do { + mas_ascend(mas); + mas_ascend(new_mas); + } while (mas->offset == mas_data_end(mas)); + + /* Move to the next subtree. */ + mas->offset++; + new_mas->offset++; + } + + mas_descend(mas); + parent = ma_parent_ptr(mte_to_node(new_mas->node)); + mas_descend(new_mas); + mas->offset = 0; + new_mas->offset = 0; + } +done: + /* Specially handle the parent of the root node. */ + mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); +set_new_tree: + /* Make them the same height */ + new_mas->tree->ma_flags = mas->tree->ma_flags; + rcu_assign_pointer(new_mas->tree->ma_root, root); +} + +/** + * __mt_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * Note that the user needs to manually lock the source tree and the new tree. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_dup_build(&mas, &new_mas, gfp); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + return ret; +} +EXPORT_SYMBOL(__mt_dup); + +/** + * mtree_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_lock(&new_mas); + mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); + mas_dup_build(&mas, &new_mas, gfp); + mas_unlock(&mas); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + mas_unlock(&new_mas); + return ret; +} +EXPORT_SYMBOL(mtree_dup); + /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree From 46c99e26f2f86260fed226cab217d0b3ca8dca56 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:39 +0800 Subject: [PATCH 0380/1562] radix tree test suite: align kmem_cache_alloc_bulk() with kernel behavior. When kmem_cache_alloc_bulk() fails to allocate, leave the freed pointers in the array. This enables a more accurate simulation of the kernel's behavior and allows for testing potential double-free scenarios. Link: https://lkml.kernel.org/r/20231027033845.90608-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux.c | 45 +++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 61fe2601cb3a..4eb442206d01 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size); } else { p[i] = malloc(cachep->size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); From a2587a7e8d37885dc063255f5400a66299b42e48 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:40 +0800 Subject: [PATCH 0381/1562] maple_tree: add test for mtree_dup() Add test for mtree_dup(). Test by duplicating different maple trees and then comparing the two trees. Includes tests for duplicating full trees and memory allocation failures on different nodes. Link: https://lkml.kernel.org/r/20231027033845.90608-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 361 +++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index e5da1cad70ba..12b3390e9591 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35857,6 +35857,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. + */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. + */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. + */ +static __init int build_full_tree(struct maple_tree *mt, unsigned int flags, + int height) +{ + MA_STATE(mas, mt, 0, 0); + unsigned long step; + int ret = 0, cnt = 1; + enum maple_type type; + + mt_init_flags(mt, flags); + mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL); + + mtree_lock(mt); + + while (1) { + mas_set(&mas, 0); + if (mt_height(mt) < height) { + mas.max = ULONG_MAX; + goto store; + } + + while (1) { + mas_dfs_preorder(&mas); + if (mas_is_none(&mas)) + goto unlock; + + type = mte_node_type(mas.node); + if (mas_data_end(&mas) + 1 < mt_slots[type]) { + mas_set(&mas, mas.min); + goto store; + } + } +store: + mas_subtree_max_range(&mas); + step = mas.last - mas.index; + if (step < 1) { + ret = -1; + goto unlock; + } + + step /= 2; + mas.last = mas.index + step; + mas_store_gfp(&mas, xa_mk_value(5), + GFP_KERNEL); + ++cnt; + } +unlock: + mtree_unlock(mt); + + MT_BUG_ON(mt, mt_height(mt) != height); + /* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */ + return ret; +} + +static noinline void __init check_mtree_dup(struct maple_tree *mt) +{ + DEFINE_MTREE(new); + int i, j, ret, count = 0; + unsigned int rand_seed = 17, rand; + + /* store a value at [0, 0] */ + mt_init_flags(mt, 0); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + + /* The two trees have different attributes. */ + mt_init_flags(mt, 0); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* The new tree is not empty */ + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Test for duplicating full trees. */ + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, 0, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, 0); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test for normal duplicating. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test memory allocation failed. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i < 30; i += 3) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + /* Failed at the first node. */ + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(0); + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + MT_BUG_ON(&new, ret != -ENOMEM); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Random maple tree fails at a random node. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + /* + * The rand() library function is not used, so we can generate + * the same random numbers on any platform. + */ + rand_seed = rand_seed * 1103515245 + 12345; + rand = rand_seed / 65536 % 128; + mt_set_non_kernel(rand); + + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + if (ret != 0) { + MT_BUG_ON(&new, ret != -ENOMEM); + count++; + mtree_destroy(mt); + continue; + } + + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* pr_info("mtree_dup() fail %d times\n", count); */ + BUG_ON(!count); +} + extern void test_kmem_cache_bulk(void); void farmer_tests(void) @@ -35904,6 +36261,10 @@ void farmer_tests(void) check_null_expand(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, 0); + check_mtree_dup(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); From 9bc1d3cdb904170214456bca96c4924f28522ab8 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:41 +0800 Subject: [PATCH 0382/1562] maple_tree: update the documentation of maple tree Introduce the new interface mtree_dup() in the documentation. Link: https://lkml.kernel.org/r/20231027033845.90608-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/core-api/maple_tree.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 96f3d5f076b5..ccdd1615cf97 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -81,6 +81,9 @@ section. Sometimes it is necessary to ensure the next call to store to a maple tree does not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. +You can use mtree_dup() to duplicate an entire maple tree. It is a more +efficient way than inserting all elements one by one into a new tree. + Finally, you can remove all entries from a maple tree by calling mtree_destroy(). If the maple tree entries are pointers, you may wish to free the entries first. @@ -112,6 +115,7 @@ Takes ma_lock internally: * mtree_insert() * mtree_insert_range() * mtree_erase() + * mtree_dup() * mtree_destroy() * mt_set_in_rcu() * mt_clear_in_rcu() From f670fa1caadb4ea532a89012c5451e4c6789bfcc Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:42 +0800 Subject: [PATCH 0383/1562] maple_tree: skip other tests when BENCH is enabled Skip other tests when BENCH is enabled so that performance can be measured in user space. Link: https://lkml.kernel.org/r/20231027033845.90608-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 8 ++++---- tools/testing/radix-tree/maple.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 464eeb90d5ad..de470950714f 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3585,10 +3585,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -3646,6 +3642,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 12b3390e9591..cb5358674521 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36299,7 +36299,9 @@ void farmer_tests(void) void maple_tree_tests(void) { +#if !defined(BENCH) farmer_tests(); +#endif maple_tree_seed(); maple_tree_harvest(); } From 446e1867e6df3cbdd19af6be8f8f4ed56176adb4 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:43 +0800 Subject: [PATCH 0384/1562] maple_tree: update check_forking() and bench_forking() Updated check_forking() and bench_forking() to use __mt_dup() to duplicate maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 117 ++++++++++++++++++------------------ tools/include/linux/rwsem.h | 4 ++ 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index de470950714f..3e4597fb49d3 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1834,47 +1834,48 @@ static noinline void __init bench_mas_prev(struct maple_tree *mt) } #endif /* check_forking - simulate the kernel forking sequence with the tree. */ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1977,49 +1978,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -3615,9 +3618,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -3650,9 +3651,7 @@ static int __init maple_tree_seed(void) check_iteration(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); - mtree_destroy(&tree); + check_forking(); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ From 8e50d32c7a89bde896945e4e572ef28ccd87bbf8 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:44 +0800 Subject: [PATCH 0385/1562] maple_tree: preserve the tree attributes when destroying maple tree When destroying maple tree, preserve its attributes and then turn it into an empty tree. This allows it to be reused without needing to be reinitialized. Link: https://lkml.kernel.org/r/20231027033845.90608-10-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 718a222cc090..4439469442c7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6763,7 +6763,7 @@ void __mt_destroy(struct maple_tree *mt) if (xa_is_node(root)) mte_destroy_walk(root, mt); - mt->ma_flags = 0; + mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: [PATCH 0386/1562] fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++++ kernel/fork.c | 40 +++++++++++++++++++++++++++++----------- mm/internal.h | 11 ----------- mm/memory.c | 7 ++++++- mm/mmap.c | 9 ++++++--- 5 files changed, 52 insertions(+), 26 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..64cd1ee4aacc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, return mas_expected_entries(&vmi->mas, count); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..93924392a5c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - VMA_ITERATOR(old_vmi, oldmm, 0); VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); @@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(old_vmi, mpnt) { + for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ - if (vma_iter_bulk_store(&vmi, tmp)) - goto fail_nomem_vmi_store; + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = vma_next(&vmi); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); - if (!retval) + if (!retval) { mt_set_in_rcu(vmi.mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -778,8 +798,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_vmi_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/internal.h b/mm/internal.h index b61034bd50f5..89a5a794d68f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi) mas_store_prealloc(&vmi->mas, NULL); } -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - __mas_set_range(&vmi->mas, start, end - 1); - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); diff --git a/mm/memory.c b/mm/memory.c index 5c757fba8858..a7025ed5c65b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, * be 0. This will underflow and is okay. */ next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); - } while ((vma = mas_find(mas, tree_end - 1)) != NULL); + vma = mas_find(mas, tree_end - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index 1971bfffcc03..4f1cb814586d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); From 3027c6f8eb9d3857aef08f401aeb7de715410525 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 30 Oct 2023 09:11:47 +0800 Subject: [PATCH 0387/1562] mm: huge_memory: batch tlb flush when splitting a pte-mapped THP I can observe an obvious tlb flush hotspot when splitting a pte-mapped THP on my ARM64 server, and the distribution of this hotspot is as follows: - 16.85% split_huge_page_to_list + 7.80% down_write - 7.49% try_to_migrate - 7.48% rmap_walk_anon 7.23% ptep_clear_flush + 1.52% __split_huge_page The reason is that the split_huge_page_to_list() will build migration entries for each subpage of a pte-mapped Anon THP by try_to_migrate(), or unmap for file THP, and it will clear and tlb flush for each subpage's pte. Moreover, the split_huge_page_to_list() will set TTU_SPLIT_HUGE_PMD flag to ensure the THP is already a pte-mapped THP before splitting it to some normal pages. Actually, there is no need to flush tlb for each subpage immediately, instead we can batch tlb flush for the pte-mapped THP to improve the performance. After this patch, we can see the batch tlb flush can improve the latency obviously when running thpscale. k6.5-base patched Amean fault-both-1 1071.17 ( 0.00%) 901.83 * 15.81%* Amean fault-both-3 2386.08 ( 0.00%) 1865.32 * 21.82%* Amean fault-both-5 2851.10 ( 0.00%) 2273.84 * 20.25%* Amean fault-both-7 3679.91 ( 0.00%) 2881.66 * 21.69%* Amean fault-both-12 5916.66 ( 0.00%) 4369.55 * 26.15%* Amean fault-both-18 7981.36 ( 0.00%) 6303.57 * 21.02%* Amean fault-both-24 10950.79 ( 0.00%) 8752.56 * 20.07%* Amean fault-both-30 14077.35 ( 0.00%) 10170.01 * 27.76%* Amean fault-both-32 13061.57 ( 0.00%) 11630.08 * 10.96%* Link: https://lkml.kernel.org/r/431d9fb6823036369dcb1d3b2f63732f01df21a7.1698488264.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Alistair Popple Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4f542444a91f..6eb55f97a3d2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2379,7 +2379,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC; + TTU_SYNC | TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -2392,6 +2392,8 @@ static void unmap_folio(struct folio *folio) try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); + + try_to_unmap_flush(); } static void remap_page(struct folio *folio, unsigned long nr) From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001 From: Liam Ni Date: Thu, 26 Oct 2023 10:03:29 +0800 Subject: [PATCH 0388/1562] NUMA: optimize detection of memory with no node id assigned by firmware Sanity check that makes sure the nodes cover all memory loops over numa_meminfo to count the pages that have node id assigned by the firmware, then loops again over memblock.memory to find the total amount of memory and in the end checks that the difference between the total memory and memory that covered by nodes is less than some threshold. Worse, the loop over numa_meminfo calls __absent_pages_in_range() that also partially traverses memblock.memory. It's much simpler and more efficient to have a single traversal of memblock.memory that verifies that amount of memory not covered by nodes is less than a threshold. Introduce memblock_validate_numa_coverage() that does exactly that and use it instead of numa_meminfo_cover_memory(). Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com Signed-off-by: Liam Ni Reviewed-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Bibo Mao Cc: Binbin Zhou Cc: Borislav Petkov Cc: Dave Hansen Cc: Feiyang Chen Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/kernel/numa.c | 28 +--------------------------- arch/x86/mm/numa.c | 34 ++-------------------------------- include/linux/memblock.h | 1 + mm/memblock.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 59 deletions(-) diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 6e65ff12d5c7..8fe21f868f72 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -226,32 +226,6 @@ static void __init node_mem_init(unsigned int node) #ifdef CONFIG_ACPI_NUMA -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - int i; - u64 numaram, biosram; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - u64 s = mi->blk[i].start >> PAGE_SHIFT; - u64 e = mi->blk[i].end >> PAGE_SHIFT; - - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((s64)numaram < 0) - numaram = 0; - } - max_pfn = max_low_pfn; - biosram = max_pfn - absent_pages_in_range(0, max_pfn); - - BUG_ON((s64)(biosram - numaram) >= (1 << (20 - PAGE_SHIFT))); - return true; -} - static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type) { static unsigned long num_physpages; @@ -396,7 +370,7 @@ int __init init_numa_memory(void) return -EINVAL; init_node_memblock(); - if (numa_meminfo_cover_memory(&numa_meminfo) == false) + if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; for_each_node_mask(node, node_possible_map) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b29ceb19e46e..adc497b93f03 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -449,37 +449,6 @@ int __node_distance(int from, int to) } EXPORT_SYMBOL(__node_distance); -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - u64 numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - u64 s = mi->blk[i].start >> PAGE_SHIFT; - u64 e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((s64)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); - - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - /* * Mark all currently memblock-reserved physical memory (which covers the * kernel's own memory ranges) as hot-unswappable. @@ -585,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return -EINVAL; } } - if (!numa_meminfo_cover_memory(mi)) + + if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ae3bde302f70..b695f9e946da 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); +bool memblock_validate_numa_coverage(unsigned long threshold_bytes); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index 5a88d6d24d79..4a62f7774b65 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -734,6 +734,40 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } +/** + * memblock_validate_numa_coverage - check if amount of memory with + * no node ID assigned is less than a threshold + * @threshold_bytes: maximal number of pages that can have unassigned node + * ID (in bytes). + * + * A buggy firmware may report memory that does not belong to any node. + * Check if amount of such memory is below @threshold_bytes. + * + * Return: true on success, false on failure. + */ +bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes) +{ + unsigned long nr_pages = 0; + unsigned long start_pfn, end_pfn, mem_size_mb; + int nid, i; + + /* calculate lose page */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (nid == NUMA_NO_NODE) + nr_pages += end_pfn - start_pfn; + } + + if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) { + mem_size_mb = memblock_phys_mem_size() >> 20; + pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", + (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); + return false; + } + + return true; +} + + /** * memblock_isolate_range - isolate given range into disjoint memblocks * @type: memblock type to isolate range for From 82b8a3b49ebde4e7246319884deeb29d6dc1b0cf Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 7 Nov 2023 00:22:41 -0700 Subject: [PATCH 0389/1562] mm/memory_hotplug: replace an open-coded kmemdup() in add_memory_resource() Patch series "mm: use memmap_on_memory semantics for dax/kmem", v10. The dax/kmem driver can potentially hot-add large amounts of memory originating from CXL memory expanders, or NVDIMMs, or other 'device memories'. There is a chance there isn't enough regular system memory available to fit the memmap for this new memory. It's therefore desirable, if all other conditions are met, for the kmem managed memory to place its memmap on the newly added memory itself. The main hurdle for accomplishing this for kmem is that memmap_on_memory can only be done if the memory being added is equal to the size of one memblock. To overcome this, allow the hotplug code to split an add_memory() request into memblock-sized chunks, and try_remove_memory() to also expect and handle such a scenario. Patch 1 replaces an open-coded kmemdup() Patch 2 teaches the memory_hotplug code to allow for splitting add_memory() and remove_memory() requests over memblock sized chunks. Patch 3 allows the dax region drivers to request memmap_on_memory semantics. CXL dax regions default this to 'on', all others default to off to keep existing behavior unchanged. This patch (of 3): A review of the memmap_on_memory modifications to add_memory_resource() revealed an instance of an open-coded kmemdup(). Replace it with kmemdup(). Link: https://lkml.kernel.org/r/20231107-vv-kmem_memmap-v10-0-1253ec050ed0@intel.com Link: https://lkml.kernel.org/r/20231107-vv-kmem_memmap-v10-1-1253ec050ed0@intel.com Signed-off-by: Vishal Verma Reviewed-by: David Hildenbrand Reviewed-by: Fan Ni Reported-by: Dan Williams Cc: Michal Hocko Cc: Oscar Salvador Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: Dave Jiang Cc: "Huang, Ying" Cc: Jeff Moyer Cc: Jonathan Cameron Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7a5fc89a8652..39528831e788 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1439,13 +1439,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { if (mhp_supports_memmap_on_memory(size)) { mhp_altmap.free = memory_block_memmap_on_memory_pages(); - params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL); + params.altmap = kmemdup(&mhp_altmap, + sizeof(struct vmem_altmap), + GFP_KERNEL); if (!params.altmap) { ret = -ENOMEM; goto error; } - - memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap)); } /* fallback to not using altmap */ } From 6b8f0798b85aa529011570369db985a788f3003f Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 7 Nov 2023 00:22:42 -0700 Subject: [PATCH 0390/1562] mm/memory_hotplug: split memmap_on_memory requests across memblocks The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to 'memblock_size' chunks of memory being added. Adding a larger span of memory precludes memmap_on_memory semantics. For users of hotplug such as kmem, large amounts of memory might get added from the CXL subsystem. In some cases, this amount may exceed the available 'main memory' to store the memmap for the memory being added. In this case, it is useful to have a way to place the memmap on the memory being added, even if it means splitting the addition into memblock-sized chunks. Change add_memory_resource() to loop over memblock-sized chunks of memory if caller requested memmap_on_memory, and if other conditions for it are met. Teach try_remove_memory() to also expect that a memory range being removed might have been split up into memblock sized chunks, and to loop through those as needed. This does preclude being able to use PUD mappings in the direct map; a proposal to how this could be optimized in the future is laid out here[1]. [1]: https://lore.kernel.org/linux-mm/b6753402-2de9-25b2-36e9-eacd49752b19@redhat.com/ Link: https://lkml.kernel.org/r/20231107-vv-kmem_memmap-v10-2-1253ec050ed0@intel.com Signed-off-by: Vishal Verma Suggested-by: David Hildenbrand Reviewed-by: Dan Williams Reviewed-by: "Huang, Ying" Acked-by: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Cc: Dave Jiang Cc: Dave Hansen Cc: Aneesh Kumar K.V Cc: Fan Ni Cc: Jeff Moyer Cc: Jonathan Cameron Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 212 ++++++++++++++++++++++++++++---------------- 1 file changed, 136 insertions(+), 76 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 39528831e788..926e1cfb10e9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1380,6 +1380,85 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) return arch_supports_memmap_on_memory(vmemmap_size); } +static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) +{ + unsigned long memblock_size = memory_block_size_bytes(); + u64 cur_start; + + /* + * For memmap_on_memory, the altmaps were added on a per-memblock + * basis; we have to process each individual memory block. + */ + for (cur_start = start; cur_start < start + size; + cur_start += memblock_size) { + struct vmem_altmap *altmap = NULL; + struct memory_block *mem; + + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); + if (WARN_ON_ONCE(!mem)) + continue; + + altmap = mem->altmap; + mem->altmap = NULL; + + remove_memory_block_devices(cur_start, memblock_size); + + arch_remove_memory(cur_start, memblock_size, altmap); + + /* Verify that all vmemmap pages have actually been freed. */ + WARN(altmap->alloc, "Altmap not fully unmapped"); + kfree(altmap); + } +} + +static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, + u64 start, u64 size) +{ + unsigned long memblock_size = memory_block_size_bytes(); + u64 cur_start; + int ret; + + for (cur_start = start; cur_start < start + size; + cur_start += memblock_size) { + struct mhp_params params = { .pgprot = + pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = { + .base_pfn = PHYS_PFN(cur_start), + .end_pfn = PHYS_PFN(cur_start + memblock_size - 1), + }; + + mhp_altmap.free = memory_block_memmap_on_memory_pages(); + params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), + GFP_KERNEL); + if (!params.altmap) { + ret = -ENOMEM; + goto out; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, cur_start, memblock_size, ¶ms); + if (ret < 0) { + kfree(params.altmap); + goto out; + } + + /* create memory block devices after memory was added */ + ret = create_memory_block_devices(cur_start, memblock_size, + params.altmap, group); + if (ret) { + arch_remove_memory(cur_start, memblock_size, NULL); + kfree(params.altmap); + goto out; + } + } + + return 0; +out: + if (ret && cur_start != start) + remove_memory_blocks_and_altmaps(start, cur_start - start); + return ret; +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). @@ -1390,10 +1469,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; - struct vmem_altmap mhp_altmap = { - .base_pfn = PHYS_PFN(res->start), - .end_pfn = PHYS_PFN(res->end), - }; struct memory_group *group = NULL; u64 start, size; bool new_node = false; @@ -1436,30 +1511,22 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* * Self hosted memmap array */ - if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { - if (mhp_supports_memmap_on_memory(size)) { - mhp_altmap.free = memory_block_memmap_on_memory_pages(); - params.altmap = kmemdup(&mhp_altmap, - sizeof(struct vmem_altmap), - GFP_KERNEL); - if (!params.altmap) { - ret = -ENOMEM; - goto error; - } + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && + mhp_supports_memmap_on_memory(memory_block_size_bytes())) { + ret = create_altmaps_and_memory_blocks(nid, group, start, size); + if (ret) + goto error; + } else { + ret = arch_add_memory(nid, start, size, ¶ms); + if (ret < 0) + goto error; + + /* create memory block devices after memory was added */ + ret = create_memory_block_devices(start, size, NULL, group); + if (ret) { + arch_remove_memory(start, size, params.altmap); + goto error; } - /* fallback to not using altmap */ - } - - /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, ¶ms); - if (ret < 0) - goto error_free; - - /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, params.altmap, group); - if (ret) { - arch_remove_memory(start, size, params.altmap); - goto error_free; } if (new_node) { @@ -1496,8 +1563,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; -error_free: - kfree(params.altmap); error: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); @@ -2067,17 +2132,13 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } -static int test_has_altmap_cb(struct memory_block *mem, void *arg) +static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg) { - struct memory_block **mem_ptr = (struct memory_block **)arg; - /* - * return the memblock if we have altmap - * and break callback. - */ - if (mem->altmap) { - *mem_ptr = mem; - return 1; - } + u64 *num_altmaps = (u64 *)arg; + + if (mem->altmap) + *num_altmaps += 1; + return 0; } @@ -2151,11 +2212,29 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +static int memory_blocks_have_altmaps(u64 start, u64 size) +{ + u64 num_memblocks = size / memory_block_size_bytes(); + u64 num_altmaps = 0; + + if (!mhp_memmap_on_memory()) + return 0; + + walk_memory_blocks(start, size, &num_altmaps, + count_memory_range_altmaps_cb); + + if (num_altmaps == 0) + return 0; + + if (WARN_ON_ONCE(num_memblocks != num_altmaps)) + return -EINVAL; + + return 1; +} + static int __ref try_remove_memory(u64 start, u64 size) { - struct memory_block *mem; - int rc = 0, nid = NUMA_NO_NODE; - struct vmem_altmap *altmap = NULL; + int rc, nid = NUMA_NO_NODE; BUG_ON(check_hotplug_memory_range(start, size)); @@ -2172,45 +2251,26 @@ static int __ref try_remove_memory(u64 start, u64 size) if (rc) return rc; - /* - * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in - * the same granularity it was added - a single memory block. - */ - if (mhp_memmap_on_memory()) { - rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb); - if (rc) { - if (size != memory_block_size_bytes()) { - pr_warn("Refuse to remove %#llx - %#llx," - "wrong granularity\n", - start, start + size); - return -EINVAL; - } - altmap = mem->altmap; - /* - * Mark altmap NULL so that we can add a debug - * check on memblock free. - */ - mem->altmap = NULL; - } - } - /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); - /* - * Memory block device removal under the device_hotplug_lock is - * a barrier against racing online attempts. - */ - remove_memory_block_devices(start, size); - mem_hotplug_begin(); - arch_remove_memory(start, size, altmap); - - /* Verify that all vmemmap pages have actually been freed. */ - if (altmap) { - WARN(altmap->alloc, "Altmap not fully unmapped"); - kfree(altmap); + rc = memory_blocks_have_altmaps(start, size); + if (rc < 0) { + mem_hotplug_done(); + return rc; + } else if (!rc) { + /* + * Memory block device removal under the device_hotplug_lock is + * a barrier against racing online attempts. + * No altmaps present, do the removal directly + */ + remove_memory_block_devices(start, size); + arch_remove_memory(start, size, NULL); + } else { + /* all memblocks in the range have altmaps */ + remove_memory_blocks_and_altmaps(start, size); } if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { From 4eca0ef49af9b2b0c52ef2b58e045ab34629796b Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 7 Nov 2023 00:22:43 -0700 Subject: [PATCH 0391/1562] dax/kmem: allow kmem to add memory with memmap_on_memory Large amounts of memory managed by the kmem driver may come in via CXL, and it is often desirable to have the memmap for this memory on the new memory itself. Enroll kmem-managed memory for memmap_on_memory semantics if the dax region originates via CXL. For non-CXL dax regions, retain the existing default behavior of hot adding without memmap_on_memory semantics. Link: https://lkml.kernel.org/r/20231107-vv-kmem_memmap-v10-3-1253ec050ed0@intel.com Signed-off-by: Vishal Verma Reviewed-by: Jonathan Cameron Reviewed-by: David Hildenbrand Reviewed-by: "Huang, Ying" Tested-by: Li Zhijian [cxl.kmem and nvdimm.kmem] Cc: Michal Hocko Cc: Oscar Salvador Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Aneesh Kumar K.V Cc: Fan Ni Cc: Jeff Moyer Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 3 +++ drivers/dax/bus.h | 1 + drivers/dax/cxl.c | 1 + drivers/dax/dax-private.h | 1 + drivers/dax/hmem/hmem.c | 1 + drivers/dax/kmem.c | 8 +++++++- drivers/dax/pmem.c | 1 + 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 1659b787b65f..1ff1ab5fa105 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -367,6 +367,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, .dax_region = dax_region, .size = 0, .id = -1, + .memmap_on_memory = false, }; struct dev_dax *dev_dax = devm_create_dev_dax(&data); @@ -1400,6 +1401,8 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) dev_dax->align = dax_region->align; ida_init(&dev_dax->ida); + dev_dax->memmap_on_memory = data->memmap_on_memory; + inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; dev->bus = &dax_bus_type; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 1ccd23360124..cbbf64443098 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -23,6 +23,7 @@ struct dev_dax_data { struct dev_pagemap *pgmap; resource_size_t size; int id; + bool memmap_on_memory; }; struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index 8bc9d04034d6..c696837ab23c 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -26,6 +26,7 @@ static int cxl_dax_region_probe(struct device *dev) .dax_region = dax_region, .id = -1, .size = range_len(&cxlr_dax->hpa_range), + .memmap_on_memory = true, }; return PTR_ERR_OR_ZERO(devm_create_dev_dax(&data)); diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 27cf2daaaa79..446617b73aea 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -70,6 +70,7 @@ struct dev_dax { struct ida ida; struct device dev; struct dev_pagemap *pgmap; + bool memmap_on_memory; int nr_range; struct dev_dax_range { unsigned long pgoff; diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 5d2ddef0f8f5..b9da69f92697 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -36,6 +36,7 @@ static int dax_hmem_probe(struct platform_device *pdev) .dax_region = dax_region, .id = -1, .size = region_idle ? 0 : range_len(&mri->range), + .memmap_on_memory = false, }; return PTR_ERR_OR_ZERO(devm_create_dev_dax(&data)); diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 369c698b7706..42ee360cf4e3 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "dax-private.h" #include "bus.h" @@ -93,6 +94,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) struct dax_kmem_data *data; struct memory_dev_type *mtype; int i, rc, mapped = 0; + mhp_t mhp_flags; int numa_node; int adist = MEMTIER_DEFAULT_DAX_ADISTANCE; @@ -179,12 +181,16 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) */ res->flags = IORESOURCE_SYSTEM_RAM; + mhp_flags = MHP_NID_IS_MGID; + if (dev_dax->memmap_on_memory) + mhp_flags |= MHP_MEMMAP_ON_MEMORY; + /* * Ensure that future kexec'd kernels will not treat * this as RAM automatically. */ rc = add_memory_driver_managed(data->mgid, range.start, - range_len(&range), kmem_name, MHP_NID_IS_MGID); + range_len(&range), kmem_name, mhp_flags); if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index ae0cb113a5d3..f3c6c67b8412 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -63,6 +63,7 @@ static struct dev_dax *__dax_pmem_probe(struct device *dev) .id = id, .pgmap = &pgmap, .size = range_len(&range), + .memmap_on_memory = false, }; return devm_create_dev_dax(&data); From 8ff252663d30f5c0bcb0bb336c1a5ed7c37d9730 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Tue, 7 Nov 2023 10:46:34 +0800 Subject: [PATCH 0392/1562] mm/filemap: increase usage of folio_next_index() helper Simplify code pattern of 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index() in filemap_get_folios_contig(). Link: https://lkml.kernel.org/r/20231107024635.4512-1-duminjie@vivo.com Signed-off-by: Minjie Du Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index f1c8c278310f..71f00539ac00 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2173,7 +2173,7 @@ update_start: if (nr) { folio = fbatch->folios[nr - 1]; - *start = folio->index + folio_nr_pages(folio); + *start = folio_next_index(folio); } out: rcu_read_unlock(); From e6a9a2cbc13bf43e4c03f57666e93d511249d5d7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 6 Nov 2023 14:09:58 -0800 Subject: [PATCH 0393/1562] fs/proc/task_mmu: report SOFT_DIRTY bits through the PAGEMAP_SCAN ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN ioctl returns information regarding page table entries. It is more efficient compared to reading pagemap files. CRIU can start to utilize this ioctl, but it needs info about soft-dirty bits to track memory changes. We are aware of a new method for tracking memory changes implemented in the PAGEMAP_SCAN ioctl. For CRIU, the primary advantage of this method is its usability by unprivileged users. However, it is not feasible to transparently replace the soft-dirty tracker with the new one. The main problem here is userfault descriptors that have to be preserved between pre-dump iterations. It means criu continues supporting the soft-dirty method to avoid breakage for current users. The new method will be implemented as a separate feature. [avagin@google.com: update tools/include/uapi/linux/fs.h] Link: https://lkml.kernel.org/r/20231107164139.576046-1-avagin@google.com Link: https://lkml.kernel.org/r/20231106220959.296568-1-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Muhammad Usama Anjum Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 1 + fs/proc/task_mmu.c | 17 ++++++++++++++++- include/uapi/linux/fs.h | 1 + tools/include/uapi/linux/fs.h | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fe17cf210426..f5f065c67615 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -253,6 +253,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_SWAPPED`` - Page is in swapped - ``PAGE_IS_PFNZERO`` - Page has zero PFN - ``PAGE_IS_HUGE`` - Page is THP or Hugetlb backed +- ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty The ``struct pm_scan_arg`` is used as the argument of the IOCTL. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..d19924bf0a39 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1761,7 +1761,7 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -1793,6 +1793,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; @@ -1806,6 +1808,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, !PageAnon(pfn_swap_entry_to_page(swp))) categories |= PAGE_IS_FILE; } + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -1853,12 +1857,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); @@ -1905,10 +1913,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -2007,6 +2019,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end, if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags From 600bca580579d8d8454cc8fe3290e2f8b9c01884 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 6 Nov 2023 14:09:59 -0800 Subject: [PATCH 0394/1562] selftests/mm: check that PAGEMAP_SCAN returns correct categories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, tests read page flags from /proc/pid/pagemap files. With this change, tests will check that PAGEMAP_SCAN return correct information too. [colin.i.king@gmail.com: fix spelling mistake "succedded" -> "succeeded"] Link: https://lkml.kernel.org/r/20231121093104.1728332-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/20231106220959.296568-2-avagin@google.com Signed-off-by: Andrei Vagin Signed-off-by: Colin Ian King Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Michał Mirosław [avagin@google.com: allow running tests on old kernels] Link: https://lkml.kernel.org/r/20231117181127.2574897-1-avagin@google.com Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/vm_util.c | 80 ++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 3082b40492dd..05736c615734 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "../kselftest.h" @@ -28,19 +29,92 @@ uint64_t pagemap_get_entry(int fd, char *start) return entry; } +static uint64_t __pagemap_scan_get_categories(int fd, char *start, struct page_region *r) +{ + struct pm_scan_arg arg; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + psize()); + arg.vec = (uintptr_t)r; + arg.vec_len = 1; + arg.flags = 0; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = 0; + arg.category_inverted = 0; + arg.category_mask = 0; + arg.category_anyof_mask = PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | PAGE_IS_FILE | + PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY; + arg.return_mask = arg.category_anyof_mask; + + return ioctl(fd, PAGEMAP_SCAN, &arg); +} + +static uint64_t pagemap_scan_get_categories(int fd, char *start) +{ + struct page_region r; + long ret; + + ret = __pagemap_scan_get_categories(fd, start, &r); + if (ret < 0) + ksft_exit_fail_msg("PAGEMAP_SCAN failed: %s\n", strerror(errno)); + if (ret == 0) + return 0; + return r.categories; +} + +/* `start` is any valid address. */ +static bool pagemap_scan_supported(int fd, char *start) +{ + static int supported = -1; + int ret; + + if (supported != -1) + return supported; + + /* Provide an invalid address in order to trigger EFAULT. */ + ret = __pagemap_scan_get_categories(fd, start, (struct page_region *) ~0UL); + if (ret == 0) + ksft_exit_fail_msg("PAGEMAP_SCAN succeeded unexpectedly\n"); + + supported = errno == EFAULT; + + return supported; +} + +static bool page_entry_is(int fd, char *start, char *desc, + uint64_t pagemap_flags, uint64_t pagescan_flags) +{ + bool m = pagemap_get_entry(fd, start) & pagemap_flags; + + if (pagemap_scan_supported(fd, start)) { + bool s = pagemap_scan_get_categories(fd, start) & pagescan_flags; + + if (m == s) + return m; + + ksft_exit_fail_msg( + "read and ioctl return unmatched results for %s: %d %d", desc, m, s); + } + return m; +} + bool pagemap_is_softdirty(int fd, char *start) { - return pagemap_get_entry(fd, start) & PM_SOFT_DIRTY; + return page_entry_is(fd, start, "soft-dirty", + PM_SOFT_DIRTY, PAGE_IS_SOFT_DIRTY); } bool pagemap_is_swapped(int fd, char *start) { - return pagemap_get_entry(fd, start) & PM_SWAP; + return page_entry_is(fd, start, "swap", PM_SWAP, PAGE_IS_SWAPPED); } bool pagemap_is_populated(int fd, char *start) { - return pagemap_get_entry(fd, start) & (PM_PRESENT | PM_SWAP); + return page_entry_is(fd, start, "populated", + PM_PRESENT | PM_SWAP, + PAGE_IS_PRESENT | PAGE_IS_SWAPPED); } unsigned long pagemap_get_pfn(int fd, char *start) From a4fc4a0c45f2617c3aa8b693739de264e0c09909 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:40 +0000 Subject: [PATCH 0395/1562] mm: add folio_zero_tail() and use it in ext4 Patch series "Add folio_zero_tail() and folio_fill_tail()". I'm trying to make it easier for filesystems with tailpacking / stuffing / inline data to use folios. The primary function here is folio_fill_tail(). You give it a pointer to memory where the data currently is, and it takes care of copying it into the folio at that offset. That works for gfs2 & iomap. Then There's Ext4. Rather than gin up some kind of specialist "Here's a two pointers to two blocks of memory" routine, just let it do its current thing, and let it call folio_zero_tail(), which is also called by folio_fill_tail(). Other filesystems can be converted later; these ones seemed like good examples as they're already partly or completely converted to folios. This patch (of 3): Instead of unmapping the folio after copying the data to it, then mapping it again to zero the tail, provide folio_zero_tail() to zero the tail of an already-mapped folio. [akpm@linux-foundation.org: fix kerneldoc argument ordering] Link: https://lkml.kernel.org/r/20231107212643.3490372-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231107212643.3490372-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Darrick J. Wong Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/inline.c | 3 +-- include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9a84a5f9fef4..d5bd1e3a5d36 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_folio(folio); + kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); - folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index be20cff4ba73..5ebd5e4dfbf8 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -483,6 +483,44 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, flush_dcache_folio(folio); } +/** + * folio_zero_tail - Zero the tail of a folio. + * @folio: The folio to zero. + * @offset: The byte offset in the folio to start zeroing at. + * @kaddr: The address the folio is currently mapped to. + * + * If you have already used kmap_local_folio() to map a folio, written + * some data to it and now need to zero the end of the folio (and flush + * the dcache), you can use this function. If you do not have the + * folio kmapped (eg the folio has been partially populated by DMA), + * use folio_zero_range() or folio_zero_segment() instead. + * + * Return: An address which can be passed to kunmap_local(). + */ +static inline __must_check void *folio_zero_tail(struct folio *folio, + size_t offset, void *kaddr) +{ + size_t len = folio_size(folio) - offset; + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memset(kaddr, 0, max); + kunmap_local(kaddr); + len -= max; + offset += max; + max = PAGE_SIZE; + kaddr = kmap_local_folio(folio, offset); + } + } + + memset(kaddr, 0, len); + flush_dcache_folio(folio); + + return kaddr; +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. From 6eaa266b54660f6b3654ad8902b4f7027054f55a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:41 +0000 Subject: [PATCH 0396/1562] mm: add folio_fill_tail() and use it in iomap The iomap code was limited to PAGE_SIZE bytes; generalise it to cover an arbitrary-sized folio, and move it to be a common helper. [akpm@linux-foundation.org: fix folio_fill_tail(), per Andreas Gruenbacher] Link: https://lkml.kernel.org/r/20231107212643.3490372-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Andreas Dilger Cc: Darrick J. Wong Cc: Theodore Ts'o Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 14 ++------------ include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f72df2babe56..093c4515b22a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, { const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; - size_t poff = offset_in_page(iomap->offset); size_t offset = offset_in_folio(folio, iomap->offset); - void *addr; if (folio_test_uptodate(folio)) return 0; - if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) - return -EIO; - if (WARN_ON_ONCE(size > PAGE_SIZE - - offset_in_page(iomap->inline_data))) - return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) ifs_alloc(iter->inode, folio, iter->flags); - addr = kmap_local_folio(folio, offset); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - poff - size); - kunmap_local(addr); - iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); + folio_fill_tail(folio, offset, iomap->inline_data, size); + iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); return 0; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5ebd5e4dfbf8..451c1dff0e87 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -521,6 +521,44 @@ static inline __must_check void *folio_zero_tail(struct folio *folio, return kaddr; } +/** + * folio_fill_tail - Copy some data to a folio and pad with zeroes. + * @folio: The destination folio. + * @offset: The offset into @folio at which to start copying. + * @from: The data to copy. + * @len: How many bytes of data to copy. + * + * This function is most useful for filesystems which support inline data. + * When they want to copy data from the inode into the page cache, this + * function does everything for them. It supports large folios even on + * HIGHMEM configurations. + */ +static inline void folio_fill_tail(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + char *to = kmap_local_folio(folio, offset); + + VM_BUG_ON(offset + len > folio_size(folio)); + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memcpy(to, from, max); + kunmap_local(to); + len -= max; + from += max; + offset += max; + max = PAGE_SIZE; + to = kmap_local_folio(folio, offset); + } + } + + memcpy(to, from, len); + to = folio_zero_tail(folio, offset + len, to + len); + kunmap_local(to); +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. From 78c3c11268c38c5fb5b58f6be1bb018f3f1c195e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:42 +0000 Subject: [PATCH 0397/1562] gfs2: convert stuffed_readpage() to stuffed_read_folio() Use folio_fill_tail() to implement the unstuffing and folio_end_read() to simultaneously mark the folio uptodate and unlock it. Unifies a couple of code paths. Link: https://lkml.kernel.org/r/20231107212643.3490372-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Andreas Dilger Cc: Darrick J. Wong Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/gfs2/aops.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9611bfceda4b..ba8742dc91f8 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -403,18 +403,18 @@ static int gfs2_jdata_writepages(struct address_space *mapping, } /** - * stuffed_readpage - Fill in a Linux folio with stuffed file data + * stuffed_read_folio - Fill in a Linux folio with stuffed file data * @ip: the inode * @folio: the folio * * Returns: errno */ -static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) +static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio) { - struct buffer_head *dibh; - size_t i_size = i_size_read(&ip->i_inode); - void *data; - int error; + struct buffer_head *dibh = NULL; + size_t dsize = i_size_read(&ip->i_inode); + void *from = NULL; + int error = 0; /* * Due to the order of unstuffing files and ->fault(), we can be @@ -422,22 +422,20 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) * so we need to supply one here. It doesn't happen often. */ if (unlikely(folio->index)) { - folio_zero_range(folio, 0, folio_size(folio)); - folio_mark_uptodate(folio); - return 0; + dsize = 0; + } else { + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + from = dibh->b_data + sizeof(struct gfs2_dinode); } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - data = dibh->b_data + sizeof(struct gfs2_dinode); - memcpy_to_folio(folio, 0, data, i_size); - folio_zero_range(folio, i_size, folio_size(folio) - i_size); + folio_fill_tail(folio, 0, from, dsize); brelse(dibh); - folio_mark_uptodate(folio); +out: + folio_end_read(folio, error == 0); - return 0; + return error; } /** @@ -456,8 +454,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { error = iomap_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, folio); - folio_unlock(folio); + error = stuffed_read_folio(ip, folio); } else { error = mpage_read_folio(folio, gfs2_block_map); } From c36f9d3d2c3e17f9eef1d2f47a63c91d51d55e87 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:02 +0000 Subject: [PATCH 0398/1562] mm: remove test_set_page_writeback() Patch series "Make folio_start_writeback return void". Most of the folio flag-setting functions return void. folio_start_writeback is gratuitously different; the only two filesystems that do anything with the return value emit debug messages if it's already set, and we can (and should) do that internally without bothering the filesystem to do it. This patch (of 4): There are no more callers of this wrapper. Link: https://lkml.kernel.org/r/20231108204605.745109-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231108204605.745109-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88e64acebfe..a440062e9386 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -780,11 +780,6 @@ bool set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static inline bool test_set_page_writeback(struct page *page) -{ - return set_page_writeback(page); -} - static __always_inline bool folio_test_head(struct folio *folio) { return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); From 8525d5984b7b061ba02469cb58c17d1a1b98eb12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:03 +0000 Subject: [PATCH 0399/1562] afs: do not test the return value of folio_start_writeback() In preparation for removing the return value entirely, stop testing it in afs. Link: https://lkml.kernel.org/r/20231108204605.745109-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- fs/afs/write.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 4a168781936b..57d05d67f0c2 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -559,8 +559,7 @@ static void afs_extend_writeback(struct address_space *mapping, if (!folio_clear_dirty_for_io(folio)) BUG(); - if (folio_start_writeback(folio)) - BUG(); + folio_start_writeback(folio); afs_folio_start_fscache(caching, folio); *_count -= folio_nr_pages(folio); @@ -595,8 +594,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - if (folio_start_writeback(folio)) - BUG(); + folio_start_writeback(folio); afs_folio_start_fscache(caching, folio); count -= folio_nr_pages(folio); From a9540e35624d1475f47dbf6353eed8b99936d36e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:04 +0000 Subject: [PATCH 0400/1562] smb: do not test the return value of folio_start_writeback() In preparation for removing the return value entirely, stop testing it in smb. Link: https://lkml.kernel.org/r/20231108204605.745109-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- fs/smb/client/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index cf17e3dd703e..45ca492c141c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2706,8 +2706,7 @@ static void cifs_extend_writeback(struct address_space *mapping, */ if (!folio_clear_dirty_for_io(folio)) WARN_ON(1); - if (folio_start_writeback(folio)) - WARN_ON(1); + folio_start_writeback(folio); *_count -= folio_nr_pages(folio); folio_unlock(folio); @@ -2742,8 +2741,7 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, int rc; /* The folio should be locked, dirty and not undergoing writeback. */ - if (folio_start_writeback(folio)) - WARN_ON(1); + folio_start_writeback(folio); count -= folio_nr_pages(folio); len = folio_size(folio); From b5612c368648a7be52411b288d09593e5945d1aa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:05 +0000 Subject: [PATCH 0401/1562] mm: return void from folio_start_writeback() and related functions Nobody now checks the return value from any of these functions, so add an assertion at the beginning of the function and return void. Link: https://lkml.kernel.org/r/20231108204605.745109-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Josef Bacik Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 +-- mm/folio-compat.c | 4 +-- mm/page-writeback.c | 54 ++++++++++++++++++-------------------- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a440062e9386..735cddc13d20 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -772,8 +772,8 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -bool __folio_start_writeback(struct folio *folio, bool keep_write); -bool set_page_writeback(struct page *page); +void __folio_start_writeback(struct folio *folio, bool keep_write); +void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 10c3247542cb..aee3b9a16828 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -46,9 +46,9 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -bool set_page_writeback(struct page *page) +void set_page_writeback(struct page *page) { - return folio_start_writeback(page_folio(page)); + folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ee2fd6a6af40..ca64bd513fa2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2982,67 +2982,63 @@ bool __folio_end_writeback(struct folio *folio) return ret; } -bool __folio_start_writeback(struct folio *folio, bool keep_write) +void __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); - bool ret; int access_ret; + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; + bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); - ret = folio_test_set_writeback(folio); - if (!ret) { - bool on_wblist; + folio_test_set_writeback(folio); - on_wblist = mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK); + on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); - wb_stat_mod(wb, WB_WRITEBACK, nr); - if (!on_wblist) - wb_inode_writeback_start(wb); - } - - /* - * We can come through here when swapping - * anonymous folios, so we don't necessarily - * have an inode to track for sync. - */ - if (mapping->host && !on_wblist) - sb_mark_inode_writeback(mapping->host); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) + wb_inode_writeback_start(wb); } + + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. + */ + if (mapping->host && !on_wblist) + sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { - ret = folio_test_set_writeback(folio); - } - if (!ret) { - lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + folio_test_set_writeback(folio); } + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); folio_memcg_unlock(folio); + access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); - - return ret; } EXPORT_SYMBOL(__folio_start_writeback); From 1e12cbb9f69541181afab6b1ff358b4f1dd3e253 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:04 +0000 Subject: [PATCH 0402/1562] mm: make mapping_evict_folio() the preferred way to evict clean folios Patch series "Fix fault handler's handling of poisoned tail pages". Since introducing the ability to have large folios in the page cache, it's been possible to have a hwpoisoned tail page returned from the fault handler. We handle this situation poorly; failing to remove the affected page from use. This isn't a minimal patch to fix it, it's a full conversion of all the code surrounding it. This patch (of 6): invalidate_inode_page() does very little beyond calling mapping_evict_folio(). Move the check for mapping being NULL into mapping_evict_folio() and make it available to the rest of the MM for use in the next few patches. Link: https://lkml.kernel.org/r/20231108182809.602073-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231108182809.602073-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/truncate.c | 33 ++++++++++++++++----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 89a5a794d68f..7d18094e102d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -138,6 +138,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); +long mapping_evict_folio(struct address_space *mapping, struct folio *folio); long invalidate_inode_page(struct page *page); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); diff --git a/mm/truncate.c b/mm/truncate.c index 8e3aa9e8618e..1d516e51e29d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -266,9 +266,22 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) } EXPORT_SYMBOL(generic_error_remove_page); -static long mapping_evict_folio(struct address_space *mapping, - struct folio *folio) +/** + * mapping_evict_folio() - Remove an unused folio from the page-cache. + * @mapping: The mapping this folio belongs to. + * @folio: The folio to remove. + * + * Safely remove one folio from the page cache. + * It only drops clean, unused folios. + * + * Context: Folio must be locked. + * Return: The number of pages successfully removed. + */ +long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { + /* The page may have been truncated before it was locked */ + if (!mapping) + return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ @@ -281,25 +294,11 @@ static long mapping_evict_folio(struct address_space *mapping, return remove_mapping(mapping, folio); } -/** - * invalidate_inode_page() - Remove an unused page from the pagecache. - * @page: The page to remove. - * - * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. - * - * Context: Page must be locked. - * Return: The number of pages successfully removed. - */ long invalidate_inode_page(struct page *page) { struct folio *folio = page_folio(page); - struct address_space *mapping = folio_mapping(folio); - /* The page may have been truncated before it was locked */ - if (!mapping) - return 0; - return mapping_evict_folio(mapping, folio); + return mapping_evict_folio(folio_mapping(folio), folio); } /** From 01d1e0e6b7d99ebaf2e42d2205595080b7d0c271 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:05 +0000 Subject: [PATCH 0403/1562] mm: convert __do_fault() to use a folio Convert vmf->page to a folio as soon as we're going to use it. This fixes a bug if the fault handler returns a tail page with hardware poison; tail pages have an invalid page->index, so we would fail to unmap the page from the page tables. We actually have to unmap the entire folio (or mapping_evict_folio() will fail), so use unmap_mapping_folio() instead. This also saves various calls to compound_head() hidden in lock_page(), put_page(), etc. Link: https://lkml.kernel.org/r/20231108182809.602073-3-willy@infradead.org Fixes: 793917d997df ("mm/readahead: Add large folio readahead") Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a7025ed5c65b..e27e2e5beb3f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4245,6 +4245,7 @@ oom: static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; vm_fault_t ret; /* @@ -4273,27 +4274,26 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) VM_FAULT_DONE_COW))) return ret; + folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { - struct page *page = vmf->page; vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { - if (page_mapped(page)) - unmap_mapping_pages(page_mapping(page), - page->index, 1, false); - /* Retry if a clean page was removed from the cache. */ - if (invalidate_inode_page(page)) + if (page_mapped(vmf->page)) + unmap_mapping_folio(folio); + /* Retry if a clean folio was removed from the cache. */ + if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; - unlock_page(page); + folio_unlock(folio); } - put_page(page); + folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf->page); + folio_lock(folio); else - VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); + VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } From 19369d866a8b89788cdc9b10c7b8c9b2777f806b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:06 +0000 Subject: [PATCH 0404/1562] mm: use mapping_evict_folio() in truncate_error_page() We already have the folio and the mapping, so replace the call to invalidate_inode_page() with mapping_evict_folio(). Link: https://lkml.kernel.org/r/20231108182809.602073-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 660c21859118..9f03952e6d38 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -930,10 +930,10 @@ static int delete_from_lru_cache(struct page *p) static int truncate_error_page(struct page *p, unsigned long pfn, struct address_space *mapping) { + struct folio *folio = page_folio(p); int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { - struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) @@ -947,7 +947,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, * If the file system doesn't support it just invalidate * This fails on dirty or anything with private pages */ - if (invalidate_inode_page(p)) + if (mapping_evict_folio(mapping, folio)) ret = MF_RECOVERED; else pr_info("%#lx: Failed to invalidate\n", pfn); From 049b26048dd287d52f6f6fbe5eafa301fdca5d37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:07 +0000 Subject: [PATCH 0405/1562] mm: convert soft_offline_in_use_page() to use a folio Replace the existing head-page logic with folio logic. Link: https://lkml.kernel.org/r/20231108182809.602073-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9f03952e6d38..075db5b5ad5e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2645,40 +2645,40 @@ static int soft_offline_in_use_page(struct page *page) { long ret = 0; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); + struct folio *folio = page_folio(page); char const *msg_page[] = {"page", "hugepage"}; - bool huge = PageHuge(page); + bool huge = folio_test_hugetlb(folio); LIST_HEAD(pagelist); struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - if (!huge && PageTransHuge(hpage)) { + if (!huge && folio_test_large(folio)) { if (try_to_split_thp_page(page)) { pr_info("soft offline: %#lx: thp split failed\n", pfn); return -EBUSY; } - hpage = page; + folio = page_folio(page); } - lock_page(page); + folio_lock(folio); if (!huge) - wait_on_page_writeback(page); + folio_wait_writeback(folio); if (PageHWPoison(page)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); pr_info("soft offline: %#lx page already poisoned\n", pfn); return 0; } - if (!huge && PageLRU(page) && !PageSwapCache(page)) + if (!huge && folio_test_lru(folio) && !folio_test_swapcache(folio)) /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. */ - ret = invalidate_inode_page(page); - unlock_page(page); + ret = mapping_evict_folio(folio_mapping(folio), folio); + folio_unlock(folio); if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); @@ -2686,7 +2686,7 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (isolate_page(hpage, &pagelist)) { + if (isolate_page(&folio->page, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { From 761d79fbad2a424a240a351b898b54eb674d3bdc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:08 +0000 Subject: [PATCH 0406/1562] mm: convert isolate_page() to mf_isolate_folio() The only caller now has a folio, so pass it in and operate on it. Saves many page->folio conversions and introduces only one folio->page conversion when calling isolate_movable_page(). Link: https://lkml.kernel.org/r/20231108182809.602073-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 075db5b5ad5e..b601f59ed062 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2602,37 +2602,37 @@ unlock_mutex: } EXPORT_SYMBOL(unpoison_memory); -static bool isolate_page(struct page *page, struct list_head *pagelist) +static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist) { bool isolated = false; - if (PageHuge(page)) { - isolated = isolate_hugetlb(page_folio(page), pagelist); + if (folio_test_hugetlb(folio)) { + isolated = isolate_hugetlb(folio, pagelist); } else { - bool lru = !__PageMovable(page); + bool lru = !__folio_test_movable(folio); if (lru) - isolated = isolate_lru_page(page); + isolated = folio_isolate_lru(folio); else - isolated = isolate_movable_page(page, + isolated = isolate_movable_page(&folio->page, ISOLATE_UNEVICTABLE); if (isolated) { - list_add(&page->lru, pagelist); + list_add(&folio->lru, pagelist); if (lru) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); + node_stat_add_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio)); } } /* - * If we succeed to isolate the page, we grabbed another refcount on - * the page, so we can safely drop the one we got from get_any_page(). - * If we failed to isolate the page, it means that we cannot go further + * If we succeed to isolate the folio, we grabbed another refcount on + * the folio, so we can safely drop the one we got from get_any_page(). + * If we failed to isolate the folio, it means that we cannot go further * and we will return an error, so drop the reference we got from * get_any_page() as well. */ - put_page(page); + folio_put(folio); return isolated; } @@ -2686,7 +2686,7 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (isolate_page(&folio->page, &pagelist)) { + if (mf_isolate_folio(folio, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { From 2033c98cce666b0d125ae956613ab5111bb8d202 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 18:28:09 +0000 Subject: [PATCH 0407/1562] mm: remove invalidate_inode_page() All callers are now converted to call mapping_evict_folio(). Link: https://lkml.kernel.org/r/20231108182809.602073-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/truncate.c | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 7d18094e102d..2bc9ff8db393 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -139,7 +139,6 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); -long invalidate_inode_page(struct page *page); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); diff --git a/mm/truncate.c b/mm/truncate.c index 1d516e51e29d..52e3a703e7b2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -294,13 +294,6 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) return remove_mapping(mapping, folio); } -long invalidate_inode_page(struct page *page) -{ - struct folio *folio = page_folio(page); - - return mapping_evict_folio(folio_mapping(folio), folio); -} - /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -559,9 +552,9 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, EXPORT_SYMBOL(invalidate_mapping_pages); /* - * This is like invalidate_inode_page(), except it ignores the page's + * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger - * invalidation guarantees, and cannot afford to leave pages behind because + * invalidation guarantees, and cannot afford to leave folios behind because * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ From 17b46e7beb8fe4e4807f70aaa615cf50a5ba9d3a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 8 Nov 2023 16:49:20 +0000 Subject: [PATCH 0408/1562] mm/page_alloc: dedupe some memcg uncharging logic The duplication makes it seem like some work is required before uncharging in the !PageHWPoison case. But it isn't, so we can simplify the code a little. Note the PageMemcgKmem check is redundant, but I've left it in as it avoids an unnecessary function call. Link: https://lkml.kernel.org/r/20231108164920.3401565-1-jackmanb@google.com Signed-off-by: Brendan Jackman Reviewed-by: Yosry Ahmed Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 733732e7e0ba..dd5e8a759d27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1086,13 +1086,11 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); kmsan_free_page(page, order); + if (memcg_kmem_online() && PageMemcgKmem(page)) + __memcg_kmem_uncharge_page(page, order); + if (unlikely(PageHWPoison(page)) && !order) { - /* - * Do not let hwpoison pages hit pcplists/buddy - * Untie memcg state and reset page's owner - */ - if (memcg_kmem_online() && PageMemcgKmem(page)) - __memcg_kmem_uncharge_page(page, order); + /* Do not let hwpoison pages hit pcplists/buddy */ reset_page_owner(page, order); page_table_check_free(page, order); return false; @@ -1123,8 +1121,6 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_online() && PageMemcgKmem(page)) - __memcg_kmem_uncharge_page(page, order); if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; From 16f5dfbc851b55b87101a20e181d4a14be3007d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:15:07 +0000 Subject: [PATCH 0409/1562] gfp: include __GFP_NOWARN in GFP_NOWAIT GFP_NOWAIT callers are always prepared for their allocations to fail because they fail so frequently. Forcing the callers to remember to add __GFP_NOWARN is just annoying and leads to an endless stream of patches for the places where we forgot to add it. We can now remove __GFP_NOWARN from all the callers which specify GFP_NOWAIT, but I'd rather wait a cycle and send patches to each maintainer instead of creating a big pile of merge conflicts. Link: https://lkml.kernel.org/r/20231109211507.2262419-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c5..ae994534a12a 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t; * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct - * reclaim, start physical IO or use any filesystem callback. + * reclaim, start physical IO or use any filesystem callback. It is very + * likely to fail to allocate memory, even for very small allocations. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. @@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t; #define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) -#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) From 69e583eaca579d50ffc699b1f4358258e75fa008 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 13 Nov 2023 13:47:28 +0100 Subject: [PATCH 0410/1562] mmap: remove the IA64-specific vma expansion implementation With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), there is no need to keep the IA64-specific vma expansion. Clean up the IA64-specific vma expansion implementation. Link: https://lkml.kernel.org/r/20231113124728.3974-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Acked-by: David Hildenbrand Cc: Ard Biesheuvel Signed-off-by: Andrew Morton --- mm/mmap.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4f1cb814586d..e8470b6b678c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2207,42 +2207,7 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned lon } #endif -/* - * IA64 has some horrid mapping rules: it can expand both up and down, - * but with various special rules. - * - * We'll get rid of this architecture eventually, so the ugliness is - * temporary. - */ -#ifdef CONFIG_IA64 -static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) -{ - return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && - REGION_OFFSET(addr) < RGN_MAP_LIMIT; -} - -/* - * IA64 stacks grow down, but there's a special register backing store - * that can grow up. Only sequentially, though, so the new address must - * match vm_end. - */ -static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) -{ - if (!vma_expand_ok(vma, addr)) - return -EFAULT; - if (vma->vm_end != (addr & PAGE_MASK)) - return -EFAULT; - return expand_upwards(vma, addr); -} - -static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) -{ - if (!vma_expand_ok(vma, addr)) - return -EFAULT; - return expand_downwards(vma, addr); -} - -#elif defined(CONFIG_STACK_GROWSUP) +#if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) From 932b59e3beaefefb1a0bd65f1bb3f9e2000d7315 Mon Sep 17 00:00:00 2001 From: York Jasper Niebuhr Date: Sat, 11 Nov 2023 19:48:59 +0100 Subject: [PATCH 0411/1562] mm: fix process_vm_rw page counts 1. There is a "-1" missing in the page number calculation in process_vm_rw_core. While this can't break anything, it can cause unnecessary allocations in certain cases: Consider handling an iovec ranging over PVM_MAX_PP_ARRAY_COUNT pages that is also aligned to a page boundary. While pp_stack could hold references to such an amount of pinned pages, nr_pages yields (PVM_MAX_PP_ARRAY + 1) in process_vm_rw_core. Consequently, a larger buffer is allocated with kmalloc for no reason. For any page boundary aligned iovec that is a multiple of PAGE_SIZE and larger than PVM_MAX_PP_ARRAY_COUNT pages, nr_pages will be too big by 1 and thus kmalloc allocates excess space for one more pointer. 2. max_pages_per_loop is constant and there is no reason to have it as a variable. A macro does the job just fine and saves memory. 3. Replaced "sizeof(struct pages *)" with "sizeof(struct page *)" to have matching types for allocation and prevent confusion. Link: https://lkml.kernel.org/r/20231111184859.44264-1-yjnworkstation@gmail.com Signed-off-by: York Jasper Niebuhr Signed-off-by: Andrew Morton --- mm/process_vm_access.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 0523edab03a6..b308e96cd05a 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -53,7 +53,10 @@ static int process_vm_rw_pages(struct page **pages, } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ -#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) +#define PVM_MAX_KMALLOC_PAGES 2 + +/* Maximum number of pages that can be stored at a time */ +#define PVM_MAX_USER_PAGES (PVM_MAX_KMALLOC_PAGES * PAGE_SIZE / sizeof(struct page *)) /** * process_vm_rw_single_vec - read/write pages from task specified @@ -79,8 +82,6 @@ static int process_vm_rw_single_vec(unsigned long addr, unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; - unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES - / sizeof(struct pages *); unsigned int flags = 0; /* Work out address and page range required */ @@ -92,7 +93,7 @@ static int process_vm_rw_single_vec(unsigned long addr, flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { - int pinned_pages = min(nr_pages, max_pages_per_loop); + int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES); int locked = 1; size_t bytes; @@ -171,7 +172,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, iov_len = rvec[i].iov_len; if (iov_len > 0) { nr_pages_iov = ((unsigned long)rvec[i].iov_base - + iov_len) + + iov_len - 1) / PAGE_SIZE - (unsigned long)rvec[i].iov_base / PAGE_SIZE + 1; nr_pages = max(nr_pages, nr_pages_iov); @@ -184,8 +185,8 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { /* For reliability don't try to kmalloc more than 2 pages worth */ - process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, - sizeof(struct pages *)*nr_pages), + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES * PAGE_SIZE, + sizeof(struct page *)*nr_pages), GFP_KERNEL); if (!process_pages) From 83a6fdd6c27d4f6f51fa1092805676b24e0f8827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Heidekr=C3=BCger?= Date: Thu, 9 Nov 2023 15:51:00 +0000 Subject: [PATCH 0412/1562] kasan: default to inline instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KASan inline instrumentation can yield up to a 2x performance gain at the cost of a larger binary. Make inline instrumentation the default, as suggested in the bug report below. When an architecture does not support inline instrumentation, it should set ARCH_DISABLE_KASAN_INLINE, as done by PowerPC, for instance. Link: https://lkml.kernel.org/r/20231109155101.186028-1-paul.heidekrueger@tum.de Signed-off-by: Paul Heidekrüger Reported-by: Andrey Konovalov Reviewed-by: Marco Elver Closes: https://bugzilla.kernel.org/show_bug.cgi?id=203495 Acked-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index fdca89c05745..935eda08b1e1 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -134,7 +134,7 @@ endchoice choice prompt "Instrumentation type" depends on KASAN_GENERIC || KASAN_SW_TAGS - default KASAN_OUTLINE + default KASAN_INLINE if !ARCH_DISABLE_KASAN_INLINE config KASAN_OUTLINE bool "Outline instrumentation" From 20954c122f1bb09af095a51b66ddead13b6c6ef4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 14 Nov 2023 12:04:56 +0530 Subject: [PATCH 0413/1562] Documentation/mm: drop pte_bad() descriptions from arch page table helpers pte_bad() never existed unlike similar helpers at PMU, PUD, and PGD level. This was added erroneously and hence should be dropped instead. Link: https://lkml.kernel.org/r/20231114063456.339652-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Rick Edgecombe Cc: Jonathan Corbet Cc: Andrew Morton Signed-off-by: Andrew Morton --- Documentation/mm/arch_pgtable_helpers.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst index c82e3ee20e51..2466d3363af7 100644 --- a/Documentation/mm/arch_pgtable_helpers.rst +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -18,8 +18,6 @@ PTE Page Table Helpers +---------------------------+--------------------------------------------------+ | pte_same | Tests whether both PTE entries are the same | +---------------------------+--------------------------------------------------+ -| pte_bad | Tests a non-table mapped PTE | -+---------------------------+--------------------------------------------------+ | pte_present | Tests a valid mapped PTE | +---------------------------+--------------------------------------------------+ | pte_young | Tests a young PTE | From 1b5c65b64cd417c801945b26a2a50c4d4eefaec8 Mon Sep 17 00:00:00 2001 From: Barry Song <21cnbao@gmail.com> Date: Tue, 14 Nov 2023 16:42:02 +1300 Subject: [PATCH 0414/1562] mm/page_owner: record and dump free_pid and free_tgid While investigating some complex memory allocation and free bugs especially in multi-processes and multi-threads cases, from time to time, I feel the free stack isn't sufficient as a page can be freed by processes or threads other than the one allocating it. And other processes and threads which free the page often have the exactly same free stack with the one allocating the page. We can't know who free the page only through the free stack though the current page_owner does tell us the pid and tgid of the one allocating the page. This makes the bug investigation often hard. So this patch adds free pid and tgid in page_owner, so that we can easily figure out if the freeing is crossing processes or threads. Link: https://lkml.kernel.org/r/20231114034202.73098-1-v-songbaohua@oppo.com Signed-off-by: Barry Song Cc: Audra Mitchell Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kassey Li Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/page_owner.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f13ce7d2452..e7eba7688881 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -32,6 +32,8 @@ struct page_owner { char comm[TASK_COMM_LEN]; pid_t pid; pid_t tgid; + pid_t free_pid; + pid_t free_tgid; }; static bool page_owner_enabled __initdata; @@ -152,6 +154,8 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); @@ -253,6 +257,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->tgid = old_page_owner->tgid; + new_page_owner->free_pid = old_page_owner->free_pid; + new_page_owner->free_tgid = old_page_owner->free_tgid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; strcpy(new_page_owner->comm, old_page_owner->comm); @@ -495,7 +501,8 @@ void __dump_page_owner(const struct page *page) if (!handle) { pr_alert("page_owner free stack trace missing\n"); } else { - pr_alert("page last free stack trace:\n"); + pr_alert("page last free pid %d tgid %d stack trace:\n", + page_owner->free_pid, page_owner->free_tgid); stack_depot_print(handle); } From a7a0350583ba51d8cde6180bb51d704b89a3b29e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2023 11:42:12 +0900 Subject: [PATCH 0415/1562] zram: split memory-tracking and ac-time tracking ZRAM_MEMORY_TRACKING enables two features: - per-entry ac-time tracking - debugfs interface The latter one is the reason why memory-tracking depends on DEBUG_FS, while the former one is used far beyond debugging these days. Namely ac-time is used for fine grained writeback of idle entries (pages). Move ac-time tracking under its own config option so that it can be enabled (along with writeback) on systems without DEBUG_FS. [senozhatsky@chromium.org: ifdef fixup, per Dmytro] Link: https://lkml.kernel.org/r/20231117013543.540280-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20231115024223.4133148-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Dmytro Maluka Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 2 +- drivers/block/zram/Kconfig | 11 ++++++++- drivers/block/zram/zram_drv.c | 27 ++++++++++----------- drivers/block/zram/zram_drv.h | 2 +- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index e4551579cb12..ee2b0030d416 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -328,7 +328,7 @@ as idle:: From now on, any pages on zram are idle pages. The idle mark will be removed until someone requests access of the block. IOW, unless there is access request, those pages are still idle pages. -Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled pages can be +Additionally, when CONFIG_ZRAM_TRACK_ENTRY_ACTIME is enabled pages can be marked as idle based on how long (in seconds) it's been since they were last accessed:: diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 0386b7da02aa..af201392ed52 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -69,9 +69,18 @@ config ZRAM_WRITEBACK See Documentation/admin-guide/blockdev/zram.rst for more information. +config ZRAM_TRACK_ENTRY_ACTIME + bool "Track access time of zram entries" + depends on ZRAM + help + With this feature zram tracks access time of every stored + entry (page), which can be used for a more fine grained IDLE + pages writeback. + config ZRAM_MEMORY_TRACKING bool "Track zRam block status" depends on ZRAM && DEBUG_FS + select ZRAM_TRACK_ENTRY_ACTIME help With this feature, admin can track the state of allocated blocks of zRAM. Admin could see the information via @@ -86,4 +95,4 @@ config ZRAM_MULTI_COMP This will enable multi-compression streams, so that ZRAM can re-compress pages using a potentially slower but more effective compression algorithm. Note, that IDLE page recompression - requires ZRAM_MEMORY_TRACKING. + requires ZRAM_TRACK_ENTRY_ACTIME. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d77d3664ca08..f6b286e7f310 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -174,6 +174,14 @@ static inline u32 zram_get_priority(struct zram *zram, u32 index) return prio & ZRAM_COMP_PRIORITY_MASK; } +static void zram_accessed(struct zram *zram, u32 index) +{ + zram_clear_flag(zram, index, ZRAM_IDLE); +#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME + zram->table[index].ac_time = ktime_get_boottime(); +#endif +} + static inline void update_used_max(struct zram *zram, const unsigned long pages) { @@ -293,8 +301,9 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) zram_slot_lock(zram, index); if (zram_allocated(zram, index) && !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { -#ifdef CONFIG_ZRAM_MEMORY_TRACKING - is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time); +#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME + is_idle = !cutoff || ktime_after(cutoff, + zram->table[index].ac_time); #endif if (is_idle) zram_set_flag(zram, index, ZRAM_IDLE); @@ -317,7 +326,7 @@ static ssize_t idle_store(struct device *dev, */ u64 age_sec; - if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec)) + if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) && !kstrtoull(buf, 0, &age_sec)) cutoff_time = ktime_sub(ktime_get_boottime(), ns_to_ktime(age_sec * NSEC_PER_SEC)); else @@ -841,12 +850,6 @@ static void zram_debugfs_destroy(void) debugfs_remove_recursive(zram_debugfs_root); } -static void zram_accessed(struct zram *zram, u32 index) -{ - zram_clear_flag(zram, index, ZRAM_IDLE); - zram->table[index].ac_time = ktime_get_boottime(); -} - static ssize_t read_block_state(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -930,10 +933,6 @@ static void zram_debugfs_unregister(struct zram *zram) #else static void zram_debugfs_create(void) {}; static void zram_debugfs_destroy(void) {}; -static void zram_accessed(struct zram *zram, u32 index) -{ - zram_clear_flag(zram, index, ZRAM_IDLE); -}; static void zram_debugfs_register(struct zram *zram) {}; static void zram_debugfs_unregister(struct zram *zram) {}; #endif @@ -1254,7 +1253,7 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; -#ifdef CONFIG_ZRAM_MEMORY_TRACKING +#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME zram->table[index].ac_time = 0; #endif if (zram_test_flag(zram, index, ZRAM_IDLE)) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d090753f97be..3b94d12f41b4 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,7 +69,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long flags; -#ifdef CONFIG_ZRAM_MEMORY_TRACKING +#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif }; From 2e16898d0df88e52b26e8bd110cdc4e687217426 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2023 11:42:13 +0900 Subject: [PATCH 0416/1562] zram: tweak writeback config help Writeback is for incompressible and idle zram pages. Link: https://lkml.kernel.org/r/20231115024223.4133148-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Dmytro Maluka Signed-off-by: Andrew Morton --- drivers/block/zram/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index af201392ed52..7b29cce60ab2 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -59,8 +59,8 @@ config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" depends on ZRAM help - With incompressible page, there is no memory saving to keep it - in memory. Instead, write it out to backing device. + This lets zram entries (incompressible or idle pages) be written + back to a backing device, helping save memory. For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. From 3d47e31790b784ec9a6f052fda683c536e272e4f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:42 +0000 Subject: [PATCH 0417/1562] memory-failure: use a folio in me_pagecache_clean() Patch series "Convert aops->error_remove_page to ->error_remove_folio". This is a memory-failure patch series which converts a lot of uses of page APIs into folio APIs with the usual benefits. This patch (of 6): Replaces three hidden calls to compound_head() with one visible one. Fix up a few comments while I'm modifying this function. Link: https://lkml.kernel.org/r/20231117161447.2461643-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231117161447.2461643-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b601f59ed062..496e8ecd8496 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1014,6 +1014,7 @@ static int me_unknown(struct page_state *ps, struct page *p) */ static int me_pagecache_clean(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; struct address_space *mapping; bool extra_pins; @@ -1021,10 +1022,10 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) delete_from_lru_cache(p); /* - * For anonymous pages we're done the only reference left + * For anonymous folios the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) { + if (folio_test_anon(folio)) { ret = MF_RECOVERED; goto out; } @@ -1036,11 +1037,9 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * has a reference, because it could be file system metadata * and that's not safe to truncate. */ - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (!mapping) { - /* - * Page has been teared down in the meanwhile - */ + /* Folio has been torn down in the meantime */ ret = MF_FAILED; goto out; } @@ -1061,7 +1060,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) ret = MF_FAILED; out: - unlock_page(p); + folio_unlock(folio); return ret; } From 6304b531cd8f568ed2b8d680837b8ceebe175b89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:43 +0000 Subject: [PATCH 0418/1562] memory-failure: use a folio in me_pagecache_dirty() Replaces three hidden calls to compound_head() with one visible one. Link: https://lkml.kernel.org/r/20231117161447.2461643-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 496e8ecd8496..d2764fd3e448 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1138,15 +1138,16 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p) */ static int me_swapcache_dirty(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; bool extra_pins = false; - ClearPageDirty(p); + folio_clear_dirty(folio); /* Trigger EIO in shmem: */ - ClearPageUptodate(p); + folio_clear_uptodate(folio); ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; - unlock_page(p); + folio_unlock(folio); if (ret == MF_DELAYED) extra_pins = true; From f7092393570f24865199d1642eb097f9e1c8f01e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:44 +0000 Subject: [PATCH 0419/1562] memory-failure: convert delete_from_lru_cache() to take a folio All three callers now have a folio; pass it in instead of the page. Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20231117161447.2461643-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d2764fd3e448..e73f2047ffcb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -902,26 +902,26 @@ static const char * const action_page_types[] = { * The page count will stop it from being freed by unpoison. * Stress tests should be aware of this memory leak problem. */ -static int delete_from_lru_cache(struct page *p) +static int delete_from_lru_cache(struct folio *folio) { - if (isolate_lru_page(p)) { + if (folio_isolate_lru(folio)) { /* * Clear sensible page flags, so that the buddy system won't - * complain when the page is unpoison-and-freed. + * complain when the folio is unpoison-and-freed. */ - ClearPageActive(p); - ClearPageUnevictable(p); + folio_clear_active(folio); + folio_clear_unevictable(folio); /* * Poisoned page might never drop its ref count to 0 so we have * to uncharge it manually from its memcg. */ - mem_cgroup_uncharge(page_folio(p)); + mem_cgroup_uncharge(folio); /* - * drop the page count elevated by isolate_lru_page() + * drop the refcount elevated by folio_isolate_lru() */ - put_page(p); + folio_put(folio); return 0; } return -EIO; @@ -1019,7 +1019,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) struct address_space *mapping; bool extra_pins; - delete_from_lru_cache(p); + delete_from_lru_cache(folio); /* * For anonymous folios the only reference left @@ -1146,7 +1146,7 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p) /* Trigger EIO in shmem: */ folio_clear_uptodate(folio); - ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED; folio_unlock(folio); if (ret == MF_DELAYED) @@ -1165,7 +1165,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) delete_from_swap_cache(folio); - ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED; folio_unlock(folio); if (has_extra_refcount(ps, p, false)) From b6fd410c32f1a66a52a42d6aae1ab7b011b74547 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:45 +0000 Subject: [PATCH 0420/1562] memory-failure: use a folio in me_huge_page() This function was already explicitly calling compound_head(); unfortunately the compiler can't know that and elide the redundant calls to compound_head() buried in page_mapping(), unlock_page(), etc. Switch to using a folio, which does let us elide these calls. Link: https://lkml.kernel.org/r/20231117161447.2461643-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e73f2047ffcb..d97d247c0224 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1182,25 +1182,25 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) */ static int me_huge_page(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int res; - struct page *hpage = compound_head(p); struct address_space *mapping; bool extra_pins = false; - mapping = page_mapping(hpage); + mapping = folio_mapping(folio); if (mapping) { - res = truncate_error_page(hpage, page_to_pfn(p), mapping); + res = truncate_error_page(&folio->page, page_to_pfn(p), mapping); /* The page is kept in page cache. */ extra_pins = true; - unlock_page(hpage); + folio_unlock(folio); } else { - unlock_page(hpage); + folio_unlock(folio); /* * migration entry prevents later access on error hugepage, * so we can free and dissolve it into buddy to save healthy * subpages. */ - put_page(hpage); + folio_put(folio); if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; From e130b6514e14e98ad1f59bf1fc0a5c0f21c6d8ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:46 +0000 Subject: [PATCH 0421/1562] memory-failure: convert truncate_error_page to truncate_error_folio Both callers now have a folio, so pass it in. Nothing downstream was expecting a tail page; that's asserted in generic_error_remove_page(), for example. Link: https://lkml.kernel.org/r/20231117161447.2461643-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d97d247c0224..6aec94821fda 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -927,14 +927,13 @@ static int delete_from_lru_cache(struct folio *folio) return -EIO; } -static int truncate_error_page(struct page *p, unsigned long pfn, +static int truncate_error_page(struct folio *folio, unsigned long pfn, struct address_space *mapping) { - struct folio *folio = page_folio(p); int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { - int err = mapping->a_ops->error_remove_page(mapping, p); + int err = mapping->a_ops->error_remove_page(mapping, &folio->page); if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); @@ -1055,7 +1054,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(p, page_to_pfn(p), mapping); + ret = truncate_error_page(folio, page_to_pfn(p), mapping); if (has_extra_refcount(ps, p, extra_pins)) ret = MF_FAILED; @@ -1189,7 +1188,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = folio_mapping(folio); if (mapping) { - res = truncate_error_page(&folio->page, page_to_pfn(p), mapping); + res = truncate_error_page(folio, page_to_pfn(p), mapping); /* The page is kept in page cache. */ extra_pins = true; folio_unlock(folio); From af7628d6ec196999175ecb3fdb38336489b0f88a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:47 +0000 Subject: [PATCH 0422/1562] fs: convert error_remove_page to error_remove_folio There were already assertions that we were not passing a tail page to error_remove_page(), so make the compiler enforce that by converting everything to pass and use a folio. Link: https://lkml.kernel.org/r/20231117161447.2461643-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 4 ++-- Documentation/filesystems/vfs.rst | 6 +++--- block/fops.c | 2 +- fs/afs/write.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/addr.c | 4 ++-- fs/ext2/inode.c | 2 +- fs/ext4/inode.c | 6 +++--- fs/f2fs/compress.c | 2 +- fs/f2fs/inode.c | 2 +- fs/gfs2/aops.c | 4 ++-- fs/hugetlbfs/inode.c | 6 +++--- fs/nfs/file.c | 2 +- fs/ntfs/aops.c | 6 +++--- fs/ocfs2/aops.c | 2 +- fs/xfs/xfs_aops.c | 2 +- fs/zonefs/file.c | 2 +- include/linux/fs.h | 2 +- include/linux/mm.h | 3 ++- mm/memory-failure.c | 10 +++++----- mm/shmem.c | 6 +++--- mm/truncate.c | 9 ++++----- 23 files changed, 44 insertions(+), 44 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 7be2900806c8..421daf837940 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -261,7 +261,7 @@ prototypes:: struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -287,7 +287,7 @@ direct_IO: migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes -error_remove_page: yes +error_remove_folio: yes swap_activate: no swap_deactivate: no swap_rw: yes, unlocks diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 99acc2e98673..dd99ce5912d8 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -823,7 +823,7 @@ cache in your filesystem. The following members are defined: bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); - int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*error_remove_folio)(struct mapping *mapping, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -1034,8 +1034,8 @@ cache in your filesystem. The following members are defined: VM if a folio should be treated as dirty or writeback for the purposes of stalling. -``error_remove_page`` - normally set to generic_error_remove_page if truncation is ok +``error_remove_folio`` + normally set to generic_error_remove_folio if truncation is ok for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. diff --git a/block/fops.c b/block/fops.c index 0abaac705daf..0bdad1e8d514 100644 --- a/block/fops.c +++ b/block/fops.c @@ -500,7 +500,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ diff --git a/fs/afs/write.c b/fs/afs/write.c index 57d05d67f0c2..e87b52b1f34c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping, folio_clear_uptodate(folio); folio_end_writeback(folio); folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4d51be813509..df4a97b6637b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1103,7 +1103,7 @@ static const struct address_space_operations bch_address_space_operations = { #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; struct bcachefs_fid { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9f5a9894f88f..ff7b4efca24f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10930,7 +10930,7 @@ static const struct address_space_operations btrfs_aops = { .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85be3bf18cdf..13af429ab030 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -907,8 +907,8 @@ static void writepages_finish(struct ceph_osd_request *req) doutc(cl, "unlocking %p\n", page); if (remove_page) - generic_error_remove_page(inode->i_mapping, - page); + generic_error_remove_folio(inode->i_mapping, + page_folio(page)); unlock_page(page); } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 464faf6c217e..5a4272b2c6b0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations ext2_dax_aops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..d7729b17a66b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3564,7 +3564,7 @@ static const struct address_space_operations ext4_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3581,7 +3581,7 @@ static const struct address_space_operations ext4_journalled_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3598,7 +3598,7 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 36e5dab6baae..6b2af514660d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 560bfcad1af2..a9eb3891f417 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -600,7 +600,7 @@ make_now: #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; /* - * generic_error_remove_page only truncates pages of regular + * generic_error_remove_folio only truncates pages of regular * inode */ inode->i_mode |= S_IFREG; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ba8742dc91f8..5cffb079b87c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -745,7 +745,7 @@ static const struct address_space_operations gfs2_aops = { .bmap = gfs2_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -758,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidate_folio = gfs2_invalidate_folio, .release_folio = gfs2_release_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..36132c9125f9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1129,8 +1129,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, #define hugetlbfs_migrate_folio NULL #endif -static int hugetlbfs_error_remove_page(struct address_space *mapping, - struct page *page) +static int hugetlbfs_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -1277,7 +1277,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, - .error_remove_page = hugetlbfs_error_remove_page, + .error_remove_folio = hugetlbfs_error_remove_folio, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3f9768810427..e8cccb94b927 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,7 @@ const struct address_space_operations nfs_file_aops = { .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, .swap_rw = nfs_swap_rw, diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..70479ce915e8 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = { .bmap = ntfs_bmap, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; #ifdef NTFS_RW diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ba790219d528..795997806326 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2480,5 +2480,5 @@ const struct address_space_operations ocfs2_aops = { .release_folio = ocfs2_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 465d7630bb21..813f85156b0c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index b2c9b35df8f7..6ab2318a9c8e 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = { .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = zonefs_swap_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..31b2cf963db9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -434,7 +434,7 @@ struct address_space_operations { bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, diff --git a/include/linux/mm.h b/include/linux/mm.h index 64cd1ee4aacc..13a090271716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2384,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int generic_error_remove_page(struct address_space *mapping, struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6aec94821fda..d8c853b35dbb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -927,13 +927,13 @@ static int delete_from_lru_cache(struct folio *folio) return -EIO; } -static int truncate_error_page(struct folio *folio, unsigned long pfn, +static int truncate_error_folio(struct folio *folio, unsigned long pfn, struct address_space *mapping) { int ret = MF_FAILED; - if (mapping->a_ops->error_remove_page) { - int err = mapping->a_ops->error_remove_page(mapping, &folio->page); + if (mapping->a_ops->error_remove_folio) { + int err = mapping->a_ops->error_remove_folio(mapping, folio); if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); @@ -1054,7 +1054,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(folio, page_to_pfn(p), mapping); + ret = truncate_error_folio(folio, page_to_pfn(p), mapping); if (has_extra_refcount(ps, p, extra_pins)) ret = MF_FAILED; @@ -1188,7 +1188,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = folio_mapping(folio); if (mapping) { - res = truncate_error_page(folio, page_to_pfn(p), mapping); + res = truncate_error_folio(folio, page_to_pfn(p), mapping); /* The page is kept in page cache. */ extra_pins = true; folio_unlock(folio); diff --git a/mm/shmem.c b/mm/shmem.c index 91e2620148b2..97bc622da774 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4445,8 +4445,8 @@ static void __init shmem_destroy_inodecache(void) } /* Keep the page in page cache instead of truncating it */ -static int shmem_error_remove_page(struct address_space *mapping, - struct page *page) +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -4461,7 +4461,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif - .error_remove_page = shmem_error_remove_page, + .error_remove_folio = shmem_error_remove_folio, }; EXPORT_SYMBOL(shmem_aops); diff --git a/mm/truncate.c b/mm/truncate.c index 52e3a703e7b2..725b150e47ac 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -250,10 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) /* * Used to get rid of pages on hardware memory corruption. */ -int generic_error_remove_page(struct address_space *mapping, struct page *page) +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (!mapping) return -EINVAL; /* @@ -262,9 +261,9 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_folio(mapping, page_folio(page)); + return truncate_inode_folio(mapping, folio); } -EXPORT_SYMBOL(generic_error_remove_page); +EXPORT_SYMBOL(generic_error_remove_folio); /** * mapping_evict_folio() - Remove an unused folio from the page-cache. From 88f9ee2b3040991ff40628fa9e3516ebe36dd6fa Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Thu, 16 Nov 2023 15:43:17 -0700 Subject: [PATCH 0423/1562] kmemleak: drop (age ) from leak record Patch series "tweak kmemleak report format". These 2 patches make minor changes to the report: 1st strips "age " from output. This makes the output idempotent; unchanging until a new leak is reported. 2nd adds the backtrace.checksum to the "backtrace:" line. This lets a user see repeats without actually reading the whole backtrace. So now the backtrace line looks like this: backtrace (crc 603070071): I surveyed for un-wanted effects upon users: Syzkaller parses kmemleak in executor/common_linux.h: static void check_leaks(char** frames, int nframes) It just counts occurrences of "unreferenced object", specifically it does not look for "age", nor would it choke on "crc" being added. github has 3 repos with "kmemleak" mentioned, all are moribund. gitlab has 0 hits on "kmemleak". This patch (of 2): Displaying age is pretty, but counter-productive; it changes with current-time, so it surrenders idempotency of the output, which breaks simple hash-based cataloging of the records by the user. The trouble: sequential reads, wo new leaks, get new results: :#> sum /sys/kernel/debug/kmemleak 53439 74 /sys/kernel/debug/kmemleak :#> sum /sys/kernel/debug/kmemleak 59066 74 /sys/kernel/debug/kmemleak and age is why (nothing else changes): :#> grep -v age /sys/kernel/debug/kmemleak | sum 58894 67 :#> grep -v age /sys/kernel/debug/kmemleak | sum 58894 67 Since jiffies is already printed in the "comm" line, age adds nothing. Notably, syzkaller reads kmemleak only for "unreferenced object", and won't care about this reform of age-ism. A few moribund github repos mention it, but don't compile. Link: https://lkml.kernel.org/r/20231116224318.124209-1-jim.cromie@gmail.com Link: https://lkml.kernel.org/r/20231116224318.124209-2-jim.cromie@gmail.com Signed-off-by: Jim Cromie Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5501363d6b31..a87b3dd8f3b7 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -355,14 +355,12 @@ static void print_unreferenced(struct seq_file *seq, int i; unsigned long *entries; unsigned int nr_entries; - unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); nr_entries = stack_depot_fetch(object->trace_handle, &entries); warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", - object->comm, object->pid, object->jiffies, - msecs_age / 1000, msecs_age % 1000); + warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); hex_dump_object(seq, object); warn_or_seq_printf(seq, " backtrace:\n"); From 52c5d2bc32133966974edb8d8c49bf7763101622 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Thu, 16 Nov 2023 15:43:18 -0700 Subject: [PATCH 0424/1562] kmemleak: add checksum to backtrace report Change /sys/kernel/debug/kmemleak report format slightly, adding "(extra info)" to the backtrace header: from: " backtrace:" to: " backtrace (crc ):" The allows a user to see recurring backtraces without detailed/careful reading of multiline stacks. So after cycling kmemleak-test a few times, I know some leaks are repeating. bash-5.2# grep backtrace /sys/kernel/debug/kmemleak | wc 62 186 1792 bash-5.2# grep backtrace /sys/kernel/debug/kmemleak | sort -u | wc 37 111 1067 syzkaller parses kmemleak for "unreferenced object" only, so is unaffected by this change. Other github repos are moribund. Link: https://lkml.kernel.org/r/20231116224318.124209-3-jim.cromie@gmail.com Signed-off-by: Jim Cromie Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a87b3dd8f3b7..0fb4dcc3b06a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -362,7 +362,7 @@ static void print_unreferenced(struct seq_file *seq, warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); hex_dump_object(seq, object); - warn_or_seq_printf(seq, " backtrace:\n"); + warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum); for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; From 4d07a037231c985f8c990c9cf1c304bbe31bb764 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:46:59 +0100 Subject: [PATCH 0425/1562] lib/stackdepot: print disabled message only if truly disabled Patch series "stackdepot: allow evicting stack traces", v4. Currently, the stack depot grows indefinitely until it reaches its capacity. Once that happens, the stack depot stops saving new stack traces. This creates a problem for using the stack depot for in-field testing and in production. For such uses, an ideal stack trace storage should: 1. Allow saving fresh stack traces on systems with a large uptime while limiting the amount of memory used to store the traces; 2. Have a low performance impact. Implementing #1 in the stack depot is impossible with the current keep-forever approach. This series targets to address that. Issue #2 is left to be addressed in a future series. This series changes the stack depot implementation to allow evicting unneeded stack traces from the stack depot. The users of the stack depot can do that via new stack_depot_save_flags(STACK_DEPOT_FLAG_GET) and stack_depot_put APIs. Internal changes to the stack depot code include: 1. Storing stack traces in fixed-frame-sized slots (vs precisely-sized slots in the current implementation); the slot size is controlled via CONFIG_STACKDEPOT_MAX_FRAMES (default: 64 frames); 2. Keeping available slots in a freelist (vs keeping an offset to the next free slot); 3. Using a read/write lock for synchronization (vs a lock-free approach combined with a spinlock). This series also integrates the eviction functionality into KASAN: the tag-based modes evict stack traces when the corresponding entry leaves the stack ring, and Generic KASAN evicts stack traces for objects once those leave the quarantine. With KASAN, despite wasting some space on rounding up the size of each stack record, the total memory consumed by stack depot gets saturated due to the eviction of irrelevant stack traces from the stack depot. With the tag-based KASAN modes, the average total amount of memory used for stack traces becomes ~0.5 MB (with the current default stack ring size of 32k entries and the default CONFIG_STACKDEPOT_MAX_FRAMES of 64). With Generic KASAN, the stack traces take up ~1 MB per 1 GB of RAM (as the quarantine's size depends on the amount of RAM). However, with KMSAN, the stack depot ends up using ~4x more memory per a stack trace than before. Thus, for KMSAN, the stack depot capacity is increased accordingly. KMSAN uses a lot of RAM for shadow memory anyway, so the increased stack depot memory usage will not make a significant difference. Other users of the stack depot do not save stack traces as often as KASAN and KMSAN. Thus, the increased memory usage is taken as an acceptable trade-off. In the future, these other users can take advantage of the eviction API to limit the memory waste. There is no measurable boot time performance impact of these changes for KASAN on x86-64. I haven't done any tests for arm64 modes (the stack depot without performance optimizations is not suitable for intended use of those anyway), but I expect a similar result. Obtaining and copying stack trace frames when saving them into stack depot is what takes the most time. This series does not yet provide a way to configure the maximum size of the stack depot externally (e.g. via a command-line parameter). This will be added in a separate series, possibly together with the performance improvement changes. This patch (of 22): Currently, if stack_depot_disable=off is passed to the kernel command-line after stack_depot_disable=on, stack depot prints a message that it is disabled, while it is actually enabled. Fix this by moving printing the disabled message to stack_depot_early_init. Place it before the __stack_depot_early_init_requested check, so that the message is printed even if early stack depot init has not been requested. Also drop the stack_table = NULL assignment from disable_stack_depot, as stack_table is NULL by default. Link: https://lkml.kernel.org/r/cover.1700502145.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/73a25c5fff29f3357cd7a9330e85e09bc8da2cbe.1700502145.git.andreyknvl@google.com Fixes: e1fdc403349c ("lib: stackdepot: add support to disable stack depot") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 2f5aa851834e..0eeaef4f2523 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -101,14 +101,7 @@ static int next_pool_required = 1; static int __init disable_stack_depot(char *str) { - int ret; - - ret = kstrtobool(str, &stack_depot_disabled); - if (!ret && stack_depot_disabled) { - pr_info("disabled\n"); - stack_table = NULL; - } - return 0; + return kstrtobool(str, &stack_depot_disabled); } early_param("stack_depot_disable", disable_stack_depot); @@ -130,6 +123,15 @@ int __init stack_depot_early_init(void) return 0; __stack_depot_early_init_passed = true; + /* + * Print disabled message even if early init has not been requested: + * stack_depot_init() will not print one. + */ + if (stack_depot_disabled) { + pr_info("disabled\n"); + return 0; + } + /* * If KASAN is enabled, use the maximum order: KASAN is frequently used * in fuzzing scenarios, which leads to a large number of different @@ -138,7 +140,11 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_bucket_number_order) stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX; - if (!__stack_depot_early_init_requested || stack_depot_disabled) + /* + * Check if early init has been requested after setting + * stack_bucket_number_order: stack_depot_init() uses its value. + */ + if (!__stack_depot_early_init_requested) return 0; /* From 0c5d44a8142d1ede05943845793d3d8a2f10c338 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:00 +0100 Subject: [PATCH 0426/1562] lib/stackdepot: check disabled flag when fetching Do not try fetching a stack trace from the stack depot if the stack_depot_disabled flag is enabled. Link: https://lkml.kernel.org/r/c3bfa3b7ab00b2e48ab75a3fbb9c67555777cb08.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 0eeaef4f2523..f8a8033e1dc8 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -483,7 +483,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, */ kmsan_unpoison_memory(entries, sizeof(*entries)); - if (!handle) + if (!handle || stack_depot_disabled) return 0; if (parts.pool_index > pool_index_cached) { From 603c000c115b40be75063af1a1e75a3b40d3a523 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:01 +0100 Subject: [PATCH 0427/1562] lib/stackdepot: simplify __stack_depot_save The retval local variable in __stack_depot_save has the union type handle_parts, but the function never uses anything but the union's handle field. Define retval simply as depot_stack_handle_t to simplify the code. Link: https://lkml.kernel.org/r/3b0763c8057a1cf2f200ff250a5f9580ee36a28c.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index f8a8033e1dc8..3e71c8f61c7d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -366,7 +366,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; - union handle_parts retval = { .handle = 0 }; + depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; unsigned long flags; @@ -383,7 +383,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, nr_entries = filter_irq_stacks(entries, nr_entries); if (unlikely(nr_entries == 0) || stack_depot_disabled) - goto fast_exit; + return 0; hash = hash_stack(entries, nr_entries); bucket = &stack_table[hash & stack_hash_mask]; @@ -449,9 +449,8 @@ exit: free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) - retval.handle = found->handle.handle; -fast_exit: - return retval.handle; + handle = found->handle.handle; + return handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); From 5f9ce55e020742e3c86a06941fbe9f37f9c022dd Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:02 +0100 Subject: [PATCH 0428/1562] lib/stackdepot: drop valid bit from handles Stack depot doesn't use the valid bit in handles in any way, so drop it. Link: https://lkml.kernel.org/r/34969bba2ca6e012c6ad071767197dee64dc5723.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 3e71c8f61c7d..46a422d31c1f 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -32,13 +32,12 @@ #define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) -#define DEPOT_VALID_BITS 1 #define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ #define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) #define DEPOT_STACK_ALIGN 4 #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) -#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \ - DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ + STACK_DEPOT_EXTRA_BITS) #define DEPOT_POOLS_CAP 8192 #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ @@ -50,7 +49,6 @@ union handle_parts { struct { u32 pool_index : DEPOT_POOL_INDEX_BITS; u32 offset : DEPOT_OFFSET_BITS; - u32 valid : DEPOT_VALID_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -309,7 +307,6 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->size = size; stack->handle.pool_index = pool_index; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; - stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); pool_offset += required_size; From 83130ab2d8a49e86c70d628d1446a84c8e6ad1a4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:03 +0100 Subject: [PATCH 0429/1562] lib/stackdepot: add depot_fetch_stack helper Add a helper depot_fetch_stack function that fetches the pointer to a stack record. With this change, all static depot_* functions now operate on stack pools and the exported stack_depot_* functions operate on the hash table. Link: https://lkml.kernel.org/r/170d8c202f29dc8e3d5491ee074d1e9e029a46db.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 46a422d31c1f..e41713983cac 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -310,6 +310,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); pool_offset += required_size; + /* * Let KMSAN know the stored stack record is initialized. This shall * prevent false positive reports if instrumented code accesses it. @@ -319,6 +320,32 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } +static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + /* + * READ_ONCE pairs with potential concurrent write in + * depot_alloc_stack(). + */ + int pool_index_cached = READ_ONCE(pool_index); + void *pool; + size_t offset = parts.offset << DEPOT_STACK_ALIGN; + struct stack_record *stack; + + if (parts.pool_index > pool_index_cached) { + WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", + parts.pool_index, pool_index_cached, handle); + return NULL; + } + + pool = stack_pools[parts.pool_index]; + if (!pool) + return NULL; + + stack = pool + offset; + return stack; +} + /* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -462,14 +489,6 @@ EXPORT_SYMBOL_GPL(stack_depot_save); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { - union handle_parts parts = { .handle = handle }; - /* - * READ_ONCE pairs with potential concurrent write in - * depot_alloc_stack. - */ - int pool_index_cached = READ_ONCE(pool_index); - void *pool; - size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; *entries = NULL; @@ -482,15 +501,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle || stack_depot_disabled) return 0; - if (parts.pool_index > pool_index_cached) { - WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pool_index_cached, handle); - return 0; - } - pool = stack_pools[parts.pool_index]; - if (!pool) - return 0; - stack = pool + offset; + stack = depot_fetch_stack(handle); *entries = stack->entries; return stack->size; From fc60e0caa94dd7ca0e97a1d42527f71c9d51cd2d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:04 +0100 Subject: [PATCH 0430/1562] lib/stackdepot: use fixed-sized slots for stack records Instead of storing stack records in stack depot pools one right after another, use fixed-sized slots. Add a new Kconfig option STACKDEPOT_MAX_FRAMES that allows to select the size of the slot in frames. Use 64 as the default value, which is the maximum stack trace size both KASAN and KMSAN use right now. Also add descriptions for other stack depot Kconfig options. This is preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/dce7d030a99ff61022509665187fac45b0827298.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig | 10 ++++++++++ lib/stackdepot.c | 13 +++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 3ea1c830efab..5ddda7c2ed9b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -713,10 +713,20 @@ config ARCH_STACKWALK config STACKDEPOT bool select STACKTRACE + help + Stack depot: stack trace storage that avoids duplication config STACKDEPOT_ALWAYS_INIT bool select STACKDEPOT + help + Always initialize stack depot during early boot + +config STACKDEPOT_MAX_FRAMES + int "Maximum number of frames in trace saved in stack depot" + range 1 256 + default 64 + depends on STACKDEPOT config REF_TRACKER bool diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e41713983cac..682497dbe081 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -58,9 +58,12 @@ struct stack_record { u32 hash; /* Hash in the hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; - unsigned long entries[]; /* Variable-sized array of frames */ + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ }; +#define DEPOT_STACK_RECORD_SIZE \ + ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN) + static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -264,9 +267,7 @@ static struct stack_record * depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; - size_t required_size = struct_size(stack, entries, size); - - required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); + size_t required_size = DEPOT_STACK_RECORD_SIZE; /* Check if there is not enough space in the current pool. */ if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { @@ -301,6 +302,10 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) if (stack_pools[pool_index] == NULL) return NULL; + /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ + if (size > CONFIG_STACKDEPOT_MAX_FRAMES) + size = CONFIG_STACKDEPOT_MAX_FRAMES; + /* Save the stack trace. */ stack = stack_pools[pool_index] + pool_offset; stack->hash = hash; From fcccc41ecb0c96e59c471c389cd708014be2efc8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:05 +0100 Subject: [PATCH 0431/1562] lib/stackdepot: fix and clean-up atomic annotations Drop smp_load_acquire from next_pool_required in depot_init_pool, as both depot_init_pool and the all smp_store_release's to this variable are executed under the stack depot lock. Also simplify and clean up comments accompanying the use of atomic accesses in the stack depot code. Link: https://lkml.kernel.org/r/c118ef044d8db80248d9e1f14592c72e8429e9d9.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 682497dbe081..cfa3c6c7cc2e 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -231,10 +231,10 @@ static void depot_init_pool(void **prealloc) /* * If the next pool is already initialized or the maximum number of * pools is reached, do not use the preallocated memory. - * smp_load_acquire() here pairs with smp_store_release() below and - * in depot_alloc_stack(). + * Access next_pool_required non-atomically, as there are no concurrent + * write accesses to this variable. */ - if (!smp_load_acquire(&next_pool_required)) + if (!next_pool_required) return; /* Check if the current pool is not yet allocated. */ @@ -255,8 +255,8 @@ static void depot_init_pool(void **prealloc) * At this point, either the next pool is initialized or the * maximum number of pools is reached. In either case, take * note that initializing another pool is not required. - * This smp_store_release pairs with smp_load_acquire() above - * and in stack_depot_save(). + * smp_store_release() pairs with smp_load_acquire() in + * stack_depot_save(). */ smp_store_release(&next_pool_required, 0); } @@ -279,7 +279,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * Move on to the next pool. - * WRITE_ONCE pairs with potential concurrent read in + * WRITE_ONCE() pairs with potential concurrent read in * stack_depot_fetch(). */ WRITE_ONCE(pool_index, pool_index + 1); @@ -287,8 +287,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * If the maximum number of pools is not reached, take note * that the next pool needs to initialized. - * smp_store_release() here pairs with smp_load_acquire() in - * stack_depot_save() and depot_init_pool(). + * smp_store_release() pairs with smp_load_acquire() in + * stack_depot_save(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_required, 1); @@ -329,7 +329,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; /* - * READ_ONCE pairs with potential concurrent write in + * READ_ONCE() pairs with potential concurrent write in * depot_alloc_stack(). */ int pool_index_cached = READ_ONCE(pool_index); @@ -419,8 +419,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, /* * Fast path: look the stack trace up without locking. - * The smp_load_acquire() here pairs with smp_store_release() to - * |bucket| below. + * smp_load_acquire() pairs with smp_store_release() to |bucket| below. */ found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); if (found) @@ -430,8 +429,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * Check if another stack pool needs to be initialized. If so, allocate * the memory now - we won't be able to do that under the lock. * - * The smp_load_acquire() here pairs with smp_store_release() to - * |next_pool_inited| in depot_alloc_stack() and depot_init_pool(). + * smp_load_acquire() pairs with smp_store_release() in + * depot_alloc_stack() and depot_init_pool(). */ if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) { /* @@ -457,8 +456,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, if (new) { new->next = *bucket; /* - * This smp_store_release() pairs with - * smp_load_acquire() from |bucket| above. + * smp_store_release() pairs with smp_load_acquire() + * from |bucket| above. */ smp_store_release(bucket, new); found = new; From 94b7d32870298be93b67bceb0470936c54fb2007 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:06 +0100 Subject: [PATCH 0432/1562] lib/stackdepot: rework helpers for depot_alloc_stack Split code in depot_alloc_stack and depot_init_pool into 3 functions: 1. depot_keep_next_pool that keeps preallocated memory for the next pool if required. 2. depot_update_pools that moves on to the next pool if there's no space left in the current pool, uses preallocated memory for the new current pool if required, and calls depot_keep_next_pool otherwise. 3. depot_alloc_stack that calls depot_update_pools and then allocates a stack record as before. This makes it somewhat easier to follow the logic of depot_alloc_stack and also serves as a preparation for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/71fb144d42b701fcb46708d7f4be6801a4a8270e.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 86 +++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index cfa3c6c7cc2e..b3af868627f4 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -225,11 +225,11 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -/* Uses preallocated memory to initialize a new stack depot pool. */ -static void depot_init_pool(void **prealloc) +/* Keeps the preallocated memory to be used for the next stack depot pool. */ +static void depot_keep_next_pool(void **prealloc) { /* - * If the next pool is already initialized or the maximum number of + * If the next pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. * Access next_pool_required non-atomically, as there are no concurrent * write accesses to this variable. @@ -237,44 +237,34 @@ static void depot_init_pool(void **prealloc) if (!next_pool_required) return; - /* Check if the current pool is not yet allocated. */ - if (stack_pools[pool_index] == NULL) { - /* Use the preallocated memory for the current pool. */ - stack_pools[pool_index] = *prealloc; + /* + * Use the preallocated memory for the next pool + * as long as we do not exceed the maximum number of pools. + */ + if (pool_index + 1 < DEPOT_MAX_POOLS) { + stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; - } else { - /* - * Otherwise, use the preallocated memory for the next pool - * as long as we do not exceed the maximum number of pools. - */ - if (pool_index + 1 < DEPOT_MAX_POOLS) { - stack_pools[pool_index + 1] = *prealloc; - *prealloc = NULL; - } - /* - * At this point, either the next pool is initialized or the - * maximum number of pools is reached. In either case, take - * note that initializing another pool is not required. - * smp_store_release() pairs with smp_load_acquire() in - * stack_depot_save(). - */ - smp_store_release(&next_pool_required, 0); } + + /* + * At this point, either the next pool is kept or the maximum + * number of pools is reached. In either case, take note that + * keeping another pool is not required. + * smp_store_release() pairs with smp_load_acquire() in + * stack_depot_save(). + */ + smp_store_release(&next_pool_required, 0); } -/* Allocates a new stack in a stack depot pool. */ -static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +/* Updates references to the current and the next stack depot pools. */ +static bool depot_update_pools(size_t required_size, void **prealloc) { - struct stack_record *stack; - size_t required_size = DEPOT_STACK_RECORD_SIZE; - /* Check if there is not enough space in the current pool. */ if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { /* Bail out if we reached the pool limit. */ if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); - return NULL; + return false; } /* @@ -284,9 +274,10 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) */ WRITE_ONCE(pool_index, pool_index + 1); pool_offset = 0; + /* * If the maximum number of pools is not reached, take note - * that the next pool needs to initialized. + * that the next pool needs to be initialized. * smp_store_release() pairs with smp_load_acquire() in * stack_depot_save(). */ @@ -294,9 +285,30 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) smp_store_release(&next_pool_required, 1); } - /* Assign the preallocated memory to a pool if required. */ + /* Check if the current pool is not yet allocated. */ + if (*prealloc && stack_pools[pool_index] == NULL) { + /* Use the preallocated memory for the current pool. */ + stack_pools[pool_index] = *prealloc; + *prealloc = NULL; + return true; + } + + /* Otherwise, try using the preallocated memory for the next pool. */ if (*prealloc) - depot_init_pool(prealloc); + depot_keep_next_pool(prealloc); + return true; +} + +/* Allocates a new stack in a stack depot pool. */ +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +{ + struct stack_record *stack; + size_t required_size = DEPOT_STACK_RECORD_SIZE; + + /* Update current and next pools if required and possible. */ + if (!depot_update_pools(required_size, prealloc)) + return NULL; /* Check if we have a pool to save the stack trace. */ if (stack_pools[pool_index] == NULL) @@ -330,7 +342,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) union handle_parts parts = { .handle = handle }; /* * READ_ONCE() pairs with potential concurrent write in - * depot_alloc_stack(). + * depot_update_pools(). */ int pool_index_cached = READ_ONCE(pool_index); void *pool; @@ -430,7 +442,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * the memory now - we won't be able to do that under the lock. * * smp_load_acquire() pairs with smp_store_release() in - * depot_alloc_stack() and depot_init_pool(). + * depot_update_pools() and depot_keep_next_pool(). */ if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) { /* @@ -467,7 +479,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * Stack depot already contains this stack trace, but let's * keep the preallocated memory for the future. */ - depot_init_pool(&prealloc); + depot_keep_next_pool(&prealloc); } raw_spin_unlock_irqrestore(&pool_lock, flags); From b6a353d3ebc2b5eea3cab81ed81764bb1dd6f4ab Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:07 +0100 Subject: [PATCH 0433/1562] lib/stackdepot: rename next_pool_required to new_pool_required Rename next_pool_required to new_pool_required. This a purely code readability change: the following patch will change stack depot to store the pointer to the new pool in a separate variable, and "new" seems like a more logical name. Link: https://lkml.kernel.org/r/fd7cd6c6eb250c13ec5d2009d75bb4ddd1470db9.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 49 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b3af868627f4..a38661beab97 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -93,12 +93,11 @@ static size_t pool_offset; static DEFINE_RAW_SPINLOCK(pool_lock); /* * Stack depot tries to keep an extra pool allocated even before it runs out - * of space in the currently used pool. - * This flag marks that this next extra pool needs to be allocated and - * initialized. It has the value 0 when either the next pool is not yet - * initialized or the limit on the number of pools is reached. + * of space in the currently used pool. This flag marks whether this extra pool + * needs to be allocated. It has the value 0 when either an extra pool is not + * yet allocated or if the limit on the number of pools is reached. */ -static int next_pool_required = 1; +static int new_pool_required = 1; static int __init disable_stack_depot(char *str) { @@ -225,20 +224,20 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -/* Keeps the preallocated memory to be used for the next stack depot pool. */ -static void depot_keep_next_pool(void **prealloc) +/* Keeps the preallocated memory to be used for a new stack depot pool. */ +static void depot_keep_new_pool(void **prealloc) { /* - * If the next pool is already saved or the maximum number of + * If a new pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. - * Access next_pool_required non-atomically, as there are no concurrent + * Access new_pool_required non-atomically, as there are no concurrent * write accesses to this variable. */ - if (!next_pool_required) + if (!new_pool_required) return; /* - * Use the preallocated memory for the next pool + * Use the preallocated memory for the new pool * as long as we do not exceed the maximum number of pools. */ if (pool_index + 1 < DEPOT_MAX_POOLS) { @@ -247,13 +246,13 @@ static void depot_keep_next_pool(void **prealloc) } /* - * At this point, either the next pool is kept or the maximum + * At this point, either a new pool is kept or the maximum * number of pools is reached. In either case, take note that * keeping another pool is not required. * smp_store_release() pairs with smp_load_acquire() in * stack_depot_save(). */ - smp_store_release(&next_pool_required, 0); + smp_store_release(&new_pool_required, 0); } /* Updates references to the current and the next stack depot pools. */ @@ -268,7 +267,7 @@ static bool depot_update_pools(size_t required_size, void **prealloc) } /* - * Move on to the next pool. + * Move on to the new pool. * WRITE_ONCE() pairs with potential concurrent read in * stack_depot_fetch(). */ @@ -277,12 +276,12 @@ static bool depot_update_pools(size_t required_size, void **prealloc) /* * If the maximum number of pools is not reached, take note - * that the next pool needs to be initialized. + * that yet another new pool needs to be allocated. * smp_store_release() pairs with smp_load_acquire() in * stack_depot_save(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) - smp_store_release(&next_pool_required, 1); + smp_store_release(&new_pool_required, 1); } /* Check if the current pool is not yet allocated. */ @@ -293,9 +292,9 @@ static bool depot_update_pools(size_t required_size, void **prealloc) return true; } - /* Otherwise, try using the preallocated memory for the next pool. */ + /* Otherwise, try using the preallocated memory for a new pool. */ if (*prealloc) - depot_keep_next_pool(prealloc); + depot_keep_new_pool(prealloc); return true; } @@ -306,7 +305,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) struct stack_record *stack; size_t required_size = DEPOT_STACK_RECORD_SIZE; - /* Update current and next pools if required and possible. */ + /* Update current and new pools if required and possible. */ if (!depot_update_pools(required_size, prealloc)) return NULL; @@ -438,13 +437,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto exit; /* - * Check if another stack pool needs to be initialized. If so, allocate - * the memory now - we won't be able to do that under the lock. + * Check if another stack pool needs to be allocated. If so, allocate + * the memory now: we won't be able to do that under the lock. * * smp_load_acquire() pairs with smp_store_release() in - * depot_update_pools() and depot_keep_next_pool(). + * depot_update_pools() and depot_keep_new_pool(). */ - if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) { + if (unlikely(can_alloc && smp_load_acquire(&new_pool_required))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -477,9 +476,9 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } else if (prealloc) { /* * Stack depot already contains this stack trace, but let's - * keep the preallocated memory for the future. + * keep the preallocated memory for future. */ - depot_keep_next_pool(&prealloc); + depot_keep_new_pool(&prealloc); } raw_spin_unlock_irqrestore(&pool_lock, flags); From a5d21f71715a0459e5313881203f86eefbeefb3b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:08 +0100 Subject: [PATCH 0434/1562] lib/stackdepot: store next pool pointer in new_pool Instead of using the last pointer in stack_pools for storing the pointer to a new pool (which does not yet store any stack records), use a new new_pool variable. This a purely code readability change: it seems more logical to store the pointer to a pool with a special meaning in a dedicated variable. Link: https://lkml.kernel.org/r/448bc18296c16bef95cb3167697be6583dcc8ce3.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index a38661beab97..68c1ac9aa916 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -85,6 +85,8 @@ static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ static void *stack_pools[DEPOT_MAX_POOLS]; +/* Newly allocated pool that is not yet added to stack_pools. */ +static void *new_pool; /* Currently used pool in stack_pools. */ static int pool_index; /* Offset to the unused space in the currently used pool. */ @@ -241,7 +243,7 @@ static void depot_keep_new_pool(void **prealloc) * as long as we do not exceed the maximum number of pools. */ if (pool_index + 1 < DEPOT_MAX_POOLS) { - stack_pools[pool_index + 1] = *prealloc; + new_pool = *prealloc; *prealloc = NULL; } @@ -272,6 +274,8 @@ static bool depot_update_pools(size_t required_size, void **prealloc) * stack_depot_fetch(). */ WRITE_ONCE(pool_index, pool_index + 1); + stack_pools[pool_index] = new_pool; + new_pool = NULL; pool_offset = 0; /* From b29d31885814003245e2e36373bef4ea6721f114 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:09 +0100 Subject: [PATCH 0435/1562] lib/stackdepot: store free stack records in a freelist Instead of using the global pool_offset variable to find a free slot when storing a new stack record, mainlain a freelist of free slots within the allocated stack pools. A global next_stack variable is used as the head of the freelist, and the next field in the stack_record struct is reused as freelist link (when the record is not in the freelist, this field is used as a link in the hash table). This is preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/b9e4c79955c2121b69301778643b203d3fb09ccc.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 131 +++++++++++++++++++++++++++++------------------ 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 68c1ac9aa916..a5eff165c0d5 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -54,8 +54,8 @@ union handle_parts { }; struct stack_record { - struct stack_record *next; /* Link in the hash table */ - u32 hash; /* Hash in the hash table */ + struct stack_record *next; /* Link in hash table or freelist */ + u32 hash; /* Hash in hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ @@ -87,10 +87,10 @@ static unsigned int stack_hash_mask; static void *stack_pools[DEPOT_MAX_POOLS]; /* Newly allocated pool that is not yet added to stack_pools. */ static void *new_pool; -/* Currently used pool in stack_pools. */ -static int pool_index; -/* Offset to the unused space in the currently used pool. */ -static size_t pool_offset; +/* Number of pools in stack_pools. */ +static int pools_num; +/* Next stack in the freelist of stack records within stack_pools. */ +static struct stack_record *next_stack; /* Lock that protects the variables above. */ static DEFINE_RAW_SPINLOCK(pool_lock); /* @@ -226,6 +226,42 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); +/* Initializes a stack depol pool. */ +static void depot_init_pool(void *pool) +{ + const int records_in_pool = DEPOT_POOL_SIZE / DEPOT_STACK_RECORD_SIZE; + int i, offset; + + /* Initialize handles and link stack records to each other. */ + for (i = 0, offset = 0; + offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; + i++, offset += DEPOT_STACK_RECORD_SIZE) { + struct stack_record *stack = pool + offset; + + stack->handle.pool_index = pools_num; + stack->handle.offset = offset >> DEPOT_STACK_ALIGN; + stack->handle.extra = 0; + + if (i < records_in_pool - 1) + stack->next = (void *)stack + DEPOT_STACK_RECORD_SIZE; + else + stack->next = NULL; + } + + /* Link stack records into the freelist. */ + WARN_ON(next_stack); + next_stack = pool; + + /* Save reference to the pool to be used by depot_fetch_stack(). */ + stack_pools[pools_num] = pool; + + /* + * WRITE_ONCE() pairs with potential concurrent read in + * depot_fetch_stack(). + */ + WRITE_ONCE(pools_num, pools_num + 1); +} + /* Keeps the preallocated memory to be used for a new stack depot pool. */ static void depot_keep_new_pool(void **prealloc) { @@ -242,7 +278,7 @@ static void depot_keep_new_pool(void **prealloc) * Use the preallocated memory for the new pool * as long as we do not exceed the maximum number of pools. */ - if (pool_index + 1 < DEPOT_MAX_POOLS) { + if (pools_num < DEPOT_MAX_POOLS) { new_pool = *prealloc; *prealloc = NULL; } @@ -258,45 +294,42 @@ static void depot_keep_new_pool(void **prealloc) } /* Updates references to the current and the next stack depot pools. */ -static bool depot_update_pools(size_t required_size, void **prealloc) +static bool depot_update_pools(void **prealloc) { - /* Check if there is not enough space in the current pool. */ - if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { - /* Bail out if we reached the pool limit. */ - if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return false; - } + /* Check if we still have objects in the freelist. */ + if (next_stack) + goto out_keep_prealloc; - /* - * Move on to the new pool. - * WRITE_ONCE() pairs with potential concurrent read in - * stack_depot_fetch(). - */ - WRITE_ONCE(pool_index, pool_index + 1); - stack_pools[pool_index] = new_pool; + /* Check if we have a new pool saved and use it. */ + if (new_pool) { + depot_init_pool(new_pool); new_pool = NULL; - pool_offset = 0; - /* - * If the maximum number of pools is not reached, take note - * that yet another new pool needs to be allocated. - * smp_store_release() pairs with smp_load_acquire() in - * stack_depot_save(). - */ - if (pool_index + 1 < DEPOT_MAX_POOLS) + /* Take note that we might need a new new_pool. */ + if (pools_num < DEPOT_MAX_POOLS) smp_store_release(&new_pool_required, 1); + + /* Try keeping the preallocated memory for new_pool. */ + goto out_keep_prealloc; } - /* Check if the current pool is not yet allocated. */ - if (*prealloc && stack_pools[pool_index] == NULL) { - /* Use the preallocated memory for the current pool. */ - stack_pools[pool_index] = *prealloc; + /* Bail out if we reached the pool limit. */ + if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return false; + } + + /* Check if we have preallocated memory and use it. */ + if (*prealloc) { + depot_init_pool(*prealloc); *prealloc = NULL; return true; } - /* Otherwise, try using the preallocated memory for a new pool. */ + return false; + +out_keep_prealloc: + /* Keep the preallocated memory for a new pool if required. */ if (*prealloc) depot_keep_new_pool(prealloc); return true; @@ -307,35 +340,35 @@ static struct stack_record * depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; - size_t required_size = DEPOT_STACK_RECORD_SIZE; /* Update current and new pools if required and possible. */ - if (!depot_update_pools(required_size, prealloc)) + if (!depot_update_pools(prealloc)) return NULL; - /* Check if we have a pool to save the stack trace. */ - if (stack_pools[pool_index] == NULL) + /* Check if we have a stack record to save the stack trace. */ + stack = next_stack; + if (!stack) return NULL; + /* Advance the freelist. */ + next_stack = stack->next; + /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ if (size > CONFIG_STACKDEPOT_MAX_FRAMES) size = CONFIG_STACKDEPOT_MAX_FRAMES; /* Save the stack trace. */ - stack = stack_pools[pool_index] + pool_offset; + stack->next = NULL; stack->hash = hash; stack->size = size; - stack->handle.pool_index = pool_index; - stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; - stack->handle.extra = 0; + /* stack->handle is already filled in by depot_init_pool(). */ memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - pool_offset += required_size; /* * Let KMSAN know the stored stack record is initialized. This shall * prevent false positive reports if instrumented code accesses it. */ - kmsan_unpoison_memory(stack, required_size); + kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE); return stack; } @@ -345,16 +378,16 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) union handle_parts parts = { .handle = handle }; /* * READ_ONCE() pairs with potential concurrent write in - * depot_update_pools(). + * depot_init_pool(). */ - int pool_index_cached = READ_ONCE(pool_index); + int pools_num_cached = READ_ONCE(pools_num); void *pool; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; - if (parts.pool_index > pool_index_cached) { + if (parts.pool_index > pools_num_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pool_index_cached, handle); + parts.pool_index, pools_num_cached, handle); return NULL; } From a6cd957021f2bbbe0f02e5c32389eb4c06aa97c8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:10 +0100 Subject: [PATCH 0436/1562] lib/stackdepot: use read/write lock Currently, stack depot uses the following locking scheme: 1. Lock-free accesses when looking up a stack record, which allows to have multiple users to look up records in parallel; 2. Spinlock for protecting the stack depot pools and the hash table when adding a new record. For implementing the eviction of stack traces from stack depot, the lock-free approach is not going to work anymore, as we will need to be able to also remove records from the hash table. Convert the spinlock into a read/write lock, and drop the atomic accesses, as they are no longer required. Looking up stack traces is now protected by the read lock and adding new records - by the write lock. One of the following patches will add a new function for evicting stack records, which will be protected by the write lock as well. With this change, multiple users can still look up records in parallel. This is preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/9f81ffcc4bb422ebb6326a65a770bf1918634cbb.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 87 +++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index a5eff165c0d5..8378b32b5310 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -91,15 +92,15 @@ static void *new_pool; static int pools_num; /* Next stack in the freelist of stack records within stack_pools. */ static struct stack_record *next_stack; -/* Lock that protects the variables above. */ -static DEFINE_RAW_SPINLOCK(pool_lock); /* * Stack depot tries to keep an extra pool allocated even before it runs out * of space in the currently used pool. This flag marks whether this extra pool * needs to be allocated. It has the value 0 when either an extra pool is not * yet allocated or if the limit on the number of pools is reached. */ -static int new_pool_required = 1; +static bool new_pool_required = true; +/* Lock that protects the variables above. */ +static DEFINE_RWLOCK(pool_rwlock); static int __init disable_stack_depot(char *str) { @@ -232,6 +233,8 @@ static void depot_init_pool(void *pool) const int records_in_pool = DEPOT_POOL_SIZE / DEPOT_STACK_RECORD_SIZE; int i, offset; + lockdep_assert_held_write(&pool_rwlock); + /* Initialize handles and link stack records to each other. */ for (i = 0, offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; @@ -254,22 +257,17 @@ static void depot_init_pool(void *pool) /* Save reference to the pool to be used by depot_fetch_stack(). */ stack_pools[pools_num] = pool; - - /* - * WRITE_ONCE() pairs with potential concurrent read in - * depot_fetch_stack(). - */ - WRITE_ONCE(pools_num, pools_num + 1); + pools_num++; } /* Keeps the preallocated memory to be used for a new stack depot pool. */ static void depot_keep_new_pool(void **prealloc) { + lockdep_assert_held_write(&pool_rwlock); + /* * If a new pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. - * Access new_pool_required non-atomically, as there are no concurrent - * write accesses to this variable. */ if (!new_pool_required) return; @@ -287,15 +285,15 @@ static void depot_keep_new_pool(void **prealloc) * At this point, either a new pool is kept or the maximum * number of pools is reached. In either case, take note that * keeping another pool is not required. - * smp_store_release() pairs with smp_load_acquire() in - * stack_depot_save(). */ - smp_store_release(&new_pool_required, 0); + new_pool_required = false; } /* Updates references to the current and the next stack depot pools. */ static bool depot_update_pools(void **prealloc) { + lockdep_assert_held_write(&pool_rwlock); + /* Check if we still have objects in the freelist. */ if (next_stack) goto out_keep_prealloc; @@ -307,7 +305,7 @@ static bool depot_update_pools(void **prealloc) /* Take note that we might need a new new_pool. */ if (pools_num < DEPOT_MAX_POOLS) - smp_store_release(&new_pool_required, 1); + new_pool_required = true; /* Try keeping the preallocated memory for new_pool. */ goto out_keep_prealloc; @@ -341,6 +339,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; + lockdep_assert_held_write(&pool_rwlock); + /* Update current and new pools if required and possible. */ if (!depot_update_pools(prealloc)) return NULL; @@ -376,18 +376,15 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; - /* - * READ_ONCE() pairs with potential concurrent write in - * depot_init_pool(). - */ - int pools_num_cached = READ_ONCE(pools_num); void *pool; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; - if (parts.pool_index > pools_num_cached) { + lockdep_assert_held_read(&pool_rwlock); + + if (parts.pool_index > pools_num) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pools_num_cached, handle); + parts.pool_index, pools_num, handle); return NULL; } @@ -429,6 +426,8 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, { struct stack_record *found; + lockdep_assert_held(&pool_rwlock); + for (found = bucket; found; found = found->next) { if (found->hash == hash && found->size == size && @@ -446,6 +445,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; + bool need_alloc = false; unsigned long flags; u32 hash; @@ -465,22 +465,26 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, hash = hash_stack(entries, nr_entries); bucket = &stack_table[hash & stack_hash_mask]; - /* - * Fast path: look the stack trace up without locking. - * smp_load_acquire() pairs with smp_store_release() to |bucket| below. - */ - found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); - if (found) + read_lock_irqsave(&pool_rwlock, flags); + + /* Fast path: look the stack trace up without full locking. */ + found = find_stack(*bucket, entries, nr_entries, hash); + if (found) { + read_unlock_irqrestore(&pool_rwlock, flags); goto exit; + } + + /* Take note if another stack pool needs to be allocated. */ + if (new_pool_required) + need_alloc = true; + + read_unlock_irqrestore(&pool_rwlock, flags); /* - * Check if another stack pool needs to be allocated. If so, allocate - * the memory now: we won't be able to do that under the lock. - * - * smp_load_acquire() pairs with smp_store_release() in - * depot_update_pools() and depot_keep_new_pool(). + * Allocate memory for a new pool if required now: + * we won't be able to do that under the lock. */ - if (unlikely(can_alloc && smp_load_acquire(&new_pool_required))) { + if (unlikely(can_alloc && need_alloc)) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -494,7 +498,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, prealloc = page_address(page); } - raw_spin_lock_irqsave(&pool_lock, flags); + write_lock_irqsave(&pool_rwlock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { @@ -503,11 +507,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, if (new) { new->next = *bucket; - /* - * smp_store_release() pairs with smp_load_acquire() - * from |bucket| above. - */ - smp_store_release(bucket, new); + *bucket = new; found = new; } } else if (prealloc) { @@ -518,7 +518,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, depot_keep_new_pool(&prealloc); } - raw_spin_unlock_irqrestore(&pool_lock, flags); + write_unlock_irqrestore(&pool_rwlock, flags); exit: if (prealloc) { /* Stack depot didn't use this memory, free it. */ @@ -542,6 +542,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { struct stack_record *stack; + unsigned long flags; *entries = NULL; /* @@ -553,8 +554,12 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle || stack_depot_disabled) return 0; + read_lock_irqsave(&pool_rwlock, flags); + stack = depot_fetch_stack(handle); + read_unlock_irqrestore(&pool_rwlock, flags); + *entries = stack->entries; return stack->size; } From 4805180bc165238c3d845a992a5962ee87097c15 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:11 +0100 Subject: [PATCH 0437/1562] lib/stackdepot: use list_head for stack record links Switch stack_record to use list_head for links in the hash table and in the freelist. This will allow removing entries from the hash table buckets. This is preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/4787d9a584cd33433d9ee1846b17fa3d3e1987ad.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 87 ++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8378b32b5310..4bb0af423f82 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ union handle_parts { }; struct stack_record { - struct stack_record *next; /* Link in hash table or freelist */ + struct list_head list; /* Links in hash table or freelist */ u32 hash; /* Hash in hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; @@ -77,21 +78,21 @@ static bool __stack_depot_early_init_passed __initdata; /* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c -/* Hash table of pointers to stored stack traces. */ -static struct stack_record **stack_table; +/* Hash table of stored stack records. */ +static struct list_head *stack_table; /* Fixed order of the number of table buckets. Used when KASAN is enabled. */ static unsigned int stack_bucket_number_order; /* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; -/* Array of memory regions that store stack traces. */ +/* Array of memory regions that store stack records. */ static void *stack_pools[DEPOT_MAX_POOLS]; /* Newly allocated pool that is not yet added to stack_pools. */ static void *new_pool; /* Number of pools in stack_pools. */ static int pools_num; -/* Next stack in the freelist of stack records within stack_pools. */ -static struct stack_record *next_stack; +/* Freelist of stack records within stack_pools. */ +static LIST_HEAD(free_stacks); /* * Stack depot tries to keep an extra pool allocated even before it runs out * of space in the currently used pool. This flag marks whether this extra pool @@ -116,6 +117,15 @@ void __init stack_depot_request_early_init(void) __stack_depot_early_init_requested = true; } +/* Initialize list_head's within the hash table. */ +static void init_stack_table(unsigned long entries) +{ + unsigned long i; + + for (i = 0; i < entries; i++) + INIT_LIST_HEAD(&stack_table[i]); +} + /* Allocates a hash table via memblock. Can only be used during early boot. */ int __init stack_depot_early_init(void) { @@ -152,16 +162,16 @@ int __init stack_depot_early_init(void) /* * If stack_bucket_number_order is not set, leave entries as 0 to rely - * on the automatic calculations performed by alloc_large_system_hash. + * on the automatic calculations performed by alloc_large_system_hash(). */ if (stack_bucket_number_order) entries = 1UL << stack_bucket_number_order; pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", - sizeof(struct stack_record *), + sizeof(struct list_head), entries, STACK_HASH_TABLE_SCALE, - HASH_EARLY | HASH_ZERO, + HASH_EARLY, NULL, &stack_hash_mask, 1UL << STACK_BUCKET_NUMBER_ORDER_MIN, @@ -171,6 +181,14 @@ int __init stack_depot_early_init(void) stack_depot_disabled = true; return -ENOMEM; } + if (!entries) { + /* + * Obtain the number of entries that was calculated by + * alloc_large_system_hash(). + */ + entries = stack_hash_mask + 1; + } + init_stack_table(entries); return 0; } @@ -211,7 +229,7 @@ int stack_depot_init(void) entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX; pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); - stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -219,6 +237,7 @@ int stack_depot_init(void) goto out_unlock; } stack_hash_mask = entries - 1; + init_stack_table(entries); out_unlock: mutex_unlock(&stack_depot_init_mutex); @@ -230,31 +249,24 @@ EXPORT_SYMBOL_GPL(stack_depot_init); /* Initializes a stack depol pool. */ static void depot_init_pool(void *pool) { - const int records_in_pool = DEPOT_POOL_SIZE / DEPOT_STACK_RECORD_SIZE; - int i, offset; + int offset; lockdep_assert_held_write(&pool_rwlock); - /* Initialize handles and link stack records to each other. */ - for (i = 0, offset = 0; - offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; - i++, offset += DEPOT_STACK_RECORD_SIZE) { + WARN_ON(!list_empty(&free_stacks)); + + /* Initialize handles and link stack records into the freelist. */ + for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; + offset += DEPOT_STACK_RECORD_SIZE) { struct stack_record *stack = pool + offset; stack->handle.pool_index = pools_num; stack->handle.offset = offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; - if (i < records_in_pool - 1) - stack->next = (void *)stack + DEPOT_STACK_RECORD_SIZE; - else - stack->next = NULL; + list_add(&stack->list, &free_stacks); } - /* Link stack records into the freelist. */ - WARN_ON(next_stack); - next_stack = pool; - /* Save reference to the pool to be used by depot_fetch_stack(). */ stack_pools[pools_num] = pool; pools_num++; @@ -295,7 +307,7 @@ static bool depot_update_pools(void **prealloc) lockdep_assert_held_write(&pool_rwlock); /* Check if we still have objects in the freelist. */ - if (next_stack) + if (!list_empty(&free_stacks)) goto out_keep_prealloc; /* Check if we have a new pool saved and use it. */ @@ -346,19 +358,18 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return NULL; /* Check if we have a stack record to save the stack trace. */ - stack = next_stack; - if (!stack) + if (list_empty(&free_stacks)) return NULL; - /* Advance the freelist. */ - next_stack = stack->next; + /* Get and unlink the first entry from the freelist. */ + stack = list_first_entry(&free_stacks, struct stack_record, list); + list_del(&stack->list); /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ if (size > CONFIG_STACKDEPOT_MAX_FRAMES) size = CONFIG_STACKDEPOT_MAX_FRAMES; /* Save the stack trace. */ - stack->next = NULL; stack->hash = hash; stack->size = size; /* stack->handle is already filled in by depot_init_pool(). */ @@ -420,15 +431,17 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, } /* Finds a stack in a bucket of the hash table. */ -static inline struct stack_record *find_stack(struct stack_record *bucket, +static inline struct stack_record *find_stack(struct list_head *bucket, unsigned long *entries, int size, u32 hash) { + struct list_head *pos; struct stack_record *found; lockdep_assert_held(&pool_rwlock); - for (found = bucket; found; found = found->next) { + list_for_each(pos, bucket) { + found = list_entry(pos, struct stack_record, list); if (found->hash == hash && found->size == size && !stackdepot_memcmp(entries, found->entries, size)) @@ -441,7 +454,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags, bool can_alloc) { - struct stack_record *found = NULL, **bucket; + struct list_head *bucket; + struct stack_record *found = NULL; depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; @@ -468,7 +482,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, read_lock_irqsave(&pool_rwlock, flags); /* Fast path: look the stack trace up without full locking. */ - found = find_stack(*bucket, entries, nr_entries, hash); + found = find_stack(bucket, entries, nr_entries, hash); if (found) { read_unlock_irqrestore(&pool_rwlock, flags); goto exit; @@ -500,14 +514,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, write_lock_irqsave(&pool_rwlock, flags); - found = find_stack(*bucket, entries, nr_entries, hash); + found = find_stack(bucket, entries, nr_entries, hash); if (!found) { struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { - new->next = *bucket; - *bucket = new; + list_add(&new->list, bucket); found = new; } } else if (prealloc) { From 3bddc3100c20139341212acdb8c472c3f07af6a8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:12 +0100 Subject: [PATCH 0438/1562] kmsan: use stack_depot_save instead of __stack_depot_save Make KMSAN use stack_depot_save instead of __stack_depot_save, as it always passes true to __stack_depot_save as the last argument. Link: https://lkml.kernel.org/r/18092240699efdc6acd78b51e41ea782953e6c8d.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index c19f47af0424..cf2d70e9c9a5 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -76,7 +76,7 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, /* Don't sleep. */ flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); - handle = __stack_depot_save(entries, nr_entries, flags, true); + handle = stack_depot_save(entries, nr_entries, flags); return stack_depot_set_extra_bits(handle, extra); } @@ -185,11 +185,10 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) /* * @entries is a local var in non-instrumented code, so KMSAN does not * know it is initialized. Explicitly unpoison it to avoid false - * positives when __stack_depot_save() passes it to instrumented code. + * positives when stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - handle = __stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH, - true); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH); return stack_depot_set_extra_bits(handle, extra_bits); } From 022012dcf44209074af97b6ae531a10c08736b31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:13 +0100 Subject: [PATCH 0439/1562] lib/stackdepot, kasan: add flags to __stack_depot_save and rename Change the bool can_alloc argument of __stack_depot_save to a u32 argument that accepts a set of flags. The following patch will add another flag to stack_depot_save_flags besides the existing STACK_DEPOT_FLAG_CAN_ALLOC. Also rename the function to stack_depot_save_flags, as __stack_depot_save is a cryptic name, Link: https://lkml.kernel.org/r/645fa15239621eebbd3a10331e5864b718839512.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++++++++++++++++++++++----------- lib/stackdepot.c | 16 +++++++++++----- mm/kasan/common.c | 7 ++++--- mm/kasan/generic.c | 9 +++++---- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 3 ++- 6 files changed, 48 insertions(+), 25 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e58306783d8e..0b262e14144e 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -32,6 +32,17 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +typedef u32 depot_flags_t; + +/* + * Flags that can be passed to stack_depot_save_flags(); see the comment next + * to its declaration for more details. + */ +#define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) + +#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) + /* * Using stack depot requires its initialization, which can be done in 3 ways: * @@ -69,31 +80,34 @@ static inline int stack_depot_early_init(void) { return 0; } #endif /** - * __stack_depot_save - Save a stack trace to stack depot + * stack_depot_save_flags - Save a stack trace to stack depot * * @entries: Pointer to the stack trace * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) + * @depot_flags: Stack depot flags * - * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, stack depot can replenish the stack pools in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. + * Saves a stack trace from @entries array of size @nr_entries. + * + * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can + * replenish the stack pools in case no space is left (allocates using GFP + * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and + * fails if no space is left to store the stack trace. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * - * Context: Any context, but setting @can_alloc to %false is required if + * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if * alloc_pages() cannot be used from the current context. Currently * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * * Return: Handle of the stack struct stored in depot, 0 on failure */ -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t gfp_flags, bool can_alloc); +depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, + depot_flags_t depot_flags); /** * stack_depot_save - Save a stack trace to stack depot @@ -103,7 +117,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. + * See stack_depot_save_flags() for more details. * * Return: Handle of the stack trace stored in depot, 0 on failure */ diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4bb0af423f82..59d61d5c09a7 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -450,19 +450,24 @@ static inline struct stack_record *find_stack(struct list_head *bucket, return NULL; } -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t alloc_flags, bool can_alloc) +depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags, + depot_flags_t depot_flags) { struct list_head *bucket; struct stack_record *found = NULL; depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; + bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; bool need_alloc = false; unsigned long flags; u32 hash; + if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK)) + return 0; + /* * If this stack trace is from an interrupt, including anything before * interrupt entry usually leads to unbounded stack depot growth. @@ -541,13 +546,14 @@ exit: handle = found->handle.handle; return handle; } -EXPORT_SYMBOL_GPL(__stack_depot_save); +EXPORT_SYMBOL_GPL(stack_depot_save_flags); depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return stack_depot_save_flags(entries, nr_entries, alloc_flags, + STACK_DEPOT_FLAG_CAN_ALLOC); } EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 256930da578a..825a0240ec02 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -37,19 +38,19 @@ struct slab *kasan_addr_to_slab(const void *addr) return NULL; } -depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) +depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return stack_depot_save_flags(entries, nr_entries, flags, depot_flags); } void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags, true); + track->stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 4d837ab83f08..5d168c9afb32 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -472,7 +473,7 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, bool can_alloc) +static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -489,17 +490,17 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc); + alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, true); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, false); + return __kasan_record_aux_stack(addr, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8b06bab5c406..b29d46b83d1f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -368,7 +368,7 @@ static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif -depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); +depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 7dcfe341d48e..4fd32121b0fd 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object, struct kasan_stack_ring_entry *entry; void *old_ptr; - stack = kasan_save_stack(gfp_flags, true); + stack = kasan_save_stack(gfp_flags, STACK_DEPOT_FLAG_CAN_ALLOC); /* * Prevent save_stack_info() from modifying stack ring From 410b764f89f59cce858d94fc781b68c1f27a0ca9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:14 +0100 Subject: [PATCH 0440/1562] lib/stackdepot: add refcount for records Add a reference counter for how many times a stack records has been added to stack depot. Add a new STACK_DEPOT_FLAG_GET flag to stack_depot_save_flags that instructs the stack depot to increment the refcount. Do not yet decrement the refcount; this is implemented in one of the following patches. Do not yet enable any users to use the flag to avoid overflowing the refcount. This is preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/a3fc14a2359d019d2a008d4ff8b46a665371ffee.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 13 ++++++++++--- lib/stackdepot.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 0b262e14144e..611716702d73 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -39,8 +39,9 @@ typedef u32 depot_flags_t; * to its declaration for more details. */ #define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) +#define STACK_DEPOT_FLAG_GET ((depot_flags_t)0x0002) -#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_NUM 2 #define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) /* @@ -94,6 +95,9 @@ static inline int stack_depot_early_init(void) { return 0; } * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and * fails if no space is left to store the stack trace. * + * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment + * the refcount on the saved stack trace if it already exists in stack depot. + * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * @@ -116,8 +120,11 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See stack_depot_save_flags() for more details. + * Does not increment the refcount on the saved stack trace; see + * stack_depot_save_flags() for more details. + * + * Context: Contexts where allocations via alloc_pages() are allowed; + * see stack_depot_save_flags() for more details. * * Return: Handle of the stack trace stored in depot, 0 on failure */ diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 59d61d5c09a7..911dee11bf39 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,7 @@ struct stack_record { u32 hash; /* Hash in hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; + refcount_t count; unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ }; @@ -373,6 +375,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->hash = hash; stack->size = size; /* stack->handle is already filled in by depot_init_pool(). */ + refcount_set(&stack->count, 1); memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); /* @@ -489,6 +492,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, /* Fast path: look the stack trace up without full locking. */ found = find_stack(bucket, entries, nr_entries, hash); if (found) { + if (depot_flags & STACK_DEPOT_FLAG_GET) + refcount_inc(&found->count); read_unlock_irqrestore(&pool_rwlock, flags); goto exit; } @@ -528,12 +533,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, list_add(&new->list, bucket); found = new; } - } else if (prealloc) { + } else { + if (depot_flags & STACK_DEPOT_FLAG_GET) + refcount_inc(&found->count); /* * Stack depot already contains this stack trace, but let's * keep the preallocated memory for future. */ - depot_keep_new_pool(&prealloc); + if (prealloc) + depot_keep_new_pool(&prealloc); } write_unlock_irqrestore(&pool_rwlock, flags); From 108be8def46e9422f5a5abc96b0ab8fb6b3fb344 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:15 +0100 Subject: [PATCH 0441/1562] lib/stackdepot: allow users to evict stack traces Add stack_depot_put, a function that decrements the reference counter on a stack record and removes it from the stack depot once the counter reaches 0. Internally, when removing a stack record, the function unlinks it from the hash table bucket and returns to the freelist. With this change, the users of stack depot can call stack_depot_put when keeping a stack trace in the stack depot is not needed anymore. This allows avoiding polluting the stack depot with irrelevant stack traces and thus have more space to store the relevant ones before the stack depot reaches its capacity. Link: https://lkml.kernel.org/r/1d1ad5692ee43d4fc2b3fd9d221331d30b36123f.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 ++++++++++++++ lib/stackdepot.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 611716702d73..a6796f178913 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -97,6 +97,8 @@ static inline int stack_depot_early_init(void) { return 0; } * * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment * the refcount on the saved stack trace if it already exists in stack depot. + * Users of this flag must also call stack_depot_put() when keeping the stack + * trace is no longer required to avoid overflowing the refcount. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. @@ -162,6 +164,18 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_put - Drop a reference to a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * + * The stack trace is evicted from stack depot once all references to it have + * been dropped (once the number of stack_depot_evict() calls matches the + * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for + * this stack trace). + */ +void stack_depot_put(depot_stack_handle_t handle); + /** * stack_depot_set_extra_bits - Set extra bits in a stack depot handle * diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 911dee11bf39..c1b31160f4b4 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -394,7 +394,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; - lockdep_assert_held_read(&pool_rwlock); + lockdep_assert_held(&pool_rwlock); if (parts.pool_index > pools_num) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", @@ -410,6 +410,14 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) return stack; } +/* Links stack into the freelist. */ +static void depot_free_stack(struct stack_record *stack) +{ + lockdep_assert_held_write(&pool_rwlock); + + list_add(&stack->list, &free_stacks); +} + /* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -592,6 +600,33 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, } EXPORT_SYMBOL_GPL(stack_depot_fetch); +void stack_depot_put(depot_stack_handle_t handle) +{ + struct stack_record *stack; + unsigned long flags; + + if (!handle || stack_depot_disabled) + return; + + write_lock_irqsave(&pool_rwlock, flags); + + stack = depot_fetch_stack(handle); + if (WARN_ON(!stack)) + goto out; + + if (refcount_dec_and_test(&stack->count)) { + /* Unlink stack from the hash table. */ + list_del(&stack->list); + + /* Free stack. */ + depot_free_stack(stack); + } + +out: + write_unlock_irqrestore(&pool_rwlock, flags); +} +EXPORT_SYMBOL_GPL(stack_depot_put); + void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; From f3b5979862994089005d48ad2ce5b6a9735981fe Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:16 +0100 Subject: [PATCH 0442/1562] kasan: remove atomic accesses to stack ring entries Remove the atomic accesses to entry fields in save_stack_info and kasan_complete_mode_report_info for tag-based KASAN modes. These atomics are not required, as the read/write lock prevents the entries from being read (in kasan_complete_mode_report_info) while being written (in save_stack_info) and the try_cmpxchg prevents the same entry from being rewritten (in save_stack_info) in the unlikely case of wrapping during writing. Link: https://lkml.kernel.org/r/29f59126d9845c5257b6c29cd7ad113b16f19f47.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/report_tags.c | 25 +++++++------------------ mm/kasan/tags.c | 13 +++++-------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 8b8bfdb3cfdb..78abdcde5da9 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -31,10 +31,6 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) unsigned long flags; u64 pos; struct kasan_stack_ring_entry *entry; - void *ptr; - u32 pid; - depot_stack_handle_t stack; - bool is_free; bool alloc_found = false, free_found = false; if ((!info->cache || !info->object) && !info->bug_type) { @@ -61,18 +57,11 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) entry = &stack_ring.entries[i % stack_ring.size]; - /* Paired with smp_store_release() in save_stack_info(). */ - ptr = (void *)smp_load_acquire(&entry->ptr); - - if (kasan_reset_tag(ptr) != info->object || - get_tag(ptr) != get_tag(info->access_addr)) + if (kasan_reset_tag(entry->ptr) != info->object || + get_tag(entry->ptr) != get_tag(info->access_addr)) continue; - pid = READ_ONCE(entry->pid); - stack = READ_ONCE(entry->stack); - is_free = READ_ONCE(entry->is_free); - - if (is_free) { + if (entry->is_free) { /* * Second free of the same object. * Give up on trying to find the alloc entry. @@ -80,8 +69,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) if (free_found) break; - info->free_track.pid = pid; - info->free_track.stack = stack; + info->free_track.pid = entry->pid; + info->free_track.stack = entry->stack; free_found = true; /* @@ -95,8 +84,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) if (alloc_found) break; - info->alloc_track.pid = pid; - info->alloc_track.stack = stack; + info->alloc_track.pid = entry->pid; + info->alloc_track.stack = entry->stack; alloc_found = true; /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 4fd32121b0fd..b6c017e670d8 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -121,15 +121,12 @@ next: if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) goto next; /* Busy slot. */ - WRITE_ONCE(entry->size, cache->object_size); - WRITE_ONCE(entry->pid, current->pid); - WRITE_ONCE(entry->stack, stack); - WRITE_ONCE(entry->is_free, is_free); + entry->size = cache->object_size; + entry->pid = current->pid; + entry->stack = stack; + entry->is_free = is_free; - /* - * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). - */ - smp_store_release(&entry->ptr, (s64)object); + entry->ptr = object; read_unlock_irqrestore(&stack_ring.lock, flags); } From 7d88e4f768b0fdb85b68f0e4679bb10fdb05c808 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:17 +0100 Subject: [PATCH 0443/1562] kasan: check object_size in kasan_complete_mode_report_info Check the object size when looking up entries in the stack ring. If the size of the object for which a report is being printed does not match the size of the object for which a stack trace has been saved in the stack ring, the saved stack trace is irrelevant. Link: https://lkml.kernel.org/r/68c6948175aadd7e7e7deea61725103d64a4528f.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/report_tags.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 78abdcde5da9..55154743f915 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -7,6 +7,7 @@ #include #include "kasan.h" +#include "../slab.h" extern struct kasan_stack_ring stack_ring; @@ -58,7 +59,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) entry = &stack_ring.entries[i % stack_ring.size]; if (kasan_reset_tag(entry->ptr) != info->object || - get_tag(entry->ptr) != get_tag(info->access_addr)) + get_tag(entry->ptr) != get_tag(info->access_addr) || + info->cache->object_size != entry->size) continue; if (entry->is_free) { From f816938bff1f772ce7949e5747734be27ecf7f4d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:18 +0100 Subject: [PATCH 0444/1562] kasan: use stack_depot_put for tag-based modes Make tag-based KASAN modes evict stack traces from the stack depot once they are evicted from the stack ring. Internally, pass STACK_DEPOT_FLAG_GET to stack_depot_save_flags (via kasan_save_stack) to increment the refcount when saving a new entry to stack ring and call stack_depot_put when removing an entry from stack ring. Link: https://lkml.kernel.org/r/b4773e5c1b0b9df6826ec0b65c1923feadfa78e5.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/tags.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b6c017e670d8..739ae997463d 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -97,12 +97,13 @@ static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) { unsigned long flags; - depot_stack_handle_t stack; + depot_stack_handle_t stack, old_stack; u64 pos; struct kasan_stack_ring_entry *entry; void *old_ptr; - stack = kasan_save_stack(gfp_flags, STACK_DEPOT_FLAG_CAN_ALLOC); + stack = kasan_save_stack(gfp_flags, + STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); /* * Prevent save_stack_info() from modifying stack ring @@ -121,6 +122,8 @@ next: if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) goto next; /* Busy slot. */ + old_stack = entry->stack; + entry->size = cache->object_size; entry->pid = current->pid; entry->stack = stack; @@ -129,6 +132,9 @@ next: entry->ptr = object; read_unlock_irqrestore(&stack_ring.lock, flags); + + if (old_stack) + stack_depot_put(old_stack); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) From 2d5524635b00fc90016577e1a18c21682b1bb913 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 23 Nov 2023 00:12:02 +0100 Subject: [PATCH 0445/1562] slub, kasan: improve interaction of KASAN and slub_debug poisoning When both KASAN and slub_debug are enabled, when a free object is being prepared in setup_object, slub_debug poisons the object data before KASAN initializes its per-object metadata. Right now, in setup_object, KASAN only initializes the alloc metadata, which is always stored outside of the object. slub_debug is aware of this and it skips poisoning and checking that memory area. However, with the following patch in this series, KASAN also starts initializing its free medata in setup_object. As this metadata might be stored within the object, this initialization might overwrite the slub_debug poisoning. This leads to slub_debug reports. Thus, skip checking slub_debug poisoning of the object data area that overlaps with the in-object KASAN free metadata. Also make slub_debug poisoning of tail kmalloc redzones more precise when KASAN is enabled: slub_debug can still poison and check the tail kmalloc allocation area that comes after the KASAN free metadata. Link: https://lkml.kernel.org/r/20231122231202.121277-1-andrey.konovalov@linux.dev Signed-off-by: Andrey Konovalov Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Feng Tang Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/slub.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 63d281dfacdb..782bd8a6bd34 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -870,20 +870,20 @@ static inline void set_orig_size(struct kmem_cache *s, void *object, unsigned int orig_size) { void *p = kasan_reset_tag(object); + unsigned int kasan_meta_size; if (!slub_debug_orig_size(s)) return; -#ifdef CONFIG_KASAN_GENERIC /* - * KASAN could save its free meta data in object's data area at - * offset 0, if the size is larger than 'orig_size', it will - * overlap the data redzone in [orig_size+1, object_size], and - * the check should be skipped. + * KASAN can save its free meta data inside of the object at offset 0. + * If this meta data size is larger than 'orig_size', it will overlap + * the data redzone in [orig_size+1, object_size]. Thus, we adjust + * 'orig_size' to be as at least as big as KASAN's meta data. */ - if (kasan_metadata_size(s, true) > orig_size) - orig_size = s->object_size; -#endif + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size > orig_size) + orig_size = kasan_meta_size; p += get_info_end(s); p += sizeof(struct track) * 2; @@ -1192,7 +1192,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, { u8 *p = object; u8 *endobject = object + s->object_size; - unsigned int orig_size; + unsigned int orig_size, kasan_meta_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", @@ -1222,12 +1222,23 @@ static int check_object(struct kmem_cache *s, struct slab *slab, } if (s->flags & SLAB_POISON) { - if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && - (!check_bytes_and_report(s, slab, p, "Poison", p, - POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1))) - return 0; + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) { + /* + * KASAN can save its free meta data inside of the + * object at offset 0. Thus, skip checking the part of + * the redzone that overlaps with the meta data. + */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size < s->object_size - 1 && + !check_bytes_and_report(s, slab, p, "Poison", + p + kasan_meta_size, POISON_FREE, + s->object_size - kasan_meta_size - 1)) + return 0; + if (kasan_meta_size < s->object_size && + !check_bytes_and_report(s, slab, p, "End Poison", + p + s->object_size - 1, POISON_END, 1)) + return 0; + } /* * check_pad_bytes cleans up on its own. */ From 773688a6cb24b0b3c2ba40354d883348a2befa38 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:19 +0100 Subject: [PATCH 0446/1562] kasan: use stack_depot_put for Generic mode Evict alloc/free stack traces from the stack depot for Generic KASAN once they are evicted from the quaratine. For auxiliary stack traces, evict the oldest stack trace once a new one is saved (KASAN only keeps references to the last two). Also evict all saved stack traces on krealloc. To avoid double-evicting and mis-evicting stack traces (in case KASAN's metadata was corrupted), reset KASAN's per-object metadata that stores stack depot handles when the object is initialized and when it's evicted from the quarantine. Note that stack_depot_put is no-op if the handle is 0. Link: https://lkml.kernel.org/r/5cef104d9b842899489b4054fe8d1339a71acee0.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/common.c | 3 ++- mm/kasan/generic.c | 22 ++++++++++++++++++---- mm/kasan/quarantine.c | 26 ++++++++++++++++++++------ 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 825a0240ec02..b5d8bd26fced 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -50,7 +50,8 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags) void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); + track->stack = kasan_save_stack(flags, + STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5d168c9afb32..50cc519e23f4 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -449,10 +449,14 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) __memset(alloc_meta, 0, sizeof(*alloc_meta)); + free_meta = kasan_get_free_meta(cache, object); + if (free_meta) + __memset(free_meta, 0, sizeof(*free_meta)); } size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) @@ -489,18 +493,20 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) if (!alloc_meta) return; + stack_depot_put(alloc_meta->aux_stack[1]); alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); + return __kasan_record_aux_stack(addr, + STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, 0); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -508,8 +514,16 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) struct kasan_alloc_meta *alloc_meta; alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); + if (!alloc_meta) + return; + + /* Evict previous stack traces (might exist for krealloc). */ + stack_depot_put(alloc_meta->alloc_track.stack); + stack_depot_put(alloc_meta->aux_stack[0]); + stack_depot_put(alloc_meta->aux_stack[1]); + __memset(alloc_meta, 0, sizeof(*alloc_meta)); + + kasan_set_track(&alloc_meta->alloc_track, flags); } void kasan_save_free_info(struct kmem_cache *cache, void *object) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index ca4529156735..265ca2bbe2dd 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -143,11 +143,22 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); - struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); + struct kasan_alloc_meta *alloc_meta = kasan_get_alloc_meta(cache, object); + struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object); unsigned long flags; - if (IS_ENABLED(CONFIG_SLAB)) - local_irq_save(flags); + if (alloc_meta) { + stack_depot_put(alloc_meta->alloc_track.stack); + stack_depot_put(alloc_meta->aux_stack[0]); + stack_depot_put(alloc_meta->aux_stack[1]); + __memset(alloc_meta, 0, sizeof(*alloc_meta)); + } + + if (free_meta && + *(u8 *)kasan_mem_to_shadow(object) == KASAN_SLAB_FREETRACK) { + stack_depot_put(free_meta->free_track.stack); + free_meta->free_track.stack = 0; + } /* * If init_on_free is enabled and KASAN's free metadata is stored in @@ -157,14 +168,17 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) */ if (slab_want_init_on_free(cache) && cache->kasan_info.free_meta_offset == 0) - memzero_explicit(meta, sizeof(*meta)); + memzero_explicit(free_meta, sizeof(*free_meta)); /* - * As the object now gets freed from the quarantine, assume that its - * free track is no longer valid. + * As the object now gets freed from the quarantine, + * take note that its free track is no longer exists. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; + if (IS_ENABLED(CONFIG_SLAB)) + local_irq_save(flags); + ___cache_free(cache, object, _THIS_IP_); if (IS_ENABLED(CONFIG_SLAB)) From bd9d9624b7136b69d892597b6a8cc482341e415a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:20 +0100 Subject: [PATCH 0447/1562] lib/stackdepot: adjust DEPOT_POOLS_CAP for KMSAN KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack traces. As KMSAN does not support evicting stack traces from the stack depot, the stack depot capacity might be reached quickly with large stack records. Adjust the maximum number of stack depot pools for this case. The average size of a stack trace saved into the stack depot is ~16 frames. Thus, adjust the maximum pools number accordingly to keep the maximum number of stack traces that can be saved into the stack depot similar to the one that was allowed before the stack trace eviction changes. Link: https://lkml.kernel.org/r/301a115cf7ce8ddb42ef6de9151c2bb76ba728fc.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c1b31160f4b4..870cce2f4cbd 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -41,7 +41,17 @@ #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) +#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32 +/* + * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack + * traces. As KMSAN does not support evicting stack traces from the stack + * depot, the stack depot capacity might be reached quickly with large stack + * records. Adjust the maximum number of stack depot pools for this case. + */ +#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16)) +#else #define DEPOT_POOLS_CAP 8192 +#endif #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) From 95a2ac937013cc3aaaea02abcdd167b96874548d Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 20 Nov 2023 15:53:54 +0100 Subject: [PATCH 0448/1562] mm: use vmem_altmap code without CONFIG_ZONE_DEVICE vmem_altmap_free() and vmem_altmap_offset() could be utlized without CONFIG_ZONE_DEVICE enabled. For example, mm/memory_hotplug.c:__add_pages() relies on that. The altmap is no longer restricted to ZONE_DEVICE handling, but instead depends on CONFIG_SPARSEMEM_VMEMMAP. When CONFIG_SPARSEMEM_VMEMMAP is disabled, these functions are defined as inline stubs, ensuring compatibility with configurations that do not use sparsemem vmemmap. Without it, lkp reported the following: ld: arch/x86/mm/init_64.o: in function `remove_pagetable': init_64.c:(.meminit.text+0xfc7): undefined reference to `vmem_altmap_free' Link: https://lkml.kernel.org/r/20231120145354.308999-4-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311180545.VeyRXEDq-lkp@intel.com/ Reviewed-by: Gerald Schaefer Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ------------ include/linux/mm.h | 26 ++++++++++++++++++++++++++ mm/memremap.c | 14 +------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1314d9c5f05b..744c830f4b13 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, @@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) return false; } -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - return 0; -} - -static inline void vmem_altmap_free(struct vmem_altmap *altmap, - unsigned long nr_pfns) -{ -} - /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 13a090271716..a422cc123a2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3871,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, diff --git a/mm/memremap.c b/mm/memremap.c index bee85560a243..9531faa92a7c 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -422,19 +423,6 @@ void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) } EXPORT_SYMBOL_GPL(devm_memunmap_pages); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - /* number of pfns from base where pfn_to_page() is valid */ - if (altmap) - return altmap->reserve + altmap->free; - return 0; -} - -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) -{ - altmap->alloc -= nr_pfns; -} - /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map From 2f7537620f383de121eaeb25f3e073a27831d086 Mon Sep 17 00:00:00 2001 From: Fabio De Francesco Date: Mon, 20 Nov 2023 15:15:27 +0100 Subject: [PATCH 0449/1562] mm/util: use kmap_local_page() in memcmp_pages() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in memcmp_pages(). kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In memcmp_pages(), the block of code between the mapping and un-mapping does not depend on the above-mentioned side effects of kmap_aatomic(), so that mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231120141554.6612-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/util.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/util.c b/mm/util.c index 744b4d7e3fae..5a6a9802583b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1047,11 +1047,11 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) char *addr1, *addr2; int ret; - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); + addr1 = kmap_local_page(page1); + addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); + kunmap_local(addr2); + kunmap_local(addr1); return ret; } From b33519896664f66358cec60f6b308d80a60d1c96 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 20 Nov 2023 15:18:44 +0100 Subject: [PATCH 0450/1562] mm/ksm: use kmap_local_page() in calc_checksum() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in calc_checksum(). kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In calc_checksum(), the block of code between the mapping and un-mapping does not depend on the above-mentioned side effects of kmap_aatomic(), so that a mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231120141855.6761-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/ksm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 6a831009b4cb..5d60d5385de6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1099,9 +1099,9 @@ error: static u32 calc_checksum(struct page *page) { u32 checksum; - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); - kunmap_atomic(addr); + kunmap_local(addr); return checksum; } From 24d2613a6356f9c4a0b1b8e17f125562f6c8e11b Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 20 Nov 2023 15:24:05 +0100 Subject: [PATCH 0451/1562] mm/memory: use kmap_local_page() in __wp_page_copy_user() kmap_atomic() has been deprecated in favor of kmap_local_{folio,page}. Therefore, replace kmap_atomic() with kmap_local_page in __wp_page_copy_user(). kmap_atomic() disables preemption in !PREEMPT_RT kernels and unconditionally disables also page-faults. My limited knowledge of the implementation of __wp_page_copy_user() makes me think that the latter side effect is still needed here, but kmap_local_page() is implemented not to disable page-faults. So, in addition to the conversion to local mapping, add explicit pagefault_disable() / pagefault_enable() between mapping and un-mapping. Link: https://lkml.kernel.org/r/20231120142418.6977-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e27e2e5beb3f..a8ff3489211b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2841,7 +2841,8 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ - kaddr = kmap_atomic(dst); + kaddr = kmap_local_page(dst); + pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* @@ -2909,7 +2910,8 @@ warn: pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); - kunmap_atomic(kaddr); + pagefault_enable(); + kunmap_local(kaddr); flush_dcache_page(dst); return ret; From f2bcc99a5e901a13b754648d1dbab60f4adf9375 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 20 Nov 2023 15:26:31 +0100 Subject: [PATCH 0452/1562] mm/mempool: replace kmap_atomic() with kmap_local_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page(). kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. The code blocks between the mappings and un-mappings don't rely on the above-mentioned side effects of kmap_atomic(), so that mere replacements of the old API with the new one is all that they require (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231120142640.7077-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/mempool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 734bcf5afbb7..b3d2084fd989 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -64,10 +64,10 @@ static void check_element(mempool_t *pool, void *element) } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_atomic((struct page *)element); + void *addr = kmap_local_page((struct page *)element); __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); - kunmap_atomic(addr); + kunmap_local(addr); } } @@ -89,10 +89,10 @@ static void poison_element(mempool_t *pool, void *element) } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_atomic((struct page *)element); + void *addr = kmap_local_page((struct page *)element); __poison_element(addr, 1UL << (PAGE_SHIFT + order)); - kunmap_atomic(addr); + kunmap_local(addr); } } #else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ From f542b8e582abd93df092c4a2763679e380f14645 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 20 Nov 2023 15:28:23 +0100 Subject: [PATCH 0453/1562] mm/page_poison: replace kmap_atomic() with kmap_local_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page(). kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. The code blocks between the mappings and un-mappings do not rely on the above-mentioned side effects of kmap_atomic(), so that mere replacements of the old API with the new one is all that they require (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231120142836.7219-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/page_poison.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index b4f456437b7e..3e9037363cf9 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -21,13 +21,13 @@ early_param("page_poison", early_page_poison_param); static void poison_page(struct page *page) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); /* KASAN still think the page is in-use, so skip it. */ kasan_disable_current(); memset(kasan_reset_tag(addr), PAGE_POISON, PAGE_SIZE); kasan_enable_current(); - kunmap_atomic(addr); + kunmap_local(addr); } void __kernel_poison_pages(struct page *page, int n) @@ -77,7 +77,7 @@ static void unpoison_page(struct page *page) { void *addr; - addr = kmap_atomic(page); + addr = kmap_local_page(page); kasan_disable_current(); /* * Page poisoning when enabled poisons each and every page @@ -86,7 +86,7 @@ static void unpoison_page(struct page *page) */ check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); kasan_enable_current(); - kunmap_atomic(addr); + kunmap_local(addr); } void __kernel_unpoison_pages(struct page *page, int n) From a5989d4ed40cef0cadede2393c714a1ff9179f65 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Mon, 20 Nov 2023 04:46:29 +0800 Subject: [PATCH 0454/1562] kasan: improve free meta storage in Generic KASAN Currently free meta can only be stored in object if the object is not smaller than free meta. After the improvement, when the object is smaller than free meta and SLUB DEBUG is not enabled, it is possible to store part of the free meta in the object, reducing the increased size of the red zone. Example: free meta size: 16 bytes alloc meta size: 16 bytes object size: 8 bytes optimal redzone size (object_size <= 64): 16 bytes Before improvement: actual redzone size = alloc meta size + free meta size = 32 bytes After improvement: actual redzone size = alloc meta size + (free meta size - object size) = 24 bytes [juntong.deng@outlook.com: make kasan_metadata_size() adapt to the improved free meta storage] Link: https://lkml.kernel.org/r/VI1P193MB0752675D6E0A2D16CE656F8299BAA@VI1P193MB0752.EURP193.PROD.OUTLOOK.COM Link: https://lkml.kernel.org/r/VI1P193MB0752DE2CCD9046B5FED0AA8E99B5A@VI1P193MB0752.EURP193.PROD.OUTLOOK.COM Signed-off-by: Juntong Deng Suggested-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 50cc519e23f4..54e20b2bc3e1 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -362,6 +362,8 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, { unsigned int ok_size; unsigned int optimal_size; + unsigned int rem_free_meta_size; + unsigned int orig_alloc_meta_offset; if (!kasan_requires_meta()) return; @@ -395,6 +397,9 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, /* Continue, since free meta might still fit. */ } + ok_size = *size; + orig_alloc_meta_offset = cache->kasan_info.alloc_meta_offset; + /* * Add free meta into redzone when it's not possible to store * it in the object. This is the case when: @@ -402,23 +407,37 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, * be touched after it was freed, or * 2. Object has a constructor, which means it's expected to * retain its content until the next allocation, or - * 3. Object is too small. + * 3. Object is too small and SLUB DEBUG is enabled. Avoid + * free meta that exceeds the object size corrupts the + * SLUB DEBUG metadata. * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + * If the object is smaller than the free meta and SLUB DEBUG + * is not enabled, it is still possible to store part of the + * free meta in the object. */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; + } else if (cache->object_size < sizeof(struct kasan_free_meta)) { + if (__slub_debug_enabled()) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } else { + rem_free_meta_size = sizeof(struct kasan_free_meta) - + cache->object_size; + *size += rem_free_meta_size; + if (cache->kasan_info.alloc_meta_offset != 0) + cache->kasan_info.alloc_meta_offset += rem_free_meta_size; } } + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + cache->kasan_info.alloc_meta_offset = orig_alloc_meta_offset; + *size = ok_size; + } + /* Calculate size with optimal redzone. */ optimal_size = cache->object_size + optimal_redzone(cache->object_size); /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ From 50668b53f8c9cdb2f6a7f7e3ff0a5d0bd85cc932 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 19 Nov 2023 17:15:29 +0000 Subject: [PATCH 0455/1562] mm/damon/core-test: test damon_split_region_at()'s access rate copying damon_split_region_at() should set access rate related fields of the resulting regions same. It may forgotten, and actually there was the mistake before. Test it with the unit test case for the function. Link: https://lkml.kernel.org/r/20231119171529.66863-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 649adf91ebc5..e6a01ea2ec54 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -122,18 +122,25 @@ static void damon_test_split_at(struct kunit *test) { struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - struct damon_region *r; + struct damon_region *r, *r_new; t = damon_new_target(); r = damon_new_region(0, 100); + r->nr_accesses_bp = 420000; + r->nr_accesses = 42; + r->last_nr_accesses = 15; damon_add_region(r, t); damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); - r = damon_next_region(r); - KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); - KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + r_new = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r_new->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r_new->ar.end, 100ul); + + KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, r_new->nr_accesses_bp); + KUNIT_EXPECT_EQ(test, r->nr_accesses, r_new->nr_accesses); + KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses); damon_free_target(t); damon_destroy_ctx(c); From 38ca8a185389716e9f7566bce4bb0085f71da61d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:49 +0100 Subject: [PATCH 0456/1562] pgtable: fix s390 ptdesc field comments Patch series "minor ptdesc updates", v3. This patch (of 2): Since commit d08d4e7cd6bf ("s390/mm: use full 4KB page for 2KB PTE") there is no fragmented page tracking on s390. Fix the corresponding comments. Link: https://lkml.kernel.org/r/cover.1700594815.git.agordeev@linux.ibm.com Link: https://lkml.kernel.org/r/2eead241f3a45bed26c7911cf66bded1e35670b8.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Heiko Carstens Cc: Gerald Schaefer Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..fbec64036baa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a); * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. - * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. Used for s390 page tables. + * @_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good From f7dd74ac239aad5ef7575ea03c45fd7956e00285 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:50 +0100 Subject: [PATCH 0457/1562] pgtable: rename ptdesc _refcount field to __page_refcount Rename ptdesc _refcount field to __page_refcount similar to the other unused page fields. Link: https://lkml.kernel.org/r/982bdc652ba79a606c3d01c905766e7e076b3315.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Vishal Moola Cc: Gerald Schaefer Cc: Heiko Carstens Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fbec64036baa..ef18d2b25378 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. + * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good @@ -438,7 +438,7 @@ struct ptdesc { #endif }; unsigned int __page_type; - atomic_t _refcount; + atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif @@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); -TABLE_MATCH(_refcount, _refcount); +TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif From ecf5dd1ffe84ede52d8a6c9fb72d8aad04ff8160 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 22 Nov 2023 21:24:03 +0300 Subject: [PATCH 0458/1562] mm/mm_init.c: extend init unavailable range doc info Besides of the already described reasons the pages backended memory holes might be persistent due to having memory mapped IO spaces behind those ranges in the framework of flatmem kernel config. Add such note to the init_unavailable_range() method kdoc in order to point out to one more reason of having the function executed for such regions. [fancer.lancer@gmail.com: update per Mike] Link: https://lkml.kernel.org/r/20231202111855.18392-1-fancer.lancer@gmail.com Link: https://lkml.kernel.org/r/20231122182419.30633-6-fancer.lancer@gmail.com Signed-off-by: Serge Semin Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mm_init.c b/mm/mm_init.c index 077bfe393b5e..824bf53e8253 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -796,6 +796,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory + * - non-memory regions covered by the contigious flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * From 01846c6c70257efc0969c0394e496673040627ec Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 22 Nov 2023 21:24:04 +0300 Subject: [PATCH 0459/1562] mm/mm_init.c: append newline to the unavailable ranges log-message Based on the init_unavailable_range() method and it's callee semantics no multi-line info messages are intended to be printed to the console. Thus append the '\n' symbol to the respective info string. Link: https://lkml.kernel.org/r/20231122182419.30633-7-fancer.lancer@gmail.com Signed-off-by: Serge Semin Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 824bf53e8253..a5f91eba4f8d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -827,7 +827,7 @@ static void __init init_unavailable_range(unsigned long spfn, } if (pgcnt) - pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n", node, zone_names[zone], pgcnt); } From cddba0af0b7919e93134469f6fdf29a7d362768a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 24 Nov 2023 10:19:02 -0500 Subject: [PATCH 0460/1562] fs/Kconfig: make hugetlbfs a menuconfig Hugetlb vmemmap default option (HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON) is a sub-option to hugetlbfs, but it shows in the same level as hugetlbfs itself, under "Pesudo filesystems". Make the vmemmap option a sub-option to hugetlbfs, by changing hugetlbfs into a menuconfig. When moving it, fix a typo 'v' spot by Randy. Link: https://lkml.kernel.org/r/20231124151902.1075697-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Randy Dunlap Signed-off-by: Andrew Morton --- fs/Kconfig | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 42837617a55b..cf62d86b514f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -254,7 +254,7 @@ config TMPFS_QUOTA config ARCH_SUPPORTS_HUGETLBFS def_bool n -config HUGETLBFS +menuconfig HUGETLBFS bool "HugeTLB file system support" depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) @@ -266,6 +266,17 @@ config HUGETLBFS If unsure, say N. +if HUGETLBFS +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" + default n + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP + help + The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to + enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off + (boot command line) or hugetlb_optimize_vmemmap (sysctl). +endif # HUGETLBFS + config HUGETLB_PAGE def_bool HUGETLBFS select XARRAY_MULTI @@ -275,15 +286,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON - bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" - default n - depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP - help - The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to - enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off - (boot command line) or hugetlb_optimize_vmemmap (sysctl). - config ARCH_HAS_GIGANTIC_PAGE bool From d68e39fc45f70e35eb74df2128d315c1d91e4dc4 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 24 Nov 2023 16:35:52 +0530 Subject: [PATCH 0461/1562] mm: page_alloc: correct high atomic reserve calculations Patch series "mm: page_alloc: fixes for high atomic reserve caluculations", v3. The state of the system where the issue exposed shown in oom kill logs: [ 295.998653] Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB active_anon:4kB inactive_anon:0kB active_file:24kB inactive_file:24kB unevictable:1220kB writepending:0kB present:70732kB managed:49224kB mlocked:0kB bounce:0kB free_pcp:688kBlocal_pcp:492kB free_cma:0kB [ 295.998656] lowmem_reserve[]: 0 32 [ 295.998659] Normal: 508*4kB (UMEH) 241*8kB (UMEH) 143*16kB (UMEH) 33*32kB (UH) 7*64kB (UH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 7752kB From the above, it is seen that ~16MB of memory reserved for high atomic reserves against the expectation of 1% reserves which is fixed in the 1st patch. Don't reserve the high atomic page blocks if 1% of zone memory size is below a pageblock size. This patch (of 2): reserve_highatomic_pageblock() aims to reserve the 1% of the managed pages of a zone, which is used for the high order atomic allocations. It uses the below calculation to reserve: static void reserve_highatomic_pageblock(struct page *page, ....) { ....... max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) goto out; zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); out: .... } Since we are always appending the 1% of zone managed pages count to pageblock_nr_pages, the minimum it is turning into 2 pageblocks as the nr_reserved_highatomic is incremented/decremented in pageblock sizes. Encountered a system(actually a VM running on the Linux kernel) with the below zone configuration: Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB managed:49224kB The existing calculations making it to reserve the 8MB(with pageblock size of 4MB) i.e. 16% of the zone managed memory. Reserving such high amount of memory can easily exert memory pressure in the system thus may lead into unnecessary reclaims till unreserving of high atomic reserves. Since high atomic reserves are managed in pageblock size granules, as MIGRATE_HIGHATOMIC is set for such pageblock, fix the calculations for high atomic reserves as, minimum is pageblock size , maximum is approximately 1% of the zone managed pages. Link: https://lkml.kernel.org/r/cover.1700821416.git.quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1660034138397b82a0a8b6ae51cbe96bd583d89e.1700821416.git.quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Acked-by: Mel Gorman Acked-by: David Rientjes Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd5e8a759d27..2a272eb108a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1880,10 +1880,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) unsigned long max_managed, flags; /* - * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * The number reserved as: minimum is 1 pageblock, maximum is + * roughly 1% of a zone. * Check is race-prone but harmless. */ - max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); if (zone->nr_reserved_highatomic >= max_managed) return; From 9cd20f3fe045af95a8fe7a12328b21bfd2f3b8bf Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 24 Nov 2023 16:35:53 +0530 Subject: [PATCH 0462/1562] mm: page_alloc: enforce minimum zone size to do high atomic reserves Highatomic reserves are set to roughly 1% of zone for maximum and a pageblock size for minimum. Encountered a system with the below configuration: Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB managed:49224kB On such systems, even a single pageblock makes highatomic reserves are set to ~8% of the zone memory. This high value can easily exert pressure on the zone. Per discussion with Michal and Mel, it is not much useful to reserve the memory for highatomic allocations on such small systems[1]. Since the minimum size for high atomic reserves is always going to be a pageblock size and if 1% of zone managed pages is going to be below pageblock size, don't reserve memory for high atomic allocations. Thanks Michal for this suggestion[2]. Since no memory is being reserved for high atomic allocations and if respective allocation failures are seen, this patch can be reverted. [1] https://lore.kernel.org/linux-mm/20231117161956.d3yjdxhhm4rhl7h2@techsingularity.net/ [2] https://lore.kernel.org/linux-mm/ZVYRJMUitykepLRy@tiehlicka/ Link: https://lkml.kernel.org/r/c3a2a48e2cfe08176a80eaf01c110deb9e918055.1700821416.git.quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Acked-by: David Rientjes Cc: David Hildenbrand Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a272eb108a5..ef8b151edbd0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1881,9 +1881,12 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* * The number reserved as: minimum is 1 pageblock, maximum is - * roughly 1% of a zone. + * roughly 1% of a zone. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. * Check is race-prone but harmless. */ + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); if (zone->nr_reserved_highatomic >= max_managed) return; From ac3f3b0a55518056bc80ed32a41931c99e1f7d81 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 24 Nov 2023 16:27:25 +0530 Subject: [PATCH 0463/1562] mm: page_alloc: unreserve highatomic page blocks before oom __alloc_pages_direct_reclaim() is called from slowpath allocation where high atomic reserves can be unreserved after there is a progress in reclaim and yet no suitable page is found. Later should_reclaim_retry() gets called from slow path allocation to decide if the reclaim needs to be retried before OOM kill path is taken. should_reclaim_retry() checks the available(reclaimable + free pages) memory against the min wmark levels of a zone and returns: a) true, if it is above the min wmark so that slow path allocation will do the reclaim retries. b) false, thus slowpath allocation takes oom kill path. should_reclaim_retry() can also unreserves the high atomic reserves **but only after all the reclaim retries are exhausted.** In a case where there are almost none reclaimable memory and free pages contains mostly the high atomic reserves but allocation context can't use these high atomic reserves, makes the available memory below min wmark levels hence false is returned from should_reclaim_retry() leading the allocation request to take OOM kill path. This can turn into a early oom kill if high atomic reserves are holding lot of free memory and unreserving of them is not attempted. (early)OOM is encountered on a VM with the below state: [ 295.998653] Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB active_anon:4kB inactive_anon:0kB active_file:24kB inactive_file:24kB unevictable:1220kB writepending:0kB present:70732kB managed:49224kB mlocked:0kB bounce:0kB free_pcp:688kB local_pcp:492kB free_cma:0kB [ 295.998656] lowmem_reserve[]: 0 32 [ 295.998659] Normal: 508*4kB (UMEH) 241*8kB (UMEH) 143*16kB (UMEH) 33*32kB (UH) 7*64kB (UH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 7752kB Per above log, the free memory of ~7MB exist in the high atomic reserves is not freed up before falling back to oom kill path. Fix it by trying to unreserve the high atomic reserves in should_reclaim_retry() before __alloc_pages_direct_reclaim() can fallback to oom kill path. Link: https://lkml.kernel.org/r/1700823445-27531-1-git-send-email-quic_charante@quicinc.com Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") Signed-off-by: Charan Teja Kalla Reported-by: Chris Goldsworthy Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Chris Goldsworthy Cc: David Hildenbrand Cc: Johannes Weiner Cc: Mel Gorman Cc: Pavankumar Kondeti Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef8b151edbd0..7ea9c33320bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3951,14 +3951,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, else (*no_progress_loops)++; - /* - * Make sure we converge to OOM if we cannot make any progress - * several times in the row. - */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) { - /* Before OOM, exhaust highatomic_reserve */ - return unreserve_highatomic_pageblock(ac, true); - } + if (*no_progress_loops > MAX_RECLAIM_RETRIES) + goto out; + /* * Keep reclaiming pages while there is a chance this will lead @@ -4001,6 +3996,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, schedule_timeout_uninterruptible(1); else cond_resched(); +out: + /* Before OOM, exhaust highatomic_reserve */ + if (!ret) + return unreserve_highatomic_pageblock(ac, true); + return ret; } From e9119fb65761f124b31743b598ce04b8f15a6fe3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 23 Nov 2023 13:02:22 -0500 Subject: [PATCH 0464/1562] mm/gup: fix follow_devmap_p[mu]d() on page==NULL handling This is a bug found not by any report but only by code observations. When GUP sees a devpmd/devpud and if page==NULL is returned, it means a fault is probably required. Here falling through when page==NULL can cause unexpected behavior. Fix both cases by catching the page==NULL cases with no_page_table(). Link: https://lkml.kernel.org/r/20231123180222.1048297-1-peterx@redhat.com Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") Fixes: 080dbb618b4b ("mm/follow_page_mask: split follow_page_mask to smaller functions.") Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Dan Williams Cc: Mel Gorman Cc: Matthew Wilcox Cc: Aneesh Kumar K.V Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/gup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 231711efa390..0a5f0e91bfec 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -710,6 +710,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, spin_unlock(ptl); if (page) return page; + return no_page_table(vma, flags); } if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); @@ -758,6 +759,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, spin_unlock(ptl); if (page) return page; + return no_page_table(vma, flags); } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); From 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Nov 2023 19:23:17 +0200 Subject: [PATCH 0465/1562] mm: list_lru: Update kernel documentation to follow the requirements kernel-doc is not happy about documentation in list_lru.h: list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add' list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add' list_lru.h:90: warning: No description found for return value of 'list_lru_add' list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del' list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del' list_lru.h:103: warning: No description found for return value of 'list_lru_del' list_lru.h:116: warning: No description found for return value of 'list_lru_count_one' list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one' list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq' Fix the documentation accordingly. While at it, fix the references to the parameters in functions inside the long descriptions, on which the above script is not complaining (yet?). Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index b35968ee9fb5..db86ad78d428 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren /** * list_lru_add: add an element to the lru list's tail - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing @@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it - * to @list_lru + * to @lru. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be deleted. * - * This function works analogously as list_lru_add in terms of list + * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del. + * a list are also valid for list_lru_del(). * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); @@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * @nid: the node id to count from. * @memcg: the cgroup to count from. * - * Always return a non-negative number, 0 for empty lists. There is no - * guarantee that the list is not updated while the count is being computed. - * Callers that want such a guarantee need to provide an outer lock. + * There is no guarantee that the list is not updated while the count is being + * computed. Callers that want such a guarantee need to provide an outer lock. + * + * Return: 0 for empty lists, otherwise the number of objects + * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); @@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * This function will scan all elements in a particular list_lru, calling the + * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback - * will return an enum lru_status telling the list_lru infrastructure what to + * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * - * Please note that nr_to_walk does not mean how many objects will be freed, + * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * - * Return value: the number of objects effectively removed from the LRU. + * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** - * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * Same as @list_lru_walk_one except that the spinlock is acquired with + * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, From 27873192ac5938bfa9d27348d79b931e5b438ba6 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Thu, 23 Nov 2023 18:40:18 +0800 Subject: [PATCH 0466/1562] mm, oom:dump_tasks add rss detailed information printing When the system is under oom, it prints out the RSS information of each process. However, we don't know the size of rss_anon, rss_file, and rss_shmem. To distinguish the memory occupied by anonymous or file mappings or shmem, could help us identify the root cause of the oom. So this patch adds RSS details, which refers to the /proc//status[1]. It can help us know more about process memory usage. Example of oom including the new rss_* fields: [ 1630.902466] Tasks state (memory values in pages): [ 1630.902870] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name [ 1630.903619] [ 149] 0 149 486 288 0 288 0 36864 0 0 ash [ 1630.904210] [ 156] 0 156 153531 153345 153345 0 0 1269760 0 0 mm_test [1] commit 8cee852ec53f ("mm, procfs: breakdown RSS for anon, shmem and file in /proc/pid/status"). Link: https://lkml.kernel.org/r/202311231840181856667@zte.com.cn Signed-off-by: Yong Wang Reviewed-by: Yang Yang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Xuexin Jiang Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/oom_kill.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e6071fde34a..91ccd82097c2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -399,10 +399,11 @@ static int dump_task(struct task_struct *p, void *arg) return 0; } - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), + get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -423,7 +424,7 @@ static int dump_task(struct task_struct *p, void *arg) static void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name\n"); if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); From 003ae2fb0b36803112f9b66cce8041afa5d46a83 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 27 Nov 2023 16:55:21 +0100 Subject: [PATCH 0467/1562] mm/zswap: replace kmap_atomic() with kmap_local_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in zswap.c. kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In mm/zswap.c, the blocks of code between the mappings and un-mappings do not depend on the above-mentioned side effects of kmap_atomic(), so that the mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231127160058.586446-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Nhat Pham Acked-by: Chris Li (Google) Cc: Ira Weiny Cc: Seth Jennings Cc: Dan Streetman Signed-off-by: Andrew Morton --- mm/zswap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 74411dfdad92..699c6ee11222 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1267,16 +1267,16 @@ bool zswap_store(struct folio *folio) } if (zswap_same_filled_pages_enabled) { - src = kmap_atomic(page); + src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { - kunmap_atomic(src); + kunmap_local(src); entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); goto insert_entry; } - kunmap_atomic(src); + kunmap_local(src); } if (!zswap_non_same_filled_pages_enabled) @@ -1422,9 +1422,9 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (!entry->length) { - dst = kmap_atomic(page); + dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); - kunmap_atomic(dst); + kunmap_local(dst); ret = true; goto stats; } From 829c3151f0f89f98ed2d89f0e0fc367649c1cd34 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 27 Nov 2023 16:54:37 +0100 Subject: [PATCH 0468/1562] mm/swapfile: replace kmap_atomic() with kmap_local_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in swapfile.c. kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In mm/swapfile.c, the blocks of code between the mappings and un-mappings do not depend on the above-mentioned side effects of kmap_atomic(), so that the mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231127155452.586387-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. De Francesco Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/swapfile.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4bc70f459164..8be70912e298 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1495,9 +1495,9 @@ int swp_swapcount(swp_entry_t entry) do { page = list_next_entry(page, lru); - map = kmap_atomic(page); + map = kmap_local_page(page); tmp_count = map[offset]; - kunmap_atomic(map); + kunmap_local(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); @@ -3477,9 +3477,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; - map = kmap_atomic(list_page) + offset; + map = kmap_local_page(list_page) + offset; count = *map; - kunmap_atomic(map); + kunmap_local(map); /* * If this continuation count now has some space in it, @@ -3529,7 +3529,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ @@ -3539,27 +3539,27 @@ static bool swap_count_continued(struct swap_info_struct *si, * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_atomic(map); + kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; } if (*map == SWAP_CONT_MAX) { - kunmap_atomic(map); + kunmap_local(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; - kunmap_atomic(map); + kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; *map = COUNT_CONTINUED; - kunmap_atomic(map); + kunmap_local(map); } ret = true; /* incremented */ @@ -3569,21 +3569,21 @@ init_map: *map = 0; /* we didn't zero the page */ */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { - kunmap_atomic(map); + kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; - kunmap_atomic(map); + kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_atomic(page) + offset; + map = kmap_local_page(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; - kunmap_atomic(map); + kunmap_local(map); } ret = count == COUNT_CONTINUED; } From b123d09304d8676ba327b72a39a6d0b79b6f604c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 27 Nov 2023 16:46:42 +0800 Subject: [PATCH 0469/1562] mm: pagewalk: assert write mmap lock only for walking the user page tables The 8782fb61cc848 ("mm: pagewalk: Fix race between unmap and page walker") introduces an assertion to walk_page_range_novma() to make all the users of page table walker is safe. However, the race only exists for walking the user page tables. And it is ridiculous to hold a particular user mmap write lock against the changes of the kernel page tables. So only assert at least mmap read lock when walking the kernel page tables. And some users matching this case could downgrade to a mmap read lock to relief the contention of mmap lock of init_mm, it will be nicer in hugetlb (only holding mmap read lock) in the next patch. Link: https://lkml.kernel.org/r/20231127084645.27017-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/pagewalk.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b7d7e4fcfad7..f46c80b18ce4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -539,6 +539,11 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for * walking the kernel pages tables or page tables for firmware. + * + * Note: Be careful to walk the kernel pages tables, the caller may be need to + * take other effective approache (mmap lock may be insufficient) to prevent + * the intermediate kernel page tables belonging to the specified address range + * from being freed (e.g. memory hot-remove). */ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, @@ -556,7 +561,29 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; - mmap_assert_write_locked(walk.mm); + /* + * 1) For walking the user virtual address space: + * + * The mmap lock protects the page walker from changes to the page + * tables during the walk. However a read lock is insufficient to + * protect those areas which don't have a VMA as munmap() detaches + * the VMAs before downgrading to a read lock and actually tearing + * down PTEs/page tables. In which case, the mmap write lock should + * be hold. + * + * 2) For walking the kernel virtual address space: + * + * The kernel intermediate page tables usually do not be freed, so + * the mmap map read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + if (mm == &init_mm) + mmap_assert_locked(walk.mm); + else + mmap_assert_write_locked(walk.mm); return walk_pgd_range(start, end, &walk); } From fb93ed63345f67f676cd3569057e8e7c2b58aed7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 27 Nov 2023 16:46:43 +0800 Subject: [PATCH 0470/1562] mm: hugetlb_vmemmap: use walk_page_range_novma() to simplify the code It is unnecessary to implement a series of dedicated page table walking helpers since there is already a general one walk_page_range_novma(). So use it to simplify the code. Link: https://lkml.kernel.org/r/20231127084645.27017-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 156 ++++++++++++------------------------------- 1 file changed, 43 insertions(+), 113 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 87818ee7f01d..ef14356855d1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "hugetlb_vmemmap.h" @@ -45,21 +46,14 @@ struct vmemmap_remap_walk { unsigned long flags; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) +static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, + struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; - struct page *head; pte_t *pgtable; - spin_lock(&init_mm.page_table_lock); - head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; - spin_unlock(&init_mm.page_table_lock); - - if (!head) - return 0; - pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; @@ -88,7 +82,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); - if (flush) + if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); @@ -98,123 +92,59 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) return 0; } -static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) +static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) { - pte_t *pte = pte_offset_kernel(pmd, addr); + struct page *head; + struct vmemmap_remap_walk *vmemmap_walk = walk->private; + + /* Only splitting, not remapping the vmemmap pages. */ + if (!vmemmap_walk->remap_pte) + walk->action = ACTION_CONTINUE; + + spin_lock(&init_mm.page_table_lock); + head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; + spin_unlock(&init_mm.page_table_lock); + if (!head) + return 0; + + return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); +} + +static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* - * The reuse_page is found 'first' in table walk before we start - * remapping (which is calling @walk->remap_pte). + * The reuse_page is found 'first' in page table walking before + * starting remapping. */ - if (!walk->reuse_page) { - walk->reuse_page = pte_page(ptep_get(pte)); - /* - * Because the reuse address is part of the range that we are - * walking, skip the reuse address range. - */ - addr += PAGE_SIZE; - pte++; - walk->nr_walked++; - } - - for (; addr != end; addr += PAGE_SIZE, pte++) { - walk->remap_pte(pte, addr, walk); - walk->nr_walked++; - } -} - -static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - int ret; - - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, - !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)); - if (ret) - return ret; - - next = pmd_addr_end(addr, end); - - /* - * We are only splitting, not remapping the hugetlb vmemmap - * pages. - */ - if (!walk->remap_pte) - continue; - - vmemmap_pte_range(pmd, addr, next, walk); - } while (pmd++, addr = next, addr != end); + if (!vmemmap_walk->reuse_page) + vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); + else + vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); + vmemmap_walk->nr_walked++; return 0; } -static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(p4d, addr); - do { - int ret; - - next = pud_addr_end(addr, end); - ret = vmemmap_pmd_range(pud, addr, next, walk); - if (ret) - return ret; - } while (pud++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_offset(pgd, addr); - do { - int ret; - - next = p4d_addr_end(addr, end); - ret = vmemmap_pud_range(p4d, addr, next, walk); - if (ret) - return ret; - } while (p4d++, addr = next, addr != end); - - return 0; -} +static const struct mm_walk_ops vmemmap_remap_ops = { + .pmd_entry = vmemmap_pmd_entry, + .pte_entry = vmemmap_pte_entry, +}; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { - unsigned long addr = start; - unsigned long next; - pgd_t *pgd; + int ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_BUG_ON(!PAGE_ALIGNED(start | end)); - pgd = pgd_offset_k(addr); - do { - int ret; - - next = pgd_addr_end(addr, end); - ret = vmemmap_p4d_range(pgd, addr, next, walk); - if (ret) - return ret; - } while (pgd++, addr = next, addr != end); + ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, + NULL, walk); + if (ret) + return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); From be035a2acf1fa03caf77daff7d8a424f395cfb4c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 27 Nov 2023 16:46:44 +0800 Subject: [PATCH 0471/1562] mm: hugetlb_vmemmap: move PageVmemmapSelfHosted() check to split_vmemmap_huge_pmd() To check a page whether it is self-hosted needs to traverse the page table (e.g. pmd_off_k()), however, we already have done this in the next calling of vmemmap_remap_range(). Moving PageVmemmapSelfHosted() check to vmemmap_pmd_entry() could simplify the code a bit. Link: https://lkml.kernel.org/r/20231127084645.27017-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 70 +++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 46 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ef14356855d1..ce920ca6c90e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -95,6 +95,7 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { + int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; @@ -104,9 +105,30 @@ static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; + /* + * Due to HugeTLB alignment requirements and the vmemmap + * pages being at the start of the hotplugged memory + * region in memory_hotplug.memmap_on_memory case. Checking + * the vmemmap page associated with the first vmemmap page + * if it is self-hosted is sufficient. + * + * [ hotplugged memory ] + * [ section ][...][ section ] + * [ vmemmap ][ usable memory ] + * ^ | ^ | + * +--+ | | + * +------------------------+ + */ + if (unlikely(!vmemmap_walk->nr_walked)) { + struct page *page = head ? head + pte_index(addr) : + pte_page(ptep_get(pte_offset_kernel(pmd, addr))); + + if (PageVmemmapSelfHosted(page)) + ret = -ENOTSUPP; + } spin_unlock(&init_mm.page_table_lock); - if (!head) - return 0; + if (!head || ret) + return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } @@ -524,50 +546,6 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h if (!hugetlb_vmemmap_optimizable(h)) return false; - if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { - pmd_t *pmdp, pmd; - struct page *vmemmap_page; - unsigned long vaddr = (unsigned long)head; - - /* - * Only the vmemmap page's vmemmap page can be self-hosted. - * Walking the page tables to find the backing page of the - * vmemmap page. - */ - pmdp = pmd_off_k(vaddr); - /* - * The READ_ONCE() is used to stabilize *pmdp in a register or - * on the stack so that it will stop changing under the code. - * The only concurrent operation where it can be changed is - * split_vmemmap_huge_pmd() (*pmdp will be stable after this - * operation). - */ - pmd = READ_ONCE(*pmdp); - if (pmd_leaf(pmd)) - vmemmap_page = pmd_page(pmd) + pte_index(vaddr); - else - vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr)); - /* - * Due to HugeTLB alignment requirements and the vmemmap pages - * being at the start of the hotplugged memory region in - * memory_hotplug.memmap_on_memory case. Checking any vmemmap - * page's vmemmap page if it is marked as VmemmapSelfHosted is - * sufficient. - * - * [ hotplugged memory ] - * [ section ][...][ section ] - * [ vmemmap ][ usable memory ] - * ^ | | | - * +---+ | | - * ^ | | - * +-------+ | - * ^ | - * +-------------------------------------------+ - */ - if (PageVmemmapSelfHosted(vmemmap_page)) - return false; - } - return true; } From ebc20dcac4ce98f227f63cf8be0c9c1152d25cc9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 27 Nov 2023 16:46:45 +0800 Subject: [PATCH 0472/1562] mm: hugetlb_vmemmap: convert page to folio There are still some places where it does not be converted to folio, this patch convert all of them to folio. And this patch also does some trival cleanup to fix the code style problems. Link: https://lkml.kernel.org/r/20231127084645.27017-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 51 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ce920ca6c90e..54f388aa361f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -280,7 +280,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, - unsigned long reuse) + unsigned long reuse) { int ret; struct vmemmap_remap_walk walk = { @@ -447,14 +447,14 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) +static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, + struct folio *folio, unsigned long flags) { int ret; - struct page *head = &folio->page; - unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; - VM_WARN_ON_ONCE(!PageHuge(head)); + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; @@ -517,7 +517,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, - VMEMMAP_REMAP_NO_TLB_FLUSH); + VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; restored++; @@ -535,9 +535,9 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ -static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) +static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { - if (HPageVmemmapOptimized((struct page *)head)) + if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) @@ -550,17 +550,16 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, - struct folio *folio, - struct list_head *vmemmap_pages, - unsigned long flags) + struct folio *folio, + struct list_head *vmemmap_pages, + unsigned long flags) { int ret = 0; - struct page *head = &folio->page; - unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; - VM_WARN_ON_ONCE(!PageHuge(head)); - if (!vmemmap_should_optimize(h, head)) + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); + if (!vmemmap_should_optimize_folio(h, folio)) return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); @@ -588,7 +587,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, * the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, - vmemmap_pages, flags); + vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); @@ -615,12 +614,12 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) free_vmemmap_page_list(&vmemmap_pages); } -static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head) +static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { - unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; - if (!vmemmap_should_optimize(h, head)) + if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -640,7 +639,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l LIST_HEAD(vmemmap_pages); list_for_each_entry(folio, folio_list, lru) { - int ret = hugetlb_vmemmap_split(h, &folio->page); + int ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail @@ -655,9 +654,10 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { - int ret = __hugetlb_vmemmap_optimize_folio(h, folio, - &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + int ret; + + ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); /* * Pages to be freed may have been accumulated. If we @@ -671,9 +671,8 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, - &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); } } From 60433a9d038db006ca2f49e3c5f050dc46aaad3a Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Thu, 23 Nov 2023 10:19:43 +0300 Subject: [PATCH 0473/1562] samples: introduce new samples subdir for cgroup Patch series "samples: introduce cgroup events listeners", v3. To begin with, this patch series relocates the cgroup example code to the samples/cgroup directory, which is the appropriate location for such code snippets. Furthermore, a new memcg events listener is introduced. This listener is a simple yet effective tool for monitoring memory events and managing counter changes during runtime. Additionally, as per Andrew Morton's suggestion, a helpful reminder comment is included in the memcontrol implementation. This comment serves to ensure that the samples code is updated whenever new events are added. This patch (of 3): Move the cgroup_event_listener for cgroup v1 to the samples directory. This suggestion was proposed by Andrew Morton during the discussion [1]. Link: https://lore.kernel.org/all/20231106140934.3f5d4960141562fe8da53906@linux-foundation.org/ [1] Link: https://lkml.kernel.org/r/20231123071945.25811-1-ddrokosov@salutedevices.com Link: https://lkml.kernel.org/r/20231123071945.25811-2-ddrokosov@salutedevices.com Signed-off-by: Dmitry Rokosov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + samples/Kconfig | 6 ++++++ samples/Makefile | 1 + samples/cgroup/Makefile | 5 +++++ {tools => samples}/cgroup/cgroup_event_listener.c | 0 tools/cgroup/Makefile | 11 ----------- 6 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 samples/cgroup/Makefile rename {tools => samples}/cgroup/cgroup_event_listener.c (100%) delete mode 100644 tools/cgroup/Makefile diff --git a/MAINTAINERS b/MAINTAINERS index 5c9d3d854671..6f5d6962d26d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5339,6 +5339,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +F: samples/cgroup/* F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/samples/Kconfig b/samples/Kconfig index b0ddf5f36738..b288d9991d27 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -285,6 +285,12 @@ config SAMPLE_KMEMLEAK Build a sample program which have explicitly leaks memory to test kmemleak +config SAMPLE_CGROUP + bool "Build cgroup sample code" + depends on CGROUPS && CC_CAN_LINK && HEADERS_INSTALL + help + Build samples that demonstrate the usage of the cgroup API. + source "samples/rust/Kconfig" endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index 0a551c2b33f4..b85fa64390c5 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -3,6 +3,7 @@ subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs +subdir-$(CONFIG_SAMPLE_CGROUP) += cgroup obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ diff --git a/samples/cgroup/Makefile b/samples/cgroup/Makefile new file mode 100644 index 000000000000..deef4530f5e7 --- /dev/null +++ b/samples/cgroup/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +userprogs-always-y += cgroup_event_listener + +userccflags += -I usr/include diff --git a/tools/cgroup/cgroup_event_listener.c b/samples/cgroup/cgroup_event_listener.c similarity index 100% rename from tools/cgroup/cgroup_event_listener.c rename to samples/cgroup/cgroup_event_listener.c diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile deleted file mode 100644 index ffca068e4a76..000000000000 --- a/tools/cgroup/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for cgroup tools - -CFLAGS = -Wall -Wextra - -all: cgroup_event_listener -%: %.c - $(CC) $(CFLAGS) -o $@ $^ - -clean: - $(RM) cgroup_event_listener From becf6529603589c9cb8cfda398a585c0c2e01aae Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Thu, 23 Nov 2023 10:19:44 +0300 Subject: [PATCH 0474/1562] samples/cgroup: introduce memcg memory.events listener This is a simple listener for memory events that handles counter changes in runtime. It can be set up for a specific memory cgroup v2. The output example: ===== $ /tmp/memcg_event_listener test Initialized MEMCG events with counters: MEMCG events: low: 0 high: 0 max: 0 oom: 0 oom_kill: 0 oom_group_kill: 0 Started monitoring memory events from '/sys/fs/cgroup/test/memory.events'... Received event in /sys/fs/cgroup/test/memory.events: *** 1 MEMCG oom_kill event, change counter 0 => 1 Received event in /sys/fs/cgroup/test/memory.events: *** 1 MEMCG oom_kill event, change counter 1 => 2 Received event in /sys/fs/cgroup/test/memory.events: *** 1 MEMCG oom_kill event, change counter 2 => 3 Received event in /sys/fs/cgroup/test/memory.events: *** 1 MEMCG oom_kill event, change counter 3 => 4 Received event in /sys/fs/cgroup/test/memory.events: *** 2 MEMCG max events, change counter 0 => 2 Received event in /sys/fs/cgroup/test/memory.events: *** 8 MEMCG max events, change counter 2 => 10 *** 1 MEMCG oom event, change counter 0 => 1 Received event in /sys/fs/cgroup/test/memory.events: *** 1 MEMCG oom_kill event, change counter 4 => 5 ^CExiting memcg event listener... ===== Link: https://lkml.kernel.org/r/20231123071945.25811-3-ddrokosov@salutedevices.com Signed-off-by: Dmitry Rokosov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- samples/cgroup/Makefile | 2 +- samples/cgroup/memcg_event_listener.c | 330 ++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 1 deletion(-) create mode 100644 samples/cgroup/memcg_event_listener.c diff --git a/samples/cgroup/Makefile b/samples/cgroup/Makefile index deef4530f5e7..526c8569707c 100644 --- a/samples/cgroup/Makefile +++ b/samples/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -userprogs-always-y += cgroup_event_listener +userprogs-always-y += cgroup_event_listener memcg_event_listener userccflags += -I usr/include diff --git a/samples/cgroup/memcg_event_listener.c b/samples/cgroup/memcg_event_listener.c new file mode 100644 index 000000000000..a1667fe2489a --- /dev/null +++ b/samples/cgroup/memcg_event_listener.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * memcg_event_listener.c - Simple listener of memcg memory.events + * + * Copyright (c) 2023, SaluteDevices. All Rights Reserved. + * + * Author: Dmitry Rokosov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMCG_EVENTS "memory.events" + +/* Size of buffer to use when reading inotify events */ +#define INOTIFY_BUFFER_SIZE 8192 + +#define INOTIFY_EVENT_NEXT(event, length) ({ \ + (length) -= sizeof(*(event)) + (event)->len; \ + (event)++; \ +}) + +#define INOTIFY_EVENT_OK(event, length) ((length) >= (ssize_t)sizeof(*(event))) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +struct memcg_counters { + long low; + long high; + long max; + long oom; + long oom_kill; + long oom_group_kill; +}; + +struct memcg_events { + struct memcg_counters counters; + char path[PATH_MAX]; + int inotify_fd; + int inotify_wd; +}; + +static void print_memcg_counters(const struct memcg_counters *counters) +{ + printf("MEMCG events:\n"); + printf("\tlow: %ld\n", counters->low); + printf("\thigh: %ld\n", counters->high); + printf("\tmax: %ld\n", counters->max); + printf("\toom: %ld\n", counters->oom); + printf("\toom_kill: %ld\n", counters->oom_kill); + printf("\toom_group_kill: %ld\n", counters->oom_group_kill); +} + +static int get_memcg_counter(char *line, const char *name, long *counter) +{ + size_t len = strlen(name); + char *endptr; + long tmp; + + if (memcmp(line, name, len)) { + warnx("Counter line %s has wrong name, %s is expected", + line, name); + return -EINVAL; + } + + /* skip the whitespace delimiter */ + len += 1; + + errno = 0; + tmp = strtol(&line[len], &endptr, 10); + if (((tmp == LONG_MAX || tmp == LONG_MIN) && errno == ERANGE) || + (errno && !tmp)) { + warnx("Failed to parse: %s", &line[len]); + return -ERANGE; + } + + if (endptr == &line[len]) { + warnx("Not digits were found in line %s", &line[len]); + return -EINVAL; + } + + if (!(*endptr == '\0' || (*endptr == '\n' && *++endptr == '\0'))) { + warnx("Further characters after number: %s", endptr); + return -EINVAL; + } + + *counter = tmp; + + return 0; +} + +static int read_memcg_events(struct memcg_events *events, bool show_diff) +{ + FILE *fp = fopen(events->path, "re"); + size_t i; + int ret = 0; + bool any_new_events = false; + char *line = NULL; + size_t len = 0; + struct memcg_counters new_counters; + struct memcg_counters *counters = &events->counters; + struct { + const char *name; + long *new; + long *old; + } map[] = { + { + .name = "low", + .new = &new_counters.low, + .old = &counters->low, + }, + { + .name = "high", + .new = &new_counters.high, + .old = &counters->high, + }, + { + .name = "max", + .new = &new_counters.max, + .old = &counters->max, + }, + { + .name = "oom", + .new = &new_counters.oom, + .old = &counters->oom, + }, + { + .name = "oom_kill", + .new = &new_counters.oom_kill, + .old = &counters->oom_kill, + }, + { + .name = "oom_group_kill", + .new = &new_counters.oom_group_kill, + .old = &counters->oom_group_kill, + }, + }; + + if (!fp) { + warn("Failed to open memcg events file %s", events->path); + return -EBADF; + } + + /* Read new values for memcg counters */ + for (i = 0; i < ARRAY_SIZE(map); ++i) { + ssize_t nread; + + errno = 0; + nread = getline(&line, &len, fp); + if (nread == -1) { + if (errno) { + warn("Failed to read line for counter %s", + map[i].name); + ret = -EIO; + goto exit; + } + + break; + } + + ret = get_memcg_counter(line, map[i].name, map[i].new); + if (ret) { + warnx("Failed to get counter value from line %s", line); + goto exit; + } + } + + for (i = 0; i < ARRAY_SIZE(map); ++i) { + long diff; + + if (*map[i].new > *map[i].old) { + diff = *map[i].new - *map[i].old; + + if (show_diff) + printf("*** %ld MEMCG %s event%s, " + "change counter %ld => %ld\n", + diff, map[i].name, + (diff == 1) ? "" : "s", + *map[i].old, *map[i].new); + + *map[i].old += diff; + any_new_events = true; + } + } + + if (show_diff && !any_new_events) + printf("*** No new untracked memcg events available\n"); + +exit: + free(line); + fclose(fp); + + return ret; +} + +static void process_memcg_events(struct memcg_events *events, + struct inotify_event *event) +{ + int ret; + + if (events->inotify_wd != event->wd) { + warnx("Unknown inotify event %d, should be %d", event->wd, + events->inotify_wd); + return; + } + + printf("Received event in %s:\n", events->path); + + if (!(event->mask & IN_MODIFY)) { + warnx("No IN_MODIFY event, skip it"); + return; + } + + ret = read_memcg_events(events, /* show_diff = */true); + if (ret) + warnx("Can't read memcg events"); +} + +static void monitor_events(struct memcg_events *events) +{ + struct pollfd fds[1]; + int ret; + + printf("Started monitoring memory events from '%s'...\n", events->path); + + fds[0].fd = events->inotify_fd; + fds[0].events = POLLIN; + + for (;;) { + ret = poll(fds, ARRAY_SIZE(fds), -1); + if (ret < 0 && errno != EAGAIN) + err(EXIT_FAILURE, "Can't poll memcg events (%d)", ret); + + if (fds[0].revents & POLLERR) + err(EXIT_FAILURE, "Got POLLERR during monitor events"); + + if (fds[0].revents & POLLIN) { + struct inotify_event *event; + char buffer[INOTIFY_BUFFER_SIZE]; + ssize_t length; + + length = read(fds[0].fd, buffer, INOTIFY_BUFFER_SIZE); + if (length <= 0) + continue; + + event = (struct inotify_event *)buffer; + while (INOTIFY_EVENT_OK(event, length)) { + process_memcg_events(events, event); + event = INOTIFY_EVENT_NEXT(event, length); + } + } + } +} + +static int initialize_memcg_events(struct memcg_events *events, + const char *cgroup) +{ + int ret; + + memset(events, 0, sizeof(struct memcg_events)); + + ret = snprintf(events->path, PATH_MAX, + "/sys/fs/cgroup/%s/memory.events", cgroup); + if (ret >= PATH_MAX) { + warnx("Path to cgroup memory.events is too long"); + return -EMSGSIZE; + } else if (ret < 0) { + warn("Can't generate cgroup event full name"); + return ret; + } + + ret = read_memcg_events(events, /* show_diff = */false); + if (ret) { + warnx("Failed to read initial memcg events state (%d)", ret); + return ret; + } + + events->inotify_fd = inotify_init(); + if (events->inotify_fd < 0) { + warn("Failed to setup new inotify device"); + return -EMFILE; + } + + events->inotify_wd = inotify_add_watch(events->inotify_fd, + events->path, IN_MODIFY); + if (events->inotify_wd < 0) { + warn("Couldn't add monitor in dir %s", events->path); + return -EIO; + } + + printf("Initialized MEMCG events with counters:\n"); + print_memcg_counters(&events->counters); + + return 0; +} + +static void cleanup_memcg_events(struct memcg_events *events) +{ + inotify_rm_watch(events->inotify_fd, events->inotify_wd); + close(events->inotify_fd); +} + +int main(int argc, const char **argv) +{ + struct memcg_events events; + ssize_t ret; + + if (argc != 2) + errx(EXIT_FAILURE, "Usage: %s ", argv[0]); + + ret = initialize_memcg_events(&events, argv[1]); + if (ret) + errx(EXIT_FAILURE, "Can't initialize memcg events (%zd)", ret); + + monitor_events(&events); + + cleanup_memcg_events(&events); + + printf("Exiting memcg event listener...\n"); + + return EXIT_SUCCESS; +} From 664dc2189dd4327dd94959f5167a263276af6b76 Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Thu, 23 Nov 2023 10:19:45 +0300 Subject: [PATCH 0475/1562] mm: memcg: add reminder comment for the memcg v2 events To maintain the correct state, it is important to ensure that events for the memory cgroup v2 are aligned with the sample cgroup codes. Link: https://lkml.kernel.org/r/20231123071945.25811-4-ddrokosov@salutedevices.com Signed-off-by: Dmitry Rokosov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b226090fd906..592572d4842e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6783,6 +6783,10 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +/* + * Note: don't forget to update the 'samples/cgroup/memcg_event_listener' + * if any new events become available. + */ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) { seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); From 73829b7134708de2ee64f8d2027cef32d27faeb3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 28 Nov 2023 17:22:07 +0900 Subject: [PATCH 0476/1562] zram: use kmap_local_page() Use kmap_local_page() instead of kmap_atomic() which has been deprecated. Link: https://lkml.kernel.org/r/20231128083845.848008-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f6b286e7f310..2b1d82473be8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1321,9 +1321,9 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, void *mem; value = handle ? zram_get_element(zram, index) : 0; - mem = kmap_atomic(page); + mem = kmap_local_page(page); zram_fill_page(mem, PAGE_SIZE, value); - kunmap_atomic(mem); + kunmap_local(mem); return 0; } @@ -1336,14 +1336,14 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { - dst = kmap_atomic(page); + dst = kmap_local_page(page); memcpy(dst, src, PAGE_SIZE); - kunmap_atomic(dst); + kunmap_local(dst); ret = 0; } else { - dst = kmap_atomic(page); + dst = kmap_local_page(page); ret = zcomp_decompress(zstrm, src, size, dst); - kunmap_atomic(dst); + kunmap_local(dst); zcomp_stream_put(zram->comps[prio]); } zs_unmap_object(zram->mem_pool, handle); @@ -1416,21 +1416,21 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) unsigned long element = 0; enum zram_pageflags flags = 0; - mem = kmap_atomic(page); + mem = kmap_local_page(page); if (page_same_filled(mem, &element)) { - kunmap_atomic(mem); + kunmap_local(mem); /* Free memory associated with this sector now. */ flags = ZRAM_SAME; atomic64_inc(&zram->stats.same_pages); goto out; } - kunmap_atomic(mem); + kunmap_local(mem); compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = kmap_atomic(page); + src = kmap_local_page(page); ret = zcomp_compress(zstrm, src, &comp_len); - kunmap_atomic(src); + kunmap_local(src); if (unlikely(ret)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); @@ -1494,10 +1494,10 @@ compress_again: src = zstrm->buffer; if (comp_len == PAGE_SIZE) - src = kmap_atomic(page); + src = kmap_local_page(page); memcpy(dst, src, comp_len); if (comp_len == PAGE_SIZE) - kunmap_atomic(src); + kunmap_local(src); zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_unmap_object(zram->mem_pool, handle); @@ -1614,9 +1614,9 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, num_recomps++; zstrm = zcomp_stream_get(zram->comps[prio]); - src = kmap_atomic(page); + src = kmap_local_page(page); ret = zcomp_compress(zstrm, src, &comp_len_new); - kunmap_atomic(src); + kunmap_local(src); if (ret) { zcomp_stream_put(zram->comps[prio]); From 5d4c6ac94694096cdfb528f05a3019a1a423b3a4 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Mon, 27 Nov 2023 21:17:31 +0000 Subject: [PATCH 0477/1562] kasan: record and report more information Record and report more information to help us find the cause of the bug and to help us correlate the error with other system events. This patch adds recording and showing CPU number and timestamp at allocation and free (controlled by CONFIG_KASAN_EXTRA_INFO). The timestamps in the report use the same format and source as printk. Error occurrence timestamp is already implicit in the printk log, and CPU number is already shown by dump_stack_lvl, so there is no need to add it. In order to record CPU number and timestamp at allocation and free, corresponding members need to be added to the relevant data structures, which will lead to increased memory consumption. In Generic KASAN, members are added to struct kasan_track. Since in most cases, alloc meta is stored in the redzone and free meta is stored in the object or the redzone, memory consumption will not increase much. In SW_TAGS KASAN and HW_TAGS KASAN, members are added to struct kasan_stack_ring_entry. Memory consumption increases as the size of struct kasan_stack_ring_entry increases (this part of the memory is allocated by memblock), but since this is configurable, it is up to the user to choose. Link: https://lkml.kernel.org/r/VI1P193MB0752BD991325D10E4AB1913599BDA@VI1P193MB0752.EURP193.PROD.OUTLOOK.COM Signed-off-by: Juntong Deng Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 21 +++++++++++++++++++++ mm/kasan/common.c | 8 ++++++++ mm/kasan/kasan.h | 8 ++++++++ mm/kasan/report.c | 12 ++++++++++++ mm/kasan/report_tags.c | 15 +++++++++++++++ mm/kasan/tags.c | 15 +++++++++++++++ 6 files changed, 79 insertions(+) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 935eda08b1e1..8653f5c38be7 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -207,4 +207,25 @@ config KASAN_MODULE_TEST A part of the KASAN test suite that is not integrated with KUnit. Incompatible with Hardware Tag-Based KASAN. +config KASAN_EXTRA_INFO + bool "Record and report more information" + depends on KASAN + help + Record and report more information to help us find the cause of the + bug and to help us correlate the error with other system events. + + Currently, the CPU number and timestamp are additionally + recorded for each heap block at allocation and free time, and + 8 bytes will be added to each metadata structure that records + allocation or free information. + + In Generic KASAN, each kmalloc-8 and kmalloc-16 object will add + 16 bytes of additional memory consumption, and each kmalloc-32 + object will add 8 bytes of additional memory consumption, not + affecting other larger objects. + + In SW_TAGS KASAN and HW_TAGS KASAN, depending on the stack_ring_size + boot parameter, it will add 8 * stack_ring_size bytes of additional + memory consumption. + endif # KASAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b5d8bd26fced..fe6c4b43ad9f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags) void kasan_set_track(struct kasan_track *track, gfp_t flags) { +#ifdef CONFIG_KASAN_EXTRA_INFO + u32 cpu = raw_smp_processor_id(); + u64 ts_nsec = local_clock(); + + track->cpu = cpu; + track->timestamp = ts_nsec >> 3; +#endif /* CONFIG_KASAN_EXTRA_INFO */ track->pid = current->pid; track->stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b29d46b83d1f..5e298e3ac909 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -187,6 +187,10 @@ static inline bool kasan_requires_meta(void) struct kasan_track { u32 pid; depot_stack_handle_t stack; +#ifdef CONFIG_KASAN_EXTRA_INFO + u64 cpu:20; + u64 timestamp:44; +#endif /* CONFIG_KASAN_EXTRA_INFO */ }; enum kasan_report_type { @@ -278,6 +282,10 @@ struct kasan_stack_ring_entry { u32 pid; depot_stack_handle_t stack; bool is_free; +#ifdef CONFIG_KASAN_EXTRA_INFO + u64 cpu:20; + u64 timestamp:44; +#endif /* CONFIG_KASAN_EXTRA_INFO */ }; struct kasan_stack_ring { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e77facb62900..a938237f6882 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -262,7 +262,19 @@ static void print_error_description(struct kasan_report_info *info) static void print_track(struct kasan_track *track, const char *prefix) { +#ifdef CONFIG_KASAN_EXTRA_INFO + u64 ts_nsec = track->timestamp; + unsigned long rem_usec; + + ts_nsec <<= 3; + rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000; + + pr_err("%s by task %u on cpu %d at %lu.%06lus:\n", + prefix, track->pid, track->cpu, + (unsigned long)ts_nsec, rem_usec); +#else pr_err("%s by task %u:\n", prefix, track->pid); +#endif /* CONFIG_KASAN_EXTRA_INFO */ if (track->stack) stack_depot_print(track->stack); else diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 55154743f915..979f284c2497 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -27,6 +27,15 @@ static const char *get_common_bug_type(struct kasan_report_info *info) return "invalid-access"; } +#ifdef CONFIG_KASAN_EXTRA_INFO +static void kasan_complete_extra_report_info(struct kasan_track *track, + struct kasan_stack_ring_entry *entry) +{ + track->cpu = entry->cpu; + track->timestamp = entry->timestamp; +} +#endif /* CONFIG_KASAN_EXTRA_INFO */ + void kasan_complete_mode_report_info(struct kasan_report_info *info) { unsigned long flags; @@ -73,6 +82,9 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->free_track.pid = entry->pid; info->free_track.stack = entry->stack; +#ifdef CONFIG_KASAN_EXTRA_INFO + kasan_complete_extra_report_info(&info->free_track, entry); +#endif /* CONFIG_KASAN_EXTRA_INFO */ free_found = true; /* @@ -88,6 +100,9 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->alloc_track.pid = entry->pid; info->alloc_track.stack = entry->stack; +#ifdef CONFIG_KASAN_EXTRA_INFO + kasan_complete_extra_report_info(&info->alloc_track, entry); +#endif /* CONFIG_KASAN_EXTRA_INFO */ alloc_found = true; /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 739ae997463d..c13b198b8302 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -93,6 +94,17 @@ void __init kasan_init_tags(void) } } +#ifdef CONFIG_KASAN_EXTRA_INFO +static void save_extra_info(struct kasan_stack_ring_entry *entry) +{ + u32 cpu = raw_smp_processor_id(); + u64 ts_nsec = local_clock(); + + entry->cpu = cpu; + entry->timestamp = ts_nsec >> 3; +} +#endif /* CONFIG_KASAN_EXTRA_INFO */ + static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) { @@ -128,6 +140,9 @@ next: entry->pid = current->pid; entry->stack = stack; entry->is_free = is_free; +#ifdef CONFIG_KASAN_EXTRA_INFO + save_extra_info(entry); +#endif /* CONFIG_KASAN_EXTRA_INFO */ entry->ptr = object; From a9a1d6ad668f2ea0b5a77d0c4c7a41446d0801a8 Mon Sep 17 00:00:00 2001 From: Dongmin Lee Date: Sat, 4 Nov 2023 20:33:20 +0900 Subject: [PATCH 0478/1562] kernel/reboot: explicitly notify if halt occurred instead of power off When kernel_can_power_off() returns false, and reboot has called with LINUX_REBOOT_CMD_POWER_OFF, kernel_halt() will be initiated instead of actual power off function. However, in this situation, Kernel never explicitly notifies user that system halted instead of requested power off. Since halt and power off perform different behavior, and user initiated reboot call with power off command, not halt, This could be unintended behavior to user, like this: ~ # poweroff -f [ 3.581482] reboot: System halted Therefore, this explicitly notifies user that poweroff is not available, and halting has been occured as an alternative behavior instead: ~ # poweroff -f [ 4.123668] reboot: Power off not available: System halted instead [akpm@linux-foundation.org: tweak comment text] Link: https://lkml.kernel.org/r/20231104113320.72440-1-ldmldm05@gmail.com Signed-off-by: Dongmin Lee Signed-off-by: Andrew Morton --- kernel/reboot.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/reboot.c b/kernel/reboot.c index 395a0ea3c7a8..c3a3b82c4f64 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -58,6 +58,14 @@ struct sys_off_handler { struct device *dev; }; +/* + * This variable is used to indicate if a halt was initiated instead of a + * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but + * the system cannot be powered off. This allowes kernel_halt() to notify users + * of that. + */ +static bool poweroff_fallback_to_halt; + /* * Temporary stub that prevents linkage failure while we're in process * of removing all uses of legacy pm_power_off() around the kernel. @@ -297,7 +305,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - pr_emerg("System halted\n"); + if (poweroff_fallback_to_halt) + pr_emerg("Power off not available: System halted instead\n"); + else + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } @@ -732,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) { + poweroff_fallback_to_halt = true; cmd = LINUX_REBOOT_CMD_HALT; + } mutex_lock(&system_transition_mutex); switch (cmd) { From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2023 16:57:10 +0100 Subject: [PATCH 0479/1562] introduce for_other_threads(p, t) Cosmetic, but imho it makes the usage look more clear and simple, the new helper doesn't require to initialize "t". After this change while_each_thread() has only 3 users, and it is only used in the do/while loops. Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Christian Brauner Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- fs/exec.c | 3 +-- include/linux/sched/signal.h | 3 +++ kernel/signal.c | 11 ++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 4aa19b24f281..ee43597cb453 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ - t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - while_each_thread(p, t) { + for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..41d6759d6a4a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) diff --git a/kernel/signal.c b/kernel/signal.c index 47a7602dfe8d..5aa216e841a2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1376,12 +1376,12 @@ int force_sig_info(struct kernel_siginfo *info) */ int zap_other_threads(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; - while_each_thread(p, t) { + for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); /* Don't require de_thread to wait for the vhost_worker */ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) @@ -2465,12 +2465,10 @@ static bool do_signal_stop(int signr) sig->group_exit_code = signr; sig->group_stop_count = 0; - if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - t = current; - while_each_thread(current, t) { + for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -2966,8 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) if (sigisemptyset(&retarget)) return; - t = tsk; - while_each_thread(tsk, t) { + for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; From 44e3876d268be49ee810481ee3c95d8d650bf22e Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Tue, 7 Nov 2023 07:44:16 +0900 Subject: [PATCH 0480/1562] fs/nilfs2: use standard array-copy-function ioctl.c utilizes memdup_user() to copy a userspace array. An overflow check is performed manually before the function's invocation. The new function memdup_array_user() standardizes copying userspace arrays, thus, improving readability by making it more clear that an array is being copied. Additionally, it also performs an overflow check. Remove the (now redundant) manual overflow-check and replace memdup_user() with memdup_array_user(). In addition, improve the grammar of the comment above memdup_array_user(). Link: https://lkml.kernel.org/r/20231106224416.3055-1-konishi.ryusuke@gmail.com Signed-off-by: Philipp Stanner Link: https://lkml.kernel.org/r/20231103184831.99406-2-pstanner@redhat.com Signed-off-by: Ryusuke Konishi Suggested-by: Dave Airlie Signed-off-by: Andrew Morton --- fs/nilfs2/ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 40ffade49f38..cfb6aca5ec38 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -872,16 +872,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, nsegs = argv[4].v_nmembs; if (argv[4].v_size != argsz[4]) goto out; - if (nsegs > UINT_MAX / sizeof(__u64)) - goto out; /* * argv[4] points to segment numbers this ioctl cleans. We - * use kmalloc() for its buffer because memory used for the - * segment numbers is enough small. + * use kmalloc() for its buffer because the memory used for the + * segment numbers is small enough. */ - kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, - nsegs * sizeof(__u64)); + kbufs[4] = memdup_array_user((void __user *)(unsigned long)argv[4].v_base, + nsegs, sizeof(__u64)); if (IS_ERR(kbufs[4])) { ret = PTR_ERR(kbufs[4]); goto out; From 12427de9439d68b8e96ba6f50b601ef15f437612 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 13 Nov 2023 16:09:01 +0000 Subject: [PATCH 0481/1562] Squashfs: fix variable overflow triggered by sysbot Sysbot reports a slab out of bounds write in squashfs_readahead(). This is ultimately caused by a file reporting an (infeasibly) large file size (1407374883553280 bytes) with the minimum block size of 4K. This causes variable overflow. Link: https://lkml.kernel.org/r/20231113160901.6444-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: syzbot+604424eb051c2f696163@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000b1fda20609ede0d1@google.com/ Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 3 ++- fs/squashfs/file_direct.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ba8c4c50770..e8df6430444b 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -544,7 +544,8 @@ static void squashfs_readahead(struct readahead_control *ractl) struct squashfs_page_actor *actor; unsigned int nr_pages = 0; struct page **pages; - int i, file_end = i_size_read(inode) >> msblk->block_log; + int i; + loff_t file_end = i_size_read(inode) >> msblk->block_log; unsigned int max_pages = 1UL << shift; readahead_expand(ractl, start, (len | mask) + 1); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index f1ccad519e28..763a3f7a75f6 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -26,10 +26,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; + loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; - int end_index = start_index | mask; + loff_t start_index = target_page->index & ~mask; + loff_t end_index = start_index | mask; int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; From 8f46eaf6fd8454b0621b4ce07df50b2aa471c880 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:17 +0900 Subject: [PATCH 0482/1562] nilfs2: add nilfs_end_folio_io() Patch series "nilfs2: Folio conversions for file paths". This series advances page->folio conversions for a wide range of nilfs2, including its file operations, block routines, and the log writer's writeback routines. It doesn't cover large folios support, but it saves a lot of hidden compound_head() calls while preserving the existing support range behavior. The original series in post [1] also covered directory-related page->folio conversions, but that was put on hold because a regression was found in testing, so this is an excerpt from the first half of the original post. [1] https://lkml.kernel.org/r/20231106173903.1734114-1-willy@infradead.org I tested this series in both 32-bit and 64-bit environments, switching between normal and small block sizes. I also reviewed all changes in all patches to ensure they do not break existing behavior. There were no problems. This patch (of 20): This is the folio counterpart of the existing nilfs_end_page_io() which is retained as a wrapper of nilfs_end_folio_io(). Replaces nine hidden calls to compound_head() with one. Link: https://lkml.kernel.org/r/20231114084436.2755-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20231114084436.2755-2-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 55e31cc903d1..1df03d0895be 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1739,17 +1739,18 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci, return ret; } -static void nilfs_end_page_io(struct page *page, int err) +static void nilfs_end_folio_io(struct folio *folio, int err) { - if (!page) + if (!folio) return; - if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + if (buffer_nilfs_node(folio_buffers(folio)) && + !folio_test_writeback(folio)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { /* * For pages holding split b-tree node buffers, dirty * flag on the buffers may be cleared discretely. @@ -1757,24 +1758,31 @@ static void nilfs_end_page_io(struct page *page, int err) * remaining buffers, and it must be cancelled if * all the buffers get cleaned later. */ - lock_page(page); - if (nilfs_page_buffers_clean(page)) - __nilfs_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + if (nilfs_page_buffers_clean(&folio->page)) + __nilfs_clear_page_dirty(&folio->page); + folio_unlock(folio); } return; } if (!err) { - if (!nilfs_page_buffers_clean(page)) - __set_page_dirty_nobuffers(page); - ClearPageError(page); + if (!nilfs_page_buffers_clean(&folio->page)) + filemap_dirty_folio(folio->mapping, folio); + folio_clear_error(folio); } else { - __set_page_dirty_nobuffers(page); - SetPageError(page); + filemap_dirty_folio(folio->mapping, folio); + folio_set_error(folio); } - end_page_writeback(page); + folio_end_writeback(folio); +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + nilfs_end_folio_io(page_folio(page), err); } static void nilfs_abort_logs(struct list_head *logs, int err) From 50196f0081caef71090dc11eeb2b0793ff9449bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:18 +0900 Subject: [PATCH 0483/1562] nilfs2: convert nilfs_abort_logs to use folios Use the new folio APIs, saving five hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-3-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 1df03d0895be..730062e79bfc 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1788,7 +1788,7 @@ static void nilfs_end_page_io(struct page *page, int err) static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; if (list_empty(logs)) @@ -1798,10 +1798,10 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { clear_buffer_uptodate(bh); - if (bh->b_page != bd_page) { - if (bd_page) - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + if (bd_folio) + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } } @@ -1810,22 +1810,22 @@ static void nilfs_abort_logs(struct list_head *logs, int err) clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); - if (bh->b_page != bd_page) { - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } break; } - if (bh->b_page != fs_page) { - nilfs_end_page_io(fs_page, err); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_end_folio_io(fs_folio, err); + fs_folio = bh->b_folio; } } } - if (bd_page) - end_page_writeback(bd_page); + if (bd_folio) + folio_end_writeback(bd_folio); - nilfs_end_page_io(fs_page, err); + nilfs_end_folio_io(fs_folio, err); } static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, From 3cd36212bf75e476e52a045d1fb1a4f40a5a76b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:19 +0900 Subject: [PATCH 0484/1562] nilfs2: convert nilfs_segctor_complete_write to use folios Use the new folio APIs, saving five calls to compound_head(). This includes the last callers of nilfs_end_page_io(), so remove that too. Link: https://lkml.kernel.org/r/20231114084436.2755-4-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 49 +++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 730062e79bfc..2a058aad5c2d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1778,13 +1778,6 @@ static void nilfs_end_folio_io(struct folio *folio, int err) folio_end_writeback(folio); } -static void nilfs_end_page_io(struct page *page, int err) -{ - if (!page) - return; - nilfs_end_folio_io(page_folio(page), err); -} - static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; @@ -1867,7 +1860,7 @@ static void nilfs_set_next_segment(struct the_nilfs *nilfs, static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int update_sr = false; @@ -1878,21 +1871,21 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); - if (bh->b_page != bd_page) { - if (bd_page) - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + if (bd_folio) + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } } /* - * We assume that the buffers which belong to the same page + * We assume that the buffers which belong to the same folio * continue over the buffer list. - * Under this assumption, the last BHs of pages is - * identifiable by the discontinuity of bh->b_page - * (page != fs_page). + * Under this assumption, the last BHs of folios is + * identifiable by the discontinuity of bh->b_folio + * (folio != fs_folio). * * For B-tree node blocks, however, this assumption is not - * guaranteed. The cleanup code of B-tree node pages needs + * guaranteed. The cleanup code of B-tree node folios needs * special care. */ list_for_each_entry(bh, &segbuf->sb_payload_buffers, @@ -1905,16 +1898,16 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { - if (bh->b_page != bd_page) { - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } update_sr = true; break; } - if (bh->b_page != fs_page) { - nilfs_end_page_io(fs_page, 0); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_end_folio_io(fs_folio, 0); + fs_folio = bh->b_folio; } } @@ -1928,13 +1921,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) } } /* - * Since pages may continue over multiple segment buffers, - * end of the last page must be checked outside of the loop. + * Since folios may continue over multiple segment buffers, + * end of the last folio must be checked outside of the loop. */ - if (bd_page) - end_page_writeback(bd_page); + if (bd_folio) + folio_end_writeback(bd_folio); - nilfs_end_page_io(fs_page, 0); + nilfs_end_folio_io(fs_folio, 0); nilfs_drop_collected_inodes(&sci->sc_dirty_files); From 797e25ad106b5f0c49bdbeb6ce015acae6b93b3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:20 +0900 Subject: [PATCH 0485/1562] nilfs2: convert nilfs_forget_buffer to use a folio Save two hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-5-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 06b04758f289..3882acde1b3e 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -73,7 +73,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, */ void nilfs_forget_buffer(struct buffer_head *bh) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | @@ -81,12 +81,12 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); - if (nilfs_page_buffers_clean(page)) - __nilfs_clear_page_dirty(page); + if (nilfs_page_buffers_clean(&folio->page)) + __nilfs_clear_page_dirty(&folio->page); bh->b_blocknr = -1; - ClearPageUptodate(page); - ClearPageMappedToDisk(page); + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); unlock_buffer(bh); brelse(bh); } From 36319c0c1c6c4374949f7351a018aa922fb6ef3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:21 +0900 Subject: [PATCH 0486/1562] nilfs2: convert to nilfs_folio_buffers_clean() All callers of nilfs_page_buffers_clean() now have a folio, so convert it to take a folio. While I'm at it, make it return a bool. Link: https://lkml.kernel.org/r/20231114084436.2755-6-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 18 +++++++++--------- fs/nilfs2/page.h | 2 +- fs/nilfs2/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3882acde1b3e..29799a49c234 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -81,7 +81,7 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); - if (nilfs_page_buffers_clean(&folio->page)) + if (nilfs_folio_buffers_clean(folio)) __nilfs_clear_page_dirty(&folio->page); bh->b_blocknr = -1; @@ -131,23 +131,23 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) } /** - * nilfs_page_buffers_clean - check if a page has dirty buffers or not. - * @page: page to be checked + * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not. + * @folio: Folio to be checked. * - * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. - * Otherwise, it returns non-zero value. + * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers. + * Otherwise, it returns true. */ -int nilfs_page_buffers_clean(struct page *page) +bool nilfs_folio_buffers_clean(struct folio *folio) { struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (buffer_dirty(bh)) - return 0; + return false; bh = bh->b_this_page; } while (bh != head); - return 1; + return true; } void nilfs_page_bug(struct page *page) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index d249ea1cefff..a8ab800e689c 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -36,7 +36,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, unsigned long, unsigned long); void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); -int nilfs_page_buffers_clean(struct page *); +bool nilfs_folio_buffers_clean(struct folio *); void nilfs_page_bug(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2a058aad5c2d..888b8606a1e8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1759,7 +1759,7 @@ static void nilfs_end_folio_io(struct folio *folio, int err) * all the buffers get cleaned later. */ folio_lock(folio); - if (nilfs_page_buffers_clean(&folio->page)) + if (nilfs_folio_buffers_clean(folio)) __nilfs_clear_page_dirty(&folio->page); folio_unlock(folio); } @@ -1767,7 +1767,7 @@ static void nilfs_end_folio_io(struct folio *folio, int err) } if (!err) { - if (!nilfs_page_buffers_clean(&folio->page)) + if (!nilfs_folio_buffers_clean(folio)) filemap_dirty_folio(folio->mapping, folio); folio_clear_error(folio); } else { From b7ef8d3b2d82e0040cb1c925820fb92830c1bd51 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:22 +0900 Subject: [PATCH 0487/1562] nilfs2: convert nilfs_writepage() to use a folio Convert the incoming page to a folio. Replaces three calls to compound_head() with one. Link: https://lkml.kernel.org/r/20231114084436.2755-7-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index f861f3a0bf5c..c7ec56358a79 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -175,7 +175,8 @@ static int nilfs_writepages(struct address_space *mapping, static int nilfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; int err; if (sb_rdonly(inode->i_sb)) { @@ -186,12 +187,12 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) * So, here we simply discard this dirty page. */ nilfs_clear_dirty_page(page, false); - unlock_page(page); + folio_unlock(folio); return -EROFS; } - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); if (wbc->sync_mode == WB_SYNC_ALL) { err = nilfs_construct_segment(inode->i_sb); From 021cff9df677f108dd7cdb6c5d8189acec91682c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:23 +0900 Subject: [PATCH 0488/1562] nilfs2: convert nilfs_mdt_write_page() to use a folio Convert the incoming page to a folio. Replaces three calls to compound_head() with one. Link: https://lkml.kernel.org/r/20231114084436.2755-8-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c97c77a39668..327408512b86 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -399,7 +399,8 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; struct super_block *sb; int err = 0; @@ -407,16 +408,16 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we - * have dirty pages that try to be flushed in background. - * So, here we simply discard this dirty page. + * have dirty folios that try to be flushed in background. + * So, here we simply discard this dirty folio. */ nilfs_clear_dirty_page(page, false); - unlock_page(page); + folio_unlock(folio); return -EROFS; } - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); if (!inode) return 0; From 5d3b5903d46bfdff6f23767909a6b3c2a5d702f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:24 +0900 Subject: [PATCH 0489/1562] nilfs2: convert to nilfs_clear_folio_dirty() All callers of nilfs_clear_dirty_page() now have a folio, so rename the function and pass in the folio. Saves three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-9-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 2 +- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/page.c | 27 ++++++++++++++------------- fs/nilfs2/page.h | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c7ec56358a79..8fe784f62720 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -186,7 +186,7 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) * have dirty pages that try to be flushed in background. * So, here we simply discard this dirty page. */ - nilfs_clear_dirty_page(page, false); + nilfs_clear_folio_dirty(folio, false); folio_unlock(folio); return -EROFS; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 327408512b86..2e7952ac2f67 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -411,7 +411,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) * have dirty folios that try to be flushed in background. * So, here we simply discard this dirty folio. */ - nilfs_clear_dirty_page(page, false); + nilfs_clear_folio_dirty(folio, false); folio_unlock(folio); return -EROFS; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 29799a49c234..48a91ff059f5 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -379,7 +379,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) * was acquired. Skip processing in that case. */ if (likely(folio->mapping == mapping)) - nilfs_clear_dirty_page(&folio->page, silent); + nilfs_clear_folio_dirty(folio, silent); folio_unlock(folio); } @@ -389,32 +389,33 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) } /** - * nilfs_clear_dirty_page - discard dirty page - * @page: dirty page that will be discarded + * nilfs_clear_folio_dirty - discard dirty folio + * @folio: dirty folio that will be discarded * @silent: suppress [true] or print [false] warning messages */ -void nilfs_clear_dirty_page(struct page *page, bool silent) +void nilfs_clear_folio_dirty(struct folio *folio, bool silent) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *head; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); if (!silent) nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu", - page_offset(page), inode->i_ino); + folio_pos(folio), inode->i_ino); - ClearPageUptodate(page); - ClearPageMappedToDisk(page); + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); - bh = head = page_buffers(page); + bh = head; do { lock_buffer(bh); if (!silent) @@ -427,7 +428,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) } while (bh = bh->b_this_page, bh != head); } - __nilfs_clear_page_dirty(page); + __nilfs_clear_page_dirty(&folio->page); } unsigned int nilfs_page_count_clean_buffers(struct page *page, diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index a8ab800e689c..c419bb1f5b7d 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -41,7 +41,7 @@ void nilfs_page_bug(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); -void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_folio_dirty(struct folio *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, unsigned int); From 6609e235769cdb800e794d281dbe80dabe7e7834 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:25 +0900 Subject: [PATCH 0490/1562] nilfs2: convert to __nilfs_clear_folio_dirty() All callers now have a folio, so convert to pass a folio. No caller uses the return value, so make it return void. Removes a couple of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-10-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 19 ++++++++++--------- fs/nilfs2/page.h | 2 +- fs/nilfs2/segment.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 48a91ff059f5..94e11bcee05b 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -82,7 +82,7 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); if (nilfs_folio_buffers_clean(folio)) - __nilfs_clear_page_dirty(&folio->page); + __nilfs_clear_folio_dirty(folio); bh->b_blocknr = -1; folio_clear_uptodate(folio); @@ -428,7 +428,7 @@ void nilfs_clear_folio_dirty(struct folio *folio, bool silent) } while (bh = bh->b_this_page, bh != head); } - __nilfs_clear_page_dirty(&folio->page); + __nilfs_clear_folio_dirty(folio); } unsigned int nilfs_page_count_clean_buffers(struct page *page, @@ -458,22 +458,23 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page, * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. */ -int __nilfs_clear_page_dirty(struct page *page) +void __nilfs_clear_folio_dirty(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; if (mapping) { xa_lock_irq(&mapping->i_pages); - if (test_bit(PG_dirty, &page->flags)) { - __xa_clear_mark(&mapping->i_pages, page_index(page), + if (folio_test_dirty(folio)) { + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); - return clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); + return; } xa_unlock_irq(&mapping->i_pages); - return 0; + return; } - return TestClearPageDirty(page); + folio_clear_dirty(folio); } /** diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index c419bb1f5b7d..968b311d265b 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -30,7 +30,7 @@ BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ -int __nilfs_clear_page_dirty(struct page *); +void __nilfs_clear_folio_dirty(struct folio *); struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, unsigned long, unsigned long); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 888b8606a1e8..8c675c118c66 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1760,7 +1760,7 @@ static void nilfs_end_folio_io(struct folio *folio, int err) */ folio_lock(folio); if (nilfs_folio_buffers_clean(folio)) - __nilfs_clear_page_dirty(&folio->page); + __nilfs_clear_folio_dirty(folio); folio_unlock(folio); } return; From ff5710c3f3c2ade105592f7964350cf637cd2b75 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:26 +0900 Subject: [PATCH 0491/1562] nilfs2: convert nilfs_segctor_prepare_write to use folios Use the new folio APIs, saving 17 hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-11-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 8c675c118c66..52995838f2de 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1665,39 +1665,39 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) return 0; } -static void nilfs_begin_page_io(struct page *page) +static void nilfs_begin_folio_io(struct folio *folio) { - if (!page || PageWriteback(page)) + if (!folio || folio_test_writeback(folio)) /* * For split b-tree node pages, this function may be called * twice. We ignore the 2nd or later calls by this check. */ return; - lock_page(page); - clear_page_dirty_for_io(page); - set_page_writeback(page); - unlock_page(page); + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + folio_unlock(folio); } static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - if (bh->b_page != bd_page) { - if (bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); + if (bh->b_folio != bd_folio) { + if (bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); } - bd_page = bh->b_page; + bd_folio = bh->b_folio; } } @@ -1705,28 +1705,28 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { - if (bh->b_page != bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); + bd_folio = bh->b_folio; } break; } - if (bh->b_page != fs_page) { - nilfs_begin_page_io(fs_page); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_begin_folio_io(fs_folio); + fs_folio = bh->b_folio; } } } - if (bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); + if (bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); } - nilfs_begin_page_io(fs_page); + nilfs_begin_folio_io(fs_folio); } static int nilfs_segctor_write(struct nilfs_sc_info *sci, From 5a5cad8cb2e3ef5427b17d01c644b4cffd32af24 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:27 +0900 Subject: [PATCH 0492/1562] nilfs2: convert nilfs_page_mkwrite() to use a folio Using the new folio APIs saves seven hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-12-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/file.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 740ce26d1e76..bec33b89a075 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -45,34 +45,36 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; + struct buffer_head *bh, *head; int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ sb_start_pagefault(inode->i_sb); - lock_page(page); - if (page->mapping != inode->i_mapping || - page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode) || + !folio_test_uptodate(folio)) { + folio_unlock(folio); ret = -EFAULT; /* make the VM retry the fault */ goto out; } /* - * check to see if the page is mapped already (no holes) + * check to see if the folio is mapped already (no holes) */ - if (PageMappedToDisk(page)) + if (folio_test_mappedtodisk(folio)) goto mapped; - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { int fully_mapped = 1; - bh = head = page_buffers(page); + bh = head; do { if (!buffer_mapped(bh)) { fully_mapped = 0; @@ -81,11 +83,11 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); goto mapped; } } - unlock_page(page); + folio_unlock(folio); /* * fill hole blocks @@ -105,7 +107,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - wait_for_stable_page(page); + folio_wait_stable(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); From 83d9638ded878d67633b2f82dcea606b8f116b96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:28 +0900 Subject: [PATCH 0493/1562] nilfs2: convert nilfs_mdt_create_block to use a folio Saves two calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-13-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2e7952ac2f67..7e4dcff2c94b 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -97,8 +97,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, } failed_bh: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); brelse(bh); failed_unlock: From 319a12c0462061e1435e32dfbdf57304938e9f90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:29 +0900 Subject: [PATCH 0494/1562] nilfs2: convert nilfs_mdt_submit_block to use a folio Saves two calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-14-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 7e4dcff2c94b..e45c01a559c0 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -158,8 +158,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf, *out_bh = bh; failed_bh: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); brelse(bh); failed: return ret; From af01ea51488847f34b73c5dd811ede95ac027986 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:30 +0900 Subject: [PATCH 0495/1562] nilfs2: convert nilfs_gccache_submit_read_data to use a folio Saves two calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-15-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/gcinode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 8beb2730929d..bf9a11d58817 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -98,8 +98,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, *out_bh = bh; failed: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); if (unlikely(err)) brelse(bh); return err; From d80cb7777e18a1c0bcd1e660e6d8fffd257862aa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:31 +0900 Subject: [PATCH 0496/1562] nilfs2: convert nilfs_btnode_create_block to use a folio Saves two calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-16-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5710833ac1cc..691a50410ea9 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -64,8 +64,8 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) set_buffer_mapped(bh); set_buffer_uptodate(bh); - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); return bh; } From 10c6cca9c3233d6328eb192821a647dc8fdffd0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:32 +0900 Subject: [PATCH 0497/1562] nilfs2: convert nilfs_btnode_submit_block to use a folio Saves two calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-17-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 691a50410ea9..5ef9eebd8d2e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -75,7 +75,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, { struct buffer_head *bh; struct inode *inode = btnc->host; - struct page *page; + struct folio *folio; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); @@ -83,7 +83,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ - page = bh->b_page; + folio = bh->b_folio; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -130,8 +130,8 @@ found: *pbh = bh; out_locked: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } From cf62eb2c7a74aae8ef5bee000cf4ac1f77af6fad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:33 +0900 Subject: [PATCH 0498/1562] nilfs2: convert nilfs_btnode_delete to use a folio Saves six calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-18-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5ef9eebd8d2e..e077d4a7a11c 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -145,19 +145,19 @@ out_locked: void nilfs_btnode_delete(struct buffer_head *bh) { struct address_space *mapping; - struct page *page = bh->b_page; - pgoff_t index = page_index(page); + struct folio *folio = bh->b_folio; + pgoff_t index = folio->index; int still_dirty; - get_page(page); - lock_page(page); - wait_on_page_writeback(page); + folio_get(folio); + folio_lock(folio); + folio_wait_writeback(folio); nilfs_forget_buffer(bh); - still_dirty = PageDirty(page); - mapping = page->mapping; - unlock_page(page); - put_page(page); + still_dirty = folio_test_dirty(folio); + mapping = folio->mapping; + folio_unlock(folio); + folio_put(folio); if (!still_dirty && mapping) invalidate_inode_pages2_range(mapping, index, index); From 7c5c654c09c3d08ed04fb19ff0798784027eb33a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:34 +0900 Subject: [PATCH 0499/1562] nilfs2: convert nilfs_btnode_prepare_change_key to use a folio Saves three calls to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-19-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e077d4a7a11c..da3e4366625f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -185,23 +185,23 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - struct page *opage = obh->b_page; - lock_page(opage); + struct folio *ofolio = obh->b_folio; + folio_lock(ofolio); retry: /* BUG_ON(oldkey != obh->b_folio->index); */ - if (unlikely(oldkey != opage->index)) - NILFS_PAGE_BUG(opage, + if (unlikely(oldkey != ofolio->index)) + NILFS_PAGE_BUG(&ofolio->page, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); + err = __xa_insert(&btnc->i_pages, newkey, ofolio, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* - * Note: page->index will not change to newkey until + * Note: folio->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. - * To protect the page in intermediate state, the page lock + * To protect the folio in intermediate state, the folio lock * is held. */ if (!err) @@ -213,7 +213,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(opage); + folio_unlock(ofolio); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -225,7 +225,7 @@ retry: return 0; failed_unlock: - unlock_page(obh->b_page); + folio_unlock(obh->b_folio); return err; } From c2a491f3d88a7d94fed070fe48c859dfc5c9d47c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:35 +0900 Subject: [PATCH 0500/1562] nilfs2: convert nilfs_btnode_commit_change_key to use a folio Saves one call to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-20-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index da3e4366625f..fb1638765d54 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -238,15 +238,15 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, { struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; - struct page *opage; + struct folio *ofolio; if (oldkey == newkey) return; if (nbh == NULL) { /* blocksize == pagesize */ - opage = obh->b_page; - if (unlikely(oldkey != opage->index)) - NILFS_PAGE_BUG(opage, + ofolio = obh->b_folio; + if (unlikely(oldkey != ofolio->index)) + NILFS_PAGE_BUG(&ofolio->page, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); @@ -257,8 +257,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); - opage->index = obh->b_blocknr = newkey; - unlock_page(opage); + ofolio->index = obh->b_blocknr = newkey; + folio_unlock(ofolio); } else { nilfs_copy_buffer(nbh, obh); mark_buffer_dirty(nbh); From 2f0eff2054aa6894fab0e75e48649388b6f4b242 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 14 Nov 2023 17:44:36 +0900 Subject: [PATCH 0501/1562] nilfs2: convert nilfs_btnode_abort_change_key to use a folio Saves one call to compound_head(). Link: https://lkml.kernel.org/r/20231114084436.2755-21-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index fb1638765d54..1204dd06ead8 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -284,7 +284,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_erase_irq(&btnc->i_pages, newkey); - unlock_page(ctxt->bh->b_page); + folio_unlock(ctxt->bh->b_folio); } else { /* * When canceling a buffer that a prepare operation has From f72709ab69430d986dfc5a08c9a86f625e3fed33 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:36 +0100 Subject: [PATCH 0502/1562] arch: remove ARCH_THREAD_STACK_ALLOCATOR Patch series "Remove unused code after IA-64 removal". While looking into something different I noticed that there are a couple of Kconfig options which were only selected by IA-64 and which are now unused. So remove them and simplify the code a bit. This patch (of 3): IA-64 was the only architecture which selected ARCH_THREAD_STACK_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as well. Link: https://lkml.kernel.org/r/20231116133638.1636277-1-hca@linux.ibm.com Link: https://lkml.kernel.org/r/20231116133638.1636277-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 4 ---- kernel/fork.c | 20 -------------------- 2 files changed, 24 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab0612..310162b41a1c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,10 +320,6 @@ config HAVE_ARCH_THREAD_STRUCT_WHITELIST should be implemented. Without this, the entire thread_struct field in task_struct will be left whitelisted. -# Select if arch has its private alloc_thread_stack() function -config ARCH_THREAD_STACK_ALLOCATOR - bool - # Select if arch wants to size task_struct dynamically via arch_task_struct_size: config ARCH_WANTS_DYNAMIC_TASK_STRUCT bool diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..d071809866e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -179,8 +179,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR - /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. @@ -412,24 +410,6 @@ void thread_stack_cache_init(void) } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ -#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ - -static int alloc_thread_stack_node(struct task_struct *tsk, int node) -{ - unsigned long *stack; - - stack = arch_alloc_thread_stack_node(tsk, node); - tsk->stack = stack; - return stack ? 0 : -ENOMEM; -} - -static void free_thread_stack(struct task_struct *tsk) -{ - arch_free_thread_stack(tsk); - tsk->stack = NULL; -} - -#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; From 3888750e21ccb909051c810cc79fcc0650a740f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:37 +0100 Subject: [PATCH 0503/1562] arch: remove ARCH_TASK_STRUCT_ALLOCATOR IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as well. Link: https://lkml.kernel.org/r/20231116133638.1636277-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 5 ----- kernel/fork.c | 6 ------ 2 files changed, 11 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 310162b41a1c..c2f87ef9f0ae 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -305,13 +305,8 @@ config ARCH_HAS_CPU_FINALIZE_INIT config ARCH_TASK_STRUCT_ON_STACK bool -# Select if arch has its private alloc_task_struct() function -config ARCH_TASK_STRUCT_ALLOCATOR - bool - config HAVE_ARCH_THREAD_STRUCT_WHITELIST bool - depends on !ARCH_TASK_STRUCT_ALLOCATOR help An architecture should select this to provide hardened usercopy knowledge about what region of the thread_struct should be diff --git a/kernel/fork.c b/kernel/fork.c index d071809866e0..ce8a4b8c04e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,7 +165,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk) { } -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) @@ -177,7 +176,6 @@ static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } -#endif /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -1001,7 +999,6 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ @@ -1016,12 +1013,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } -#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { int i; -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif @@ -1034,7 +1029,6 @@ void __init fork_init(void) arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); -#endif /* do the arch specific task caches init */ arch_task_cache_init(); From 0eb5085c38749f2a91e5bd8cbebb1ebf3398343c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:38 +0100 Subject: [PATCH 0504/1562] arch: remove ARCH_TASK_STRUCT_ON_STACK IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ON_STACK. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_TASK_STRUCT_ON_STACK as well. Note: this also reveals a potential bug in powerpc code, which makes use of __init_task_data without selecting ARCH_TASK_STRUCT_ON_STACK which makes __init_task_data a no-op. This is broken since commit d11ed3ab3166 ("Expand INIT_TASK() in init/init_task.c and remove") from 2018 and needs to be addressed separately. Link: https://lkml.kernel.org/r/20231116133638.1636277-4-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 4 ---- arch/powerpc/kexec/core_64.c | 3 +-- include/linux/init_task.h | 7 ------- include/linux/sched.h | 2 -- init/init_task.c | 10 ++-------- 5 files changed, 3 insertions(+), 23 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index c2f87ef9f0ae..bfcc7c2dc039 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -301,10 +301,6 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED config ARCH_HAS_CPU_FINALIZE_INIT bool -# Select if arch init_task must go in the __init_task_data section -config ARCH_TASK_STRUCT_ON_STACK - bool - config HAVE_ARCH_THREAD_STRUCT_WHITELIST bool help diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 0bee7ca9a77c..762e4d09aacf 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -283,8 +283,7 @@ static void kexec_prepare_cpus(void) * We could use a smaller stack if we don't care about anything using * current, but that audit has not been performed. */ -static union thread_union kexec_stack __init_task_data = - { }; +static union thread_union kexec_stack = { }; /* * For similar reasons to the stack above, the kexecing CPU needs to be on a diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 40fc5813cf93..bccb3f1f6262 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -37,13 +37,6 @@ extern struct cred init_cred; #define INIT_TASK_COMM "swapper" -/* Attach to the init_task data structure for proper alignment */ -#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -#define __init_task_data __section(".data..init_task") -#else -#define __init_task_data /**/ -#endif - /* Attach to the thread_info data structure for proper alignment */ #define __init_thread_info __section(".data..init_thread_info") diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..c2ecb2e06046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1955,9 +1955,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; -#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif diff --git a/init/init_task.c b/init/init_task.c index 5727d42149c3..6f6485d554df 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -51,8 +51,7 @@ static struct sighand_struct init_sighand = { }; #ifdef CONFIG_SHADOW_CALL_STACK -unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] - __init_task_data = { +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = { [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC }; #endif @@ -61,12 +60,7 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) */ -struct task_struct init_task -#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK - __init_task_data -#endif - __aligned(L1_CACHE_BYTES) -= { +struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_THREAD_INFO_IN_TASK .thread_info = INIT_THREAD_INFO(init_task), .stack_refcount = REFCOUNT_INIT(1), From 71aa3419e98f6e23bddc3aca9ec4ac368836a109 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 9 Nov 2023 16:51:38 +0900 Subject: [PATCH 0505/1562] checkpatch: do not require an empty line before error injection ALLOW_ERROR_INJECTION macro (just like EXPORT_SYMBOL) can immediately follow a function it annotates. Link: https://lkml.kernel.org/r/20231109075147.2779461-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Joe Perches Cc: Andy Whitcroft (maintainer:CHECKPATCH) Cc: Dwaipayan Ray (reviewer:CHECKPATCH) Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 25fdb7fda112..a94ed6c46a6d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4054,7 +4054,7 @@ sub process { if ($prevline =~ /^[\+ ]};?\s*$/ && $line =~ /^\+/ && !($line =~ /^\+\s*$/ || - $line =~ /^\+\s*(?:EXPORT_SYMBOL|early_param)/ || + $line =~ /^\+\s*(?:EXPORT_SYMBOL|early_param|ALLOW_ERROR_INJECTION)/ || $line =~ /^\+\s*MODULE_/i || $line =~ /^\+\s*\#\s*(?:end|elif|else)/ || $line =~ /^\+[a-z_]*init/ || From 48aa137e5a9491b491ae2bea0e0a603b330e708f Mon Sep 17 00:00:00 2001 From: Ariel Miculas Date: Fri, 17 Nov 2023 18:12:14 +0200 Subject: [PATCH 0506/1562] docs: filesystems: document the squashfs specific mount options When SQUASHFS_CHOICE_DECOMP_BY_MOUNT is set, the "threads" mount option can be used to specify the decompression mode: single-threaded, multi-threaded, percpu or the number of threads used for decompression. When SQUASHFS_CHOICE_DECOMP_BY_MOUNT is not set, SQUASHFS_DECOMP_MULTI and SQUASHFS_MOUNT_DECOMP_THREADS are both set, the "threads" option can also be used to specify the number of threads used for decompression. This mount option is only mentioned in fs/squashfs/Kconfig, which makes it difficult to find. Another mount option available is "errors", which can be configured to panic the kernel when squashfs errors are encountered. Add both these options to the squashfs documentation, making them more noticeable. Link: https://lkml.kernel.org/r/20231117161215.140282-1-amiculas@cisco.com Signed-off-by: Ariel Miculas Reviewed-by: Bagas Sanjaya Reviewed-by: Phillip Lougher Cc: Jonathan Corbet Cc: Serge Hallyn Signed-off-by: Andrew Morton --- Documentation/filesystems/squashfs.rst | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/Documentation/filesystems/squashfs.rst b/Documentation/filesystems/squashfs.rst index df42106bae71..4af8d6207509 100644 --- a/Documentation/filesystems/squashfs.rst +++ b/Documentation/filesystems/squashfs.rst @@ -64,6 +64,66 @@ obtained from this site also. The squashfs-tools development tree is now located on kernel.org git://git.kernel.org/pub/scm/fs/squashfs/squashfs-tools.git +2.1 Mount options +----------------- +=================== ========================================================= +errors=%s Specify whether squashfs errors trigger a kernel panic + or not + + ========== ============================================= + continue errors don't trigger a panic (default) + panic trigger a panic when errors are encountered, + similar to several other filesystems (e.g. + btrfs, ext4, f2fs, GFS2, jfs, ntfs, ubifs) + + This allows a kernel dump to be saved, + useful for analyzing and debugging the + corruption. + ========== ============================================= +threads=%s Select the decompression mode or the number of threads + + If SQUASHFS_CHOICE_DECOMP_BY_MOUNT is set: + + ========== ============================================= + single use single-threaded decompression (default) + + Only one block (data or metadata) can be + decompressed at any one time. This limits + CPU and memory usage to a minimum, but it + also gives poor performance on parallel I/O + workloads when using multiple CPU machines + due to waiting on decompressor availability. + multi use up to two parallel decompressors per core + + If you have a parallel I/O workload and your + system has enough memory, using this option + may improve overall I/O performance. It + dynamically allocates decompressors on a + demand basis. + percpu use a maximum of one decompressor per core + + It uses percpu variables to ensure + decompression is load-balanced across the + cores. + 1|2|3|... configure the number of threads used for + decompression + + The upper limit is num_online_cpus() * 2. + ========== ============================================= + + If SQUASHFS_CHOICE_DECOMP_BY_MOUNT is **not** set and + SQUASHFS_DECOMP_MULTI, SQUASHFS_MOUNT_DECOMP_THREADS are + both set: + + ========== ============================================= + 2|3|... configure the number of threads used for + decompression + + The upper limit is num_online_cpus() * 2. + ========== ============================================= + +=================== ========================================================= + 3. Squashfs Filesystem Design ----------------------------- From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Nov 2023 16:16:49 +0100 Subject: [PATCH 0507/1562] kernel/signal.c: simplify force_sig_info_to_task(), kill recalc_sigpending_and_wake() The purpose of recalc_sigpending_and_wake() is not clear, it looks "obviously unneeded" because we are going to send the signal which can't be blocked or ignored. Add the comment to explain why we can't rely on send_signal_locked() and make this logic more simple/explicit. recalc_sigpending_and_wake() has no other users, it can die. In fact I think we don't even need signal_wake_up(), the target task must be either current or a TASK_TRACED child, otherwise the usage of siglock is not safe. But this needs another change. Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 1 - kernel/signal.c | 17 ++++------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 41d6759d6a4a..015c0e3a3e1d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); diff --git a/kernel/signal.c b/kernel/signal.c index 5aa216e841a2..c9c57d053ce4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t) return false; } -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. - */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) @@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; - if (blocked) { + if (blocked) sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect @@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); + /* This can happen if the signal was already pending and blocked */ + if (!task_sigpending(t)) + signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; From fe1a25eb059b215949825d4c81e26b100e6816a9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Nov 2023 19:37:18 +0100 Subject: [PATCH 0508/1562] checkstack: sort output by size and function name Sort output by size and in addition by function name. This increases readability for cases where there are many functions with the same stack usage. Link: https://lkml.kernel.org/r/20231120183719.2188479-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens Cc: Maninder Singh Cc: Masahiro Yamada Cc: Vaneet Narang Signed-off-by: Andrew Morton --- scripts/checkstack.pl | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index f27d552aec43..13408714ba0f 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -189,5 +189,20 @@ if ($total_size > $min_stack) { push @stack, "$intro$total_size\n"; } -# Sort output by size (last field) -print sort { ($b =~ /:\t*(\d+)$/)[0] <=> ($a =~ /:\t*(\d+)$/)[0] } @stack; +# Sort output by size (last field) and function name if size is the same +sub sort_lines { + my ($a, $b) = @_; + + my $num_a = $1 if $a =~ /:\t*(\d+)$/; + my $num_b = $1 if $b =~ /:\t*(\d+)$/; + my $func_a = $1 if $a =~ / (.*):/; + my $func_b = $1 if $b =~ / (.*):/; + + if ($num_a != $num_b) { + return $num_b <=> $num_a; + } else { + return $func_a cmp $func_b; + } +} + +print sort { sort_lines($a, $b) } @stack; From 66242cfafeea59a0199250dda3dc98736782a739 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Nov 2023 19:37:19 +0100 Subject: [PATCH 0509/1562] checkstack: allow to pass MINSTACKSIZE parameter The checkstack script omits all functions with a stack usage of less than 100 bytes. However the script already has support for a parameter which allows to override the default, but it cannot be set with $ make checkstack Add a MINSTACKSIZE parameter which allows to change the default. This might be useful in order to print the stack usage of all functions, or only those with large stack usage: $ make checkstack MINSTACKSIZE=0 $ make checkstack MINSTACKSIZE=800 Link: https://lkml.kernel.org/r/20231120183719.2188479-4-hca@linux.ibm.com Signed-off-by: Heiko Carstens Cc: Maninder Singh Cc: Masahiro Yamada Cc: Vaneet Narang Signed-off-by: Andrew Morton --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 511b5616aa41..5d69bc326204 100644 --- a/Makefile +++ b/Makefile @@ -1576,7 +1576,8 @@ help: echo ' (default: $(INSTALL_HDR_PATH))'; \ echo '' @echo 'Static analysers:' - @echo ' checkstack - Generate a list of stack hogs' + @echo ' checkstack - Generate a list of stack hogs and consider all functions' + @echo ' with a stack size larger than MINSTACKSIZE (default: 100)' @echo ' versioncheck - Sanity check on version.h usage' @echo ' includecheck - Check for duplicate included header files' @echo ' export_report - List the usages of all exported symbols' @@ -2016,9 +2017,10 @@ CHECKSTACK_ARCH := $(SUBARCH) else CHECKSTACK_ARCH := $(ARCH) endif +MINSTACKSIZE ?= 100 checkstack: $(OBJDUMP) -d vmlinux $$(find . -name '*.ko') | \ - $(PERL) $(srctree)/scripts/checkstack.pl $(CHECKSTACK_ARCH) + $(PERL) $(srctree)/scripts/checkstack.pl $(CHECKSTACK_ARCH) $(MINSTACKSIZE) kernelrelease: @$(filechk_kernel.release) From 27bbb2a0fddf70e185e800cd78f0142d45330c6c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Nov 2023 17:26:50 +0100 Subject: [PATCH 0510/1562] __ptrace_unlink: kill the obsolete "FIXME" code The corner case described by the comment is no longer possible after the commit 7b3c36fc4c23 ("ptrace: fix task_join_group_stop() for the case when current is traced"), task_join_group_stop() ensures that the new thread has the correct signr in JOBCTL_STOP_SIGMASK regardless of ptrace. Link: https://lkml.kernel.org/r/20231121162650.GA6635@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..3617213c3d8a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { + child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. - * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child From 1ee918ffa6d4776a69708b013fd7e7006619158a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 22 Nov 2023 10:40:37 +0000 Subject: [PATCH 0511/1562] scripts/spelling.txt: add more spellings to spelling.txt Some of the more common spelling mistakes and typos that I've found while fixing up spelling mistakes in the kernel over the past couple of releases. Link: https://lkml.kernel.org/r/20231122104037.1770749-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton --- scripts/spelling.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 855c4863124b..edec60d39bbf 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -26,6 +26,7 @@ accelaration||acceleration accelearion||acceleration acceleratoin||acceleration accelleration||acceleration +accelrometer||accelerometer accesing||accessing accesnt||accent accessable||accessible @@ -137,6 +138,7 @@ anniversery||anniversary annoucement||announcement anomolies||anomalies anomoly||anomaly +anonynous||anonymous anway||anyway aplication||application appearence||appearance @@ -267,6 +269,7 @@ cadidate||candidate cahces||caches calender||calendar calescing||coalescing +calibraiton||calibration calle||called callibration||calibration callled||called @@ -288,6 +291,7 @@ capabitilies||capabilities capablity||capability capatibilities||capabilities capapbilities||capabilities +captuer||capture caputure||capture carefuly||carefully cariage||carriage @@ -340,6 +344,7 @@ comminucation||communication commited||committed commiting||committing committ||commit +commmand||command commnunication||communication commoditiy||commodity comsume||consume @@ -406,6 +411,7 @@ continious||continuous continous||continuous continously||continuously continueing||continuing +contiuous||continuous contraints||constraints contruct||construct contol||control @@ -757,6 +763,7 @@ hardward||hardware havind||having heirarchically||hierarchically heirarchy||hierarchy +heirachy||hierarchy helpfull||helpful hearbeat||heartbeat heterogenous||heterogeneous @@ -1199,6 +1206,7 @@ priting||printing privilaged||privileged privilage||privilege priviledge||privilege +priviledged||privileged priviledges||privileges privleges||privileges probaly||probably @@ -1251,6 +1259,7 @@ purgable||purgeable pwoer||power queing||queuing quering||querying +querrying||querying queus||queues randomally||randomly raoming||roaming @@ -1324,6 +1333,7 @@ reseting||resetting reseved||reserved reseverd||reserved resizeable||resizable +resonable||reasonable resotre||restore resouce||resource resouces||resources @@ -1427,6 +1437,7 @@ sliped||slipped softwade||software softwares||software soley||solely +soluation||solution souce||source speach||speech specfic||specific @@ -1458,6 +1469,7 @@ standart||standard standy||standby stardard||standard staticly||statically +statisitcs||statistics statuss||status stoped||stopped stoping||stopping @@ -1548,6 +1560,7 @@ threds||threads threee||three threshhold||threshold thresold||threshold +throtting||throttling throught||through tansition||transition trackling||tracking @@ -1571,6 +1584,7 @@ tranasction||transaction tranceiver||transceiver tranfer||transfer tranmission||transmission +tranport||transport transcevier||transceiver transciever||transceiver transferd||transferred From 0311d8272406b2ec47f485bef887723cc352a489 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 14 Nov 2023 17:12:01 +0100 Subject: [PATCH 0512/1562] kexec: use atomic_try_cmpxchg in crash_kexec Use atomic_try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in crash_kexec(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. No functional change intended. Link: https://lkml.kernel.org/r/20231114161228.108516-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index be5642a4ec49..bc4c096ab1f3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1063,9 +1063,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu == PANIC_CPU_INVALID) { + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); From 0f0d2871e78db648a2578abbeb9103f484f9b754 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Nov 2023 09:07:38 +0100 Subject: [PATCH 0513/1562] arch: turn off -Werror for architectures with known warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A couple of architectures enable -Werror for their own files regardless of CONFIG_WERROR but also have known warnings that fail the build with -Wmissing-prototypes enabled by default: arch/alpha/lib/memcpy.c:153:8: error: no previous prototype for 'memcpy' [-Werror=missing-prototypes] arch/alpha/kernel/irq.c:96:1: error: no previous prototype for 'handle_irq' [-Werror=missing-prototypes] arch/mips/kernel/signal.c:673:17: error: no previous prototype for ‘sys_rt_sigreturn’ [-Werror=missing-prototypes] arch/mips/kernel/signal.c:636:17: error: no previous prototype for ‘sys_sigreturn’ [-Werror=missing-prototypes] arch/mips/kernel/syscall.c:51:16: error: no previous prototype for ‘sysm_pipe’ [-Werror=missing-prototypes] arch/mips/mm/fault.c:323:17: error: no previous prototype for ‘do_page_fault’ [-Werror=missing-prototypes] arch/sparc/vdso/vma.c:246:12: warning: no previous prototype for ‘init_vdso_image’ [-Wmissing-prototypes]v arch/sparc/vdso/vdso32/../vclock_gettime.c:343:1: warning: no previous prototype for ‘__vdso_gettimeofday_stick’ [-Wmissing-prototypes] arch/sparc/vdso/vclock_gettime.c:343:1: warning: no previous prototype for ‘__vdso_gettimeofday_stick’ [-Wmissing-prototypes] arch/sparc/prom/p1275.c:52:6: warning: no previous prototype for ‘prom_cif_init’ [-Wmissing-prototypes] arch/sparc/prom/misc_64.c:165:5: warning: no previous prototype for ‘prom_get_mmu_ihandle’ [-Wmissing-prototypes] This appears to be an artifact from the times when this architecture code was better maintained that most device drivers and before CONFIG_WERROR was added. Now it just gets in the way, so remove all of these. Powerpc and x86 both still have their own Kconfig options to enable -Werror for some of their files. These architectures are better maintained than most and the options are easy to disable, so leave those untouched. Link: https://lkml.kernel.org/r/4be73872-c1f5-4c31-8201-712c19290a22@app.fastmail.com Signed-off-by: Arnd Bergmann Reported-by: Stephen Rothwell Cc: "David S. Miller" Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/alpha/lib/Makefile | 1 - arch/alpha/mm/Makefile | 2 -- arch/mips/Kbuild | 6 ------ arch/sparc/kernel/Makefile | 1 - arch/sparc/lib/Makefile | 1 - arch/sparc/mm/Makefile | 1 - arch/sparc/prom/Makefile | 1 - 7 files changed, 13 deletions(-) diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile index 1cc74f7b50ef..6a779b9018fd 100644 --- a/arch/alpha/lib/Makefile +++ b/arch/alpha/lib/Makefile @@ -4,7 +4,6 @@ # asflags-y := $(KBUILD_CFLAGS) -ccflags-y := -Werror # Many of these routines have implementations tuned for ev6. # Choose them iff we're targeting ev6 specifically. diff --git a/arch/alpha/mm/Makefile b/arch/alpha/mm/Makefile index bd770302eb82..101dbd06b4ce 100644 --- a/arch/alpha/mm/Makefile +++ b/arch/alpha/mm/Makefile @@ -3,6 +3,4 @@ # Makefile for the linux alpha-specific parts of the memory manager. # -ccflags-y := -Werror - obj-y := init.o fault.o diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild index af2967bffb73..e2d623621a00 100644 --- a/arch/mips/Kbuild +++ b/arch/mips/Kbuild @@ -1,10 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# Fail on warnings - also for files referenced in subdirs -# -Werror can be disabled for specific files using: -# CFLAGS_ := -Wno-error -ifeq ($(W),) -subdir-ccflags-y := -Werror -endif # platform specific definitions include $(srctree)/arch/mips/Kbuild.platforms diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 0984bb6f0f17..58ea4ef9b622 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -5,7 +5,6 @@ # asflags-y := -ansi -ccflags-y := -Werror # Undefine sparc when processing vmlinux.lds - it is used # And teach CPP we are doing $(BITS) builds (for this case) diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 063556fe2cb1..59669ebddd4e 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -3,7 +3,6 @@ # asflags-y := -ansi -DST_DIV0=0x02 -ccflags-y := -Werror lib-$(CONFIG_SPARC32) += ashrdi3.o lib-$(CONFIG_SPARC32) += memcpy.o memset.o diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index 871354aa3c00..809d993f6d88 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -3,7 +3,6 @@ # asflags-y := -ansi -ccflags-y := -Werror obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o obj-y += fault_$(BITS).o diff --git a/arch/sparc/prom/Makefile b/arch/sparc/prom/Makefile index 397b79af77f7..a1adc75d8055 100644 --- a/arch/sparc/prom/Makefile +++ b/arch/sparc/prom/Makefile @@ -3,7 +3,6 @@ # Linux. # asflags := -ansi -ccflags := -Werror lib-y := bootstr_$(BITS).o lib-y += init_$(BITS).o From 014a5c107d0c45e259f87d3168f6a01e3e195637 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:14 -0700 Subject: [PATCH 0514/1562] hexagon: uaccess: remove clear_user_hexagon() Patch series "hexagon: Fix up instances of -Wmissing-prototypes". This series fixes all the instances of -Wmissing-prototypes in arch/hexagon, as it is about to be enabled globally in a default build. This patch (of 19): Clang warns: arch/hexagon/mm/uaccess.c:39:15: warning: no previous prototype for function 'clear_user_hexagon' [-Wmissing-prototypes] 39 | unsigned long clear_user_hexagon(void __user *dest, unsigned long count) | ^ arch/hexagon/mm/uaccess.c:39:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 39 | unsigned long clear_user_hexagon(void __user *dest, unsigned long count) | ^ | static 1 warning generated. This function appears to have been unused since it was introduced in commit 7567746e1c0d ("Hexagon: Add user access functions"), so remove it. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-0-5c34714afe9e@kernel.org Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-1-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/uaccess.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/hexagon/mm/uaccess.c b/arch/hexagon/mm/uaccess.c index 650bca92f0b7..3204e9ba6d6f 100644 --- a/arch/hexagon/mm/uaccess.c +++ b/arch/hexagon/mm/uaccess.c @@ -35,11 +35,3 @@ __kernel_size_t __clear_user_hexagon(void __user *dest, unsigned long count) return count; } - -unsigned long clear_user_hexagon(void __user *dest, unsigned long count) -{ - if (!access_ok(dest, count)) - return count; - else - return __clear_user_hexagon(dest, count); -} From 600acbea29533db8906ed172b89eb10cd0d5413a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:15 -0700 Subject: [PATCH 0515/1562] hexagon: mm: mark paging_init() as static Clang warns: arch/hexagon/mm/init.c:89:13: warning: no previous prototype for function 'paging_init' [-Wmissing-prototypes] 89 | void __init paging_init(void) | ^ arch/hexagon/mm/init.c:89:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 89 | void __init paging_init(void) | ^ | static This function is only used within this translation unit, so mark it static as suggested. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-2-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 146115c9de61..f164b377b93b 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -86,7 +86,7 @@ void sync_icache_dcache(pte_t pte) * In this mode, we only have one pg_data_t * structure: contig_mem_data. */ -void __init paging_init(void) +static void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; From bba07109f57d1299cd5551eb948ce182d711c221 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:16 -0700 Subject: [PATCH 0516/1562] hexagon: mm: include asm/setup.h for setup_arch_memory()'s prototype Clang warns: arch/hexagon/mm/init.c:138:13: warning: no previous prototype for function 'setup_arch_memory' [-Wmissing-prototypes] 138 | void __init setup_arch_memory(void) | ^ arch/hexagon/mm/init.c:138:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 138 | void __init setup_arch_memory(void) | ^ | static The prototype is in asm/setup.h, include it to clear up the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-3-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f164b377b93b..3458f39ca2ac 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include #include /* From ef14250ec7d42a0e993bd341db078ecb33900a16 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:17 -0700 Subject: [PATCH 0517/1562] hexagon: smp: mark handle_ipi() and start_secondary() as static Clang warns: arch/hexagon/kernel/smp.c:82:13: warning: no previous prototype for function 'handle_ipi' [-Wmissing-prototypes] 82 | irqreturn_t handle_ipi(int irq, void *desc) | ^ arch/hexagon/kernel/smp.c:82:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 82 | irqreturn_t handle_ipi(int irq, void *desc) | ^ | static arch/hexagon/kernel/smp.c:127:6: warning: no previous prototype for function 'start_secondary' [-Wmissing-prototypes] 127 | void start_secondary(void) | ^ arch/hexagon/kernel/smp.c:127:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 127 | void start_secondary(void) | ^ | static 2 warnings generated. These functions are not used outside of this translation unit, so mark them as static. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-4-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 4e8bee25b8c6..608884bc3396 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -79,7 +79,7 @@ void smp_vm_unmask_irq(void *info) * Specifically, first arg is irq, second is the irq_desc. */ -irqreturn_t handle_ipi(int irq, void *desc) +static irqreturn_t handle_ipi(int irq, void *desc) { int cpu = smp_processor_id(); struct ipi_data *ipi = &per_cpu(ipi_data, cpu); @@ -124,7 +124,7 @@ void __init smp_prepare_boot_cpu(void) * to point to current thread info */ -void start_secondary(void) +static void start_secondary(void) { unsigned long thread_ptr; unsigned int cpu, irq; From d9d106ce60760ae020f39f5a2d783fe92d401f8f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:18 -0700 Subject: [PATCH 0518/1562] hexagon: vm_fault: mark do_page_fault() as static Clang warns: arch/hexagon/mm/vm_fault.c:36:6: warning: no previous prototype for function 'do_page_fault' [-Wmissing-prototypes] 36 | void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) | ^ arch/hexagon/mm/vm_fault.c:36:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 36 | void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) | ^ | static This function is not used outside of this translation unit, so mark it as static. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-5-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/vm_fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 7295ea3f8cc8..ab0f0a791e00 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -33,7 +33,7 @@ /* * Canonical page fault handler */ -void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) +static void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; From 8126fafece234f383339bdc3713b7d793006302d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:19 -0700 Subject: [PATCH 0519/1562] hexagon: vm_fault: include asm/vm_fault.h for prototypes Clang warns: arch/hexagon/mm/vm_fault.c:157:6: warning: no previous prototype for function 'read_protection_fault' [-Wmissing-prototypes] 157 | void read_protection_fault(struct pt_regs *regs) | ^ arch/hexagon/mm/vm_fault.c:157:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 157 | void read_protection_fault(struct pt_regs *regs) | ^ | static arch/hexagon/mm/vm_fault.c:164:6: warning: no previous prototype for function 'write_protection_fault' [-Wmissing-prototypes] 164 | void write_protection_fault(struct pt_regs *regs) | ^ arch/hexagon/mm/vm_fault.c:164:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 164 | void write_protection_fault(struct pt_regs *regs) | ^ | static arch/hexagon/mm/vm_fault.c:171:6: warning: no previous prototype for function 'execute_protection_fault' [-Wmissing-prototypes] 171 | void execute_protection_fault(struct pt_regs *regs) | ^ arch/hexagon/mm/vm_fault.c:171:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 171 | void execute_protection_fault(struct pt_regs *regs) | ^ | static The prototypes for these functions are defined in asm/vm_fault.h, so include it to pick them up and silence the warnings. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-6-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/vm_fault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index ab0f0a791e00..3771fb453898 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include From 0ebac3e6151c283d39d24a6bbe43f0fe14149899 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:20 -0700 Subject: [PATCH 0520/1562] hexagon: vm_tlb: include asm/tlbflush.h for prototypes Clang warns about several missing prototypes that are declared in this header: arch/hexagon/mm/vm_tlb.c:25:6: warning: no previous prototype for function 'flush_tlb_range' [-Wmissing-prototypes] 25 | void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, | ^ arch/hexagon/mm/vm_tlb.c:25:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 25 | void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, | ^ | static arch/hexagon/mm/vm_tlb.c:37:6: warning: no previous prototype for function 'flush_tlb_one' [-Wmissing-prototypes] 37 | void flush_tlb_one(unsigned long vaddr) | ^ arch/hexagon/mm/vm_tlb.c:37:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 37 | void flush_tlb_one(unsigned long vaddr) | ^ | static arch/hexagon/mm/vm_tlb.c:47:6: warning: no previous prototype for function 'tlb_flush_all' [-Wmissing-prototypes] 47 | void tlb_flush_all(void) | ^ arch/hexagon/mm/vm_tlb.c:47:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 47 | void tlb_flush_all(void) | ^ | static arch/hexagon/mm/vm_tlb.c:56:6: warning: no previous prototype for function 'flush_tlb_mm' [-Wmissing-prototypes] 56 | void flush_tlb_mm(struct mm_struct *mm) | ^ arch/hexagon/mm/vm_tlb.c:56:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 56 | void flush_tlb_mm(struct mm_struct *mm) | ^ | static arch/hexagon/mm/vm_tlb.c:66:6: warning: no previous prototype for function 'flush_tlb_page' [-Wmissing-prototypes] 66 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long vaddr) | ^ arch/hexagon/mm/vm_tlb.c:66:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 66 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long vaddr) | ^ | static arch/hexagon/mm/vm_tlb.c:78:6: warning: no previous prototype for function 'flush_tlb_kernel_range' [-Wmissing-prototypes] 78 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | ^ arch/hexagon/mm/vm_tlb.c:78:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 78 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | ^ | static 6 warnings generated. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-7-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/mm/vm_tlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/mm/vm_tlb.c b/arch/hexagon/mm/vm_tlb.c index 53482f2a9ff9..8b6405e2234b 100644 --- a/arch/hexagon/mm/vm_tlb.c +++ b/arch/hexagon/mm/vm_tlb.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * Initial VM implementation has only one map active at a time, with From 3279333097b22e1bab750d0f40837a097ec765fd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:21 -0700 Subject: [PATCH 0521/1562] hexagon: time: include asm/time.h for prototypes Clang warns about missing prototypes that are declared in this header: arch/hexagon/kernel/time.c:118:6: warning: no previous prototype for function 'setup_percpu_clockdev' [-Wmissing-prototypes] 118 | void setup_percpu_clockdev(void) | ^ arch/hexagon/kernel/time.c:118:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 118 | void setup_percpu_clockdev(void) | ^ | static arch/hexagon/kernel/time.c:135:6: warning: no previous prototype for function 'ipi_timer' [-Wmissing-prototypes] 135 | void ipi_timer(void) | ^ arch/hexagon/kernel/time.c:135:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 135 | void ipi_timer(void) | ^ | static Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-8-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/time.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/time.c b/arch/hexagon/kernel/time.c index febc95714d75..59f00bf54fe9 100644 --- a/arch/hexagon/kernel/time.c +++ b/arch/hexagon/kernel/time.c @@ -18,6 +18,7 @@ #include #include +#include #define TIMER_ENABLE BIT(0) From 1f443caea93e76a9e4613d9e370e082354ae3b44 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:22 -0700 Subject: [PATCH 0522/1562] hexagon: time: mark time_init_deferred() as static Clang warns: arch/hexagon/kernel/time.c:163:13: warning: no previous prototype for function 'time_init_deferred' [-Wmissing-prototypes] 163 | void __init time_init_deferred(void) | ^ arch/hexagon/kernel/time.c:163:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 163 | void __init time_init_deferred(void) | ^ | static This function is not used outside of this translation unit so mark it as static to resolve the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-9-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/hexagon/kernel/time.c b/arch/hexagon/kernel/time.c index 59f00bf54fe9..f04dbe4f8a4c 100644 --- a/arch/hexagon/kernel/time.c +++ b/arch/hexagon/kernel/time.c @@ -161,7 +161,7 @@ static irqreturn_t timer_interrupt(int irq, void *devid) * This runs just before the delay loop is calibrated, and * is used for delay calibration. */ -void __init time_init_deferred(void) +static void __init time_init_deferred(void) { struct resource *resource = NULL; struct clock_event_device *ce_dev = &hexagon_clockevent_dev; From d068b1237e3204ec5d4f7ddcdde54aeef2a9c30b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:23 -0700 Subject: [PATCH 0523/1562] hexagon: time: include asm/delay.h for prototypes Clang warns about missing prototypes that are declared in this header: arch/hexagon/kernel/time.c:209:6: warning: no previous prototype for function '__delay' [-Wmissing-prototypes] 209 | void __delay(unsigned long cycles) | ^ arch/hexagon/kernel/time.c:209:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 209 | void __delay(unsigned long cycles) | ^ | static arch/hexagon/kernel/time.c:224:6: warning: no previous prototype for function '__udelay' [-Wmissing-prototypes] 224 | void __udelay(unsigned long usecs) | ^ arch/hexagon/kernel/time.c:224:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 224 | void __udelay(unsigned long usecs) | ^ | static Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-10-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/time.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/time.c b/arch/hexagon/kernel/time.c index f04dbe4f8a4c..f0f207e2a694 100644 --- a/arch/hexagon/kernel/time.c +++ b/arch/hexagon/kernel/time.c @@ -17,6 +17,7 @@ #include #include +#include #include #include From cb0085b0d694ab1269ac0c52464a48111c47161e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:24 -0700 Subject: [PATCH 0524/1562] hexagon: signal: switch to SYSCALL_DEFINE0 for sys_rt_sigreturn() Clang warns: arch/hexagon/kernel/signal.c:223:16: warning: no previous prototype for function 'sys_rt_sigreturn' [-Wmissing-prototypes] 223 | asmlinkage int sys_rt_sigreturn(void) | ^ arch/hexagon/kernel/signal.c:223:12: note: declare 'static' if the function is not intended to be used outside of this translation unit 223 | asmlinkage int sys_rt_sigreturn(void) | ^ | static 1 warning generated. Switch to the SYSCALL_DEFINE0() macro, which automatically declares a prototype. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-11-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index bcba31e9e0ae..d301f4621553 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -220,7 +220,7 @@ no_restart: * Architecture-specific wrappers for signal-related system calls */ -asmlinkage int sys_rt_sigreturn(void) +SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; From 9e06373780bd946d4990f84765de4f9bc168ed82 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:25 -0700 Subject: [PATCH 0525/1562] hexagon: reset: include linux/reboot.h for prototypes Clang warns about missing prototypes that are declared in this header: arch/hexagon/kernel/reset.c:9:6: warning: no previous prototype for function 'machine_power_off' [-Wmissing-prototypes] 9 | void machine_power_off(void) | ^ arch/hexagon/kernel/reset.c:9:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 9 | void machine_power_off(void) | ^ | static arch/hexagon/kernel/reset.c:15:6: warning: no previous prototype for function 'machine_halt' [-Wmissing-prototypes] 15 | void machine_halt(void) | ^ arch/hexagon/kernel/reset.c:15:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 15 | void machine_halt(void) | ^ | static arch/hexagon/kernel/reset.c:19:6: warning: no previous prototype for function 'machine_restart' [-Wmissing-prototypes] 19 | void machine_restart(char *cmd) | ^ arch/hexagon/kernel/reset.c:19:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 19 | void machine_restart(char *cmd) | ^ | static 3 warnings generated. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-12-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/reset.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/reset.c b/arch/hexagon/kernel/reset.c index da36114d928f..efd70a8d2526 100644 --- a/arch/hexagon/kernel/reset.c +++ b/arch/hexagon/kernel/reset.c @@ -3,6 +3,7 @@ * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. */ +#include #include #include From b0f731229a255112bfc82dd81f997a5f3484249e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:26 -0700 Subject: [PATCH 0526/1562] hexagon: process: include linux/cpu.h for arch_cpu_idle() prototype Clang warns: arch/hexagon/kernel/process.c:43:6: warning: no previous prototype for function 'arch_cpu_idle' [-Wmissing-prototypes] 43 | void arch_cpu_idle(void) | ^ arch/hexagon/kernel/process.c:43:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 43 | void arch_cpu_idle(void) | ^ | static This prototype is declared in include/linux/cpu.h, include it in process.c to clear up the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-13-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index dd7f74ea2c20..51e37fc92857 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -5,6 +5,7 @@ * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. */ +#include #include #include #include From 54ba0eab469d594dd1ef432a8de48724ae119336 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:27 -0700 Subject: [PATCH 0527/1562] hexagon: process: add internal prototype for do_work_pending() Clang warns: arch/hexagon/kernel/process.c:155:5: warning: no previous prototype for function 'do_work_pending' [-Wmissing-prototypes] 155 | int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) | ^ arch/hexagon/kernel/process.c:155:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 155 | int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) | ^ | static This function is only referenced from assembly, so it does not technically need a prototype. Add one right above the definition anyways to clear up the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-14-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 51e37fc92857..2a77bfd75694 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -153,6 +153,7 @@ unsigned long __get_wchan(struct task_struct *p) * Returns 0 if there's no need to re-check for more work. */ +int do_work_pending(struct pt_regs *regs, u32 thread_info_flags); int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) { if (!(thread_info_flags & _TIF_WORK_MASK)) { From d75eb3344ef1888885fcace3d34f3aceafee9aae Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:28 -0700 Subject: [PATCH 0528/1562] hexagon: vdso: include asm/elf.h for arch_setup_additional_pages() prototype Clang warns: arch/hexagon/kernel/vdso.c:49:5: warning: no previous prototype for function 'arch_setup_additional_pages' [-Wmissing-prototypes] 49 | int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) | ^ arch/hexagon/kernel/vdso.c:49:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 49 | int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) | ^ | static 1 warning generated. Include the header that declares the prototype to clear up the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-15-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/vdso.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/vdso.c b/arch/hexagon/kernel/vdso.c index b70970ac809f..2e4872d62124 100644 --- a/arch/hexagon/kernel/vdso.c +++ b/arch/hexagon/kernel/vdso.c @@ -10,6 +10,7 @@ #include #include +#include #include static struct page *vdso_page; From d9f85d8be96900f659167a637686257e7176ce0e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:29 -0700 Subject: [PATCH 0529/1562] hexagon: vm_events: remove unused dummy_handler() Clang warns: arch/hexagon/kernel/vm_events.c:76:6: warning: no previous prototype for function 'dummy_handler' [-Wmissing-prototypes] 76 | void dummy_handler(struct pt_regs *regs) | ^ arch/hexagon/kernel/vm_events.c:76:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 76 | void dummy_handler(struct pt_regs *regs) | ^ | static This function appears to be entirely unused, so remove it. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-16-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/vm_events.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/hexagon/kernel/vm_events.c b/arch/hexagon/kernel/vm_events.c index 59ef72e4a4e5..2b881a89b206 100644 --- a/arch/hexagon/kernel/vm_events.c +++ b/arch/hexagon/kernel/vm_events.c @@ -73,13 +73,6 @@ void show_regs(struct pt_regs *regs) pt_psp(regs), pt_badva(regs), ints_enabled(regs)); } -void dummy_handler(struct pt_regs *regs) -{ - unsigned int elr = pt_elr(regs); - printk(KERN_ERR "Unimplemented handler; ELR=0x%08x\n", elr); -} - - void arch_do_IRQ(struct pt_regs *regs) { int irq = pt_cause(regs); From 2212acda71d93887418146f36d5dd90fb13a2610 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:30 -0700 Subject: [PATCH 0530/1562] hexagon: irq: add prototype for arch_do_IRQ() Clang warns: arch/hexagon/kernel/vm_events.c:83:6: warning: no previous prototype for function 'arch_do_IRQ' [-Wmissing-prototypes] 83 | void arch_do_IRQ(struct pt_regs *regs) | ^ arch/hexagon/kernel/vm_events.c:83:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 83 | void arch_do_IRQ(struct pt_regs *regs) | ^ | static This function is only called from assembly but the irq header is a reasonable place to put a prototype to silence the warning. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-17-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/irq.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/hexagon/include/asm/irq.h b/arch/hexagon/include/asm/irq.h index 1f7f1292f701..a60d26754caa 100644 --- a/arch/hexagon/include/asm/irq.h +++ b/arch/hexagon/include/asm/irq.h @@ -20,4 +20,7 @@ #include +struct pt_regs; +void arch_do_IRQ(struct pt_regs *); + #endif From d6b0180e6db1cc0699d9df8c8627aade0e2e1b80 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:31 -0700 Subject: [PATCH 0531/1562] hexagon: traps: remove sys_syscall() Clang warns: arch/hexagon/kernel/traps.c:335:6: warning: no previous prototype for function 'sys_syscall' [-Wmissing-prototypes] 335 | long sys_syscall(void) | ^ arch/hexagon/kernel/traps.c:335:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 335 | long sys_syscall(void) | ^ | static This function is not used anywhere, so remove it. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-18-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/traps.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 6447763ce5a9..3f6ff43cb514 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -331,13 +331,6 @@ void do_genex(struct pt_regs *regs) } } -/* Indirect system call dispatch */ -long sys_syscall(void) -{ - printk(KERN_ERR "sys_syscall invoked!\n"); - return -ENOSYS; -} - void do_trap0(struct pt_regs *regs) { syscall_fn syscall; From 2562a3aeaa71753dcb857c1fc121d7e76300e860 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 15:58:32 -0700 Subject: [PATCH 0532/1562] hexagon: traps: add internal prototypes for functions only called from asm Clang warns: arch/hexagon/kernel/traps.c:284:6: warning: no previous prototype for function 'do_genex' [-Wmissing-prototypes] 284 | void do_genex(struct pt_regs *regs) | ^ arch/hexagon/kernel/traps.c:284:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 284 | void do_genex(struct pt_regs *regs) | ^ | static arch/hexagon/kernel/traps.c:341:6: warning: no previous prototype for function 'do_trap0' [-Wmissing-prototypes] 341 | void do_trap0(struct pt_regs *regs) | ^ arch/hexagon/kernel/traps.c:341:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 341 | void do_trap0(struct pt_regs *regs) | ^ | static arch/hexagon/kernel/traps.c:418:6: warning: no previous prototype for function 'do_machcheck' [-Wmissing-prototypes] 418 | void do_machcheck(struct pt_regs *regs) | ^ arch/hexagon/kernel/traps.c:418:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 418 | void do_machcheck(struct pt_regs *regs) | ^ | static arch/hexagon/kernel/traps.c:428:6: warning: no previous prototype for function 'do_debug_exception' [-Wmissing-prototypes] 428 | void do_debug_exception(struct pt_regs *regs) | ^ arch/hexagon/kernel/traps.c:428:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 428 | void do_debug_exception(struct pt_regs *regs) | ^ | static These functions are only called from assembly or this translation unit, so just add prototypes right above the definitions to silence the warnings. Link: https://lkml.kernel.org/r/20231130-hexagon-missing-prototypes-v1-19-5c34714afe9e@kernel.org Signed-off-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/kernel/traps.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 3f6ff43cb514..75e062722d28 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -281,6 +281,7 @@ static void cache_error(struct pt_regs *regs) /* * General exception handler */ +void do_genex(struct pt_regs *regs); void do_genex(struct pt_regs *regs) { /* @@ -331,6 +332,7 @@ void do_genex(struct pt_regs *regs) } } +void do_trap0(struct pt_regs *regs); void do_trap0(struct pt_regs *regs) { syscall_fn syscall; @@ -408,6 +410,7 @@ void do_trap0(struct pt_regs *regs) /* * Machine check exception handler */ +void do_machcheck(struct pt_regs *regs); void do_machcheck(struct pt_regs *regs) { /* Halt and catch fire */ @@ -418,6 +421,7 @@ void do_machcheck(struct pt_regs *regs) * Treat this like the old 0xdb trap. */ +void do_debug_exception(struct pt_regs *regs); void do_debug_exception(struct pt_regs *regs) { regs->hvmer.vmest &= ~HVM_VMEST_CAUSE_MSK; From c0706cfc7a5e9ddef1949520059798e8aea4c7d3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 17:22:32 -0700 Subject: [PATCH 0533/1562] s390/dasd: remove dasd_stats_generic_show() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "s390: A couple of fixes for -Wmissing-prototypes". This series resolves a couple of -Wmissing-prototypes that I see in my builds of -next, even though the issues appear to be latent. This addresses issues which will be exposed by the later patch "Makefile.extrawarn: turn on missing-prototypes globally". This patch (of 2): With CONFIG_DASD_PROFILE=n, there is a warning that dasd_stats_generic_show() is missing a prototype: drivers/s390/block/dasd.c:1109:5: warning: no previous prototype for 'dasd_stats_generic_show' [-Wmissing-prototypes] 1109 | int dasd_stats_generic_show(struct seq_file *m, void *v) | ^~~~~~~~~~~~~~~~~~~~~~~ This function has been unused since its introduction in commit 4fa52aa7a82f ("[S390] dasd: add enhanced DASD statistics interface"), remove it to clear up the warning. Link: https://lkml.kernel.org/r/20231130-s390-missing-prototypes-v1-0-799d3cf07fb7@kernel.org Link: https://lkml.kernel.org/r/20231130-s390-missing-prototypes-v1-1-799d3cf07fb7@kernel.org Signed-off-by: Nathan Chancellor Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Jan Höppner Cc: Stefan Haberland Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- drivers/s390/block/dasd.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 833cfab7d877..7327e81352e9 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1106,12 +1106,6 @@ static void dasd_statistics_removeroot(void) return; } -int dasd_stats_generic_show(struct seq_file *m, void *v) -{ - seq_puts(m, "Statistics are not activated in this kernel\n"); - return 0; -} - static void dasd_profile_init(struct dasd_profile *profile, struct dentry *base_dentry) { From 78af7920d0eb7659a30dde3e3214b2b920f8fdf3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Nov 2023 17:22:33 -0700 Subject: [PATCH 0534/1562] s390/traps: only define is_valid_bugaddr() under CONFIG_GENERIC_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with -Wmissing-prototypes without CONFIG_GENERIC_BUG, there is a warning about a missing prototype for is_valid_bugaddr(): arch/s390/kernel/traps.c:46:5: warning: no previous prototype for 'is_valid_bugaddr' [-Wmissing-prototypes] 46 | int is_valid_bugaddr(unsigned long addr) | ^~~~~~~~~~~~~~~~ The prototype is only declared with CONFIG_GENERIC_BUG, so only define the function under the same condition to clear up the warning, which matches other architectures. Link: https://lkml.kernel.org/r/20231130-s390-missing-prototypes-v1-2-799d3cf07fb7@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Alexander Gordeev Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Jan Höppner Cc: Stefan Haberland Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/s390/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 1d2aa448d103..cc3e3a01dfa5 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -43,10 +43,12 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) return (void __user *) (address - (regs->int_code >> 16)); } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return 1; } +#endif void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { From 1b5e6f4ec0870cc11250f627d26ad939d22cc2f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:51 +0100 Subject: [PATCH 0535/1562] mips: decompress: fix add missing prototypes Patch series "mips: address -Wmissing-prototypes warnings". Address the -Wmissing-prototypes warnings that showed up in mips as the last major architecture after my patch to enable the option everywhere. This patch (of 20): The mips decompressor has some string functions defined locally that are not declared in the right place: arch/mips/boot/compressed/dbg.c:12:13: error: no previous prototype for 'putc' [-Werror=missing-prototypes] arch/mips/boot/compressed/dbg.c:16:6: error: no previous prototype for 'puts' [-Werror=missing-prototypes] arch/mips/boot/compressed/dbg.c:26:6: error: no previous prototype for 'puthex' [-Werror=missing-prototypes] arch/mips/boot/compressed/string.c:11:7: error: no previous prototype for 'memcpy' [-Werror=missing-prototypes] arch/mips/boot/compressed/string.c:22:7: error: no previous prototype for 'memset' [-Werror=missing-prototypes] arch/mips/boot/compressed/string.c:32:15: error: no previous prototype for 'memmove' [-Werror=missing-prototypes] arch/mips/boot/compressed/decompress.c:43:6: error: no previous prototype for 'error' [-Werror=missing-prototypes] arch/mips/boot/compressed/decompress.c:91:6: error: no previous prototype for 'decompress_kernel' [-Werror=missing-prototypes] Include the string.h header where needed and add a decompress.h header to have shared prototypes for the rest. Link: https://lkml.kernel.org/r/20231204115710.2247097-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20231204115710.2247097-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/boot/compressed/dbg.c | 2 ++ arch/mips/boot/compressed/decompress.c | 16 ++-------------- arch/mips/boot/compressed/decompress.h | 24 ++++++++++++++++++++++++ arch/mips/boot/compressed/string.c | 1 + 4 files changed, 29 insertions(+), 14 deletions(-) create mode 100644 arch/mips/boot/compressed/decompress.h diff --git a/arch/mips/boot/compressed/dbg.c b/arch/mips/boot/compressed/dbg.c index f6728a8fd1c3..2f1ac38fe1cc 100644 --- a/arch/mips/boot/compressed/dbg.c +++ b/arch/mips/boot/compressed/dbg.c @@ -9,6 +9,8 @@ #include #include +#include "decompress.h" + void __weak putc(char c) { } diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c index c5dd415254d3..adb6d5b0e6eb 100644 --- a/arch/mips/boot/compressed/decompress.c +++ b/arch/mips/boot/compressed/decompress.c @@ -19,6 +19,8 @@ #include #include +#include "decompress.h" + /* * These two variables specify the free mem region * that can be used for temporary malloc area @@ -26,20 +28,6 @@ unsigned long free_mem_ptr; unsigned long free_mem_end_ptr; -/* The linker tells us where the image is. */ -extern unsigned char __image_begin[], __image_end[]; - -/* debug interfaces */ -#ifdef CONFIG_DEBUG_ZBOOT -extern void puts(const char *s); -extern void puthex(unsigned long long val); -#else -#define puts(s) do {} while (0) -#define puthex(val) do {} while (0) -#endif - -extern char __appended_dtb[]; - void error(char *x) { puts("\n\n"); diff --git a/arch/mips/boot/compressed/decompress.h b/arch/mips/boot/compressed/decompress.h new file mode 100644 index 000000000000..073b64593b3d --- /dev/null +++ b/arch/mips/boot/compressed/decompress.h @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _DECOMPRESSOR_H +#define _DECOMPRESSOR_H + +/* The linker tells us where the image is. */ +extern unsigned char __image_begin[], __image_end[]; + +/* debug interfaces */ +#ifdef CONFIG_DEBUG_ZBOOT +extern void putc(char c); +extern void puts(const char *s); +extern void puthex(unsigned long long val); +#else +#define putc(s) do {} while (0) +#define puts(s) do {} while (0) +#define puthex(val) do {} while (0) +#endif + +extern char __appended_dtb[]; + +void error(char *x); +void decompress_kernel(unsigned long boot_heap_start); + +#endif diff --git a/arch/mips/boot/compressed/string.c b/arch/mips/boot/compressed/string.c index 0b593b709228..f0eb251e44e5 100644 --- a/arch/mips/boot/compressed/string.c +++ b/arch/mips/boot/compressed/string.c @@ -7,6 +7,7 @@ #include #include +#include void *memcpy(void *dest, const void *src, size_t n) { From be018aaa158ad5155f21a85faf3865cb0a379d09 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:52 +0100 Subject: [PATCH 0536/1562] mips: add asm/syscalls.h header System call prototypes are generally in linux/syscalls.h, but there are a couple of mips specific entry points that are missing there: arch/mips/kernel/signal.c:636:17: error: no previous prototype for 'sys_sigreturn' [-Werror=missing-prototypes] arch/mips/kernel/signal.c:673:17: error: no previous prototype for 'sys_rt_sigreturn' [-Werror=missing-prototypes] arch/mips/kernel/syscall.c:51:16: error: no previous prototype for 'sysm_pipe' [-Werror=missing-prototypes] arch/mips/kernel/mips-mt-fpaff.c:65:17: error: no previous prototype for 'mipsmt_sys_sched_setaffinity' [-Werror=missing-prototypes] arch/mips/kernel/mips-mt-fpaff.c:157:17: error: no previous prototype for 'mipsmt_sys_sched_getaffinity' [-Werror=missing-prototypes] Add these to a new asm/syscalls.h as we have in other architectures. Link: https://lkml.kernel.org/r/20231204115710.2247097-3-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/syscalls.h | 33 ++++++++++++++++++++++++++++++++ arch/mips/kernel/linux32.c | 1 + arch/mips/kernel/mips-mt-fpaff.c | 1 + arch/mips/kernel/signal.c | 1 + arch/mips/kernel/signal32.c | 1 + arch/mips/kernel/signal_n32.c | 1 + arch/mips/kernel/signal_o32.c | 1 + arch/mips/kernel/syscall.c | 1 + 8 files changed, 40 insertions(+) create mode 100644 arch/mips/include/asm/syscalls.h diff --git a/arch/mips/include/asm/syscalls.h b/arch/mips/include/asm/syscalls.h new file mode 100644 index 000000000000..59f9c0c9fa0a --- /dev/null +++ b/arch/mips/include/asm/syscalls.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_MIPS_SYSCALLS_H +#define _ASM_MIPS_SYSCALLS_H + +#include +#include + +asmlinkage void sys_sigreturn(void); +asmlinkage void sys_rt_sigreturn(void); +asmlinkage int sysm_pipe(void); +asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr); +asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr); +asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_a2, + unsigned offset_a3, unsigned len_a4, + unsigned len_a5); +asmlinkage long sys32_fadvise64_64(int fd, int __pad, + unsigned long a2, unsigned long a3, + unsigned long a4, unsigned long a5, + int flags); +asmlinkage ssize_t sys32_readahead(int fd, u32 pad0, u64 a2, u64 a3, + size_t count); +asmlinkage long sys32_sync_file_range(int fd, int __pad, + unsigned long a2, unsigned long a3, + unsigned long a4, unsigned long a5, + int flags); +asmlinkage void sys32_rt_sigreturn(void); +asmlinkage void sys32_sigreturn(void); +asmlinkage int sys32_sigsuspend(compat_sigset_t __user *uset); +asmlinkage void sysn32_rt_sigreturn(void); + +#endif diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 6b61be486303..a0c0a7a654e9 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef __MIPSEB__ #define merge_64(r1, r2) ((((r1) & 0xffffffffUL) << 32) + ((r2) & 0xffffffffUL)) diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 67e130d3f038..10172fc4f627 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * CPU mask used to set process affinity for MT VPEs/TCs with FPUs diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 479999b7f2de..ccbf580827f6 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "signal-common.h" diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 59b8965433c2..73081d4ee8c1 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "signal-common.h" diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index cfc77b69420a..ff2043d620ba 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "signal-common.h" diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c index 299a7a28ca33..4f0458459650 100644 --- a/arch/mips/kernel/signal_o32.c +++ b/arch/mips/kernel/signal_o32.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "signal-common.h" diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index ae93a607ddf7..1bfc34a2e5b3 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include /* From 09fc778e1b96539166e2745187310d266d2e4c29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:53 +0100 Subject: [PATCH 0537/1562] mips: add missing declarations for trap handlers These exception handlers are all called from assembly code, so they don't normally need a declaration, but without one we now get warnings: arch/mips/mm/fault.c:323:17: error: no previous prototype for 'do_page_fault' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:447:17: error: no previous prototype for 'do_be' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:752:17: error: no previous prototype for 'do_ov' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:874:17: error: no previous prototype for 'do_fpe' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1027:17: error: no previous prototype for 'do_bp' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1114:17: error: no previous prototype for 'do_tr' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1151:17: error: no previous prototype for 'do_ri' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1402:17: error: no previous prototype for 'do_cpu' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1507:17: error: no previous prototype for 'do_msa_fpe' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1527:17: error: no previous prototype for 'do_msa' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1548:17: error: no previous prototype for 'do_mdmx' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1560:17: error: no previous prototype for 'do_watch' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1587:17: error: no previous prototype for 'do_mcheck' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1612:17: error: no previous prototype for 'do_mt' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1648:17: error: no previous prototype for 'do_dsp' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1656:17: error: no previous prototype for 'do_reserved' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1832:17: error: no previous prototype for 'cache_parity_error' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1880:17: error: no previous prototype for 'do_ftlb' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1909:17: error: no previous prototype for 'do_gsexc' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1944:6: error: no previous prototype for 'ejtag_exception_handler' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:1989:17: error: no previous prototype for 'nmi_exception_handler' [-Werror=missing-prototypes] arch/mips/kernel/unaligned.c:1516:17: error: no previous prototype for 'do_ade' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-4-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/traps.h | 24 ++++++++++++++++++++++++ arch/mips/kernel/r4k-bugs64.c | 1 + arch/mips/kernel/unaligned.c | 1 + arch/mips/mm/fault.c | 1 + 4 files changed, 27 insertions(+) diff --git a/arch/mips/include/asm/traps.h b/arch/mips/include/asm/traps.h index 15cde638b407..2c2b26f1e464 100644 --- a/arch/mips/include/asm/traps.h +++ b/arch/mips/include/asm/traps.h @@ -39,4 +39,28 @@ extern char except_vec_nmi[]; register_nmi_notifier(&fn##_nb); \ }) +asmlinkage void do_ade(struct pt_regs *regs); +asmlinkage void do_be(struct pt_regs *regs); +asmlinkage void do_ov(struct pt_regs *regs); +asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31); +asmlinkage void do_bp(struct pt_regs *regs); +asmlinkage void do_tr(struct pt_regs *regs); +asmlinkage void do_ri(struct pt_regs *regs); +asmlinkage void do_cpu(struct pt_regs *regs); +asmlinkage void do_msa_fpe(struct pt_regs *regs, unsigned int msacsr); +asmlinkage void do_msa(struct pt_regs *regs); +asmlinkage void do_mdmx(struct pt_regs *regs); +asmlinkage void do_watch(struct pt_regs *regs); +asmlinkage void do_mcheck(struct pt_regs *regs); +asmlinkage void do_mt(struct pt_regs *regs); +asmlinkage void do_dsp(struct pt_regs *regs); +asmlinkage void do_reserved(struct pt_regs *regs); +asmlinkage void do_ftlb(void); +asmlinkage void do_gsexc(struct pt_regs *regs, u32 diag1); +asmlinkage void do_daddi_ov(struct pt_regs *regs); + +asmlinkage void cache_parity_error(void); +asmlinkage void ejtag_exception_handler(struct pt_regs *regs); +asmlinkage void __noreturn nmi_exception_handler(struct pt_regs *regs); + #endif /* _ASM_TRAPS_H */ diff --git a/arch/mips/kernel/r4k-bugs64.c b/arch/mips/kernel/r4k-bugs64.c index 6ffefb2c6971..1e300330078d 100644 --- a/arch/mips/kernel/r4k-bugs64.c +++ b/arch/mips/kernel/r4k-bugs64.c @@ -14,6 +14,7 @@ #include #include #include +#include static char bug64hit[] __initdata = "reliable operation impossible!\n%s"; diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index f4cf94e92ec3..db652c99b72e 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include "access-helper.h" diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index d7878208bd3f..aaa9a242ebba 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -26,6 +26,7 @@ #include #include #include /* For VMALLOC_END */ +#include #include int show_unhandled_signals = 1; From 2657bc63d34ec0ca40244e7d1cfa78c742d905f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:54 +0100 Subject: [PATCH 0538/1562] mips: rs870e: stop exporting local functions These four functions are exported, but don't have any users, and no prototypes, which now causes warnings: drivers/platform/mips/rs780e-acpi.c:35:6: error: no previous prototype for 'pm_iowrite' [-Werror=missing-prototypes] drivers/platform/mips/rs780e-acpi.c:41:4: error: no previous prototype for 'pm_ioread' [-Werror=missing-prototypes] drivers/platform/mips/rs780e-acpi.c:47:6: error: no previous prototype for 'pm2_iowrite' [-Werror=missing-prototypes] drivers/platform/mips/rs780e-acpi.c:53:4: error: no previous prototype for 'pm2_ioread' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- drivers/platform/mips/rs780e-acpi.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/platform/mips/rs780e-acpi.c b/drivers/platform/mips/rs780e-acpi.c index bb0e8ae0eefd..5b8f9cc32589 100644 --- a/drivers/platform/mips/rs780e-acpi.c +++ b/drivers/platform/mips/rs780e-acpi.c @@ -32,29 +32,25 @@ static u8 pmio_read_index(u16 index, u8 reg) return inb(index + 1); } -void pm_iowrite(u8 reg, u8 value) +static void pm_iowrite(u8 reg, u8 value) { pmio_write_index(PM_INDEX, reg, value); } -EXPORT_SYMBOL(pm_iowrite); -u8 pm_ioread(u8 reg) +static u8 pm_ioread(u8 reg) { return pmio_read_index(PM_INDEX, reg); } -EXPORT_SYMBOL(pm_ioread); -void pm2_iowrite(u8 reg, u8 value) +static void pm2_iowrite(u8 reg, u8 value) { pmio_write_index(PM2_INDEX, reg, value); } -EXPORT_SYMBOL(pm2_iowrite); -u8 pm2_ioread(u8 reg) +static u8 pm2_ioread(u8 reg) { return pmio_read_index(PM2_INDEX, reg); } -EXPORT_SYMBOL(pm2_ioread); static void acpi_hw_clear_status(void) { From 2894a8c4bcdc41d72d9d6c080901101e48fd6195 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:55 +0100 Subject: [PATCH 0539/1562] mips: signal: move sigcontext declarations to header Function declarations should be in a shared header to ensure the prototypes match the definition: arch/mips/kernel/signal.c:439:5: error: no previous prototype for 'setup_sigcontext' [-Werror=missing-prototypes] arch/mips/kernel/signal.c:516:5: error: no previous prototype for 'restore_sigcontext' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/kernel/signal-common.h | 3 +++ arch/mips/kernel/signal_n32.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h index f50d48435c68..136eb20ac024 100644 --- a/arch/mips/kernel/signal-common.h +++ b/arch/mips/kernel/signal-common.h @@ -40,4 +40,7 @@ _restore_fp_context(void __user *fpregs, void __user *csr); extern asmlinkage int _save_msa_all_upper(void __user *buf); extern asmlinkage int _restore_msa_all_upper(void __user *buf); +extern int setup_sigcontext(struct pt_regs *, struct sigcontext __user *); +extern int restore_sigcontext(struct pt_regs *, struct sigcontext __user *); + #endif /* __SIGNAL_COMMON_H */ diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index ff2043d620ba..139d2596b0d4 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -33,9 +33,6 @@ */ #define __NR_N32_restart_syscall 6214 -extern int setup_sigcontext(struct pt_regs *, struct sigcontext __user *); -extern int restore_sigcontext(struct pt_regs *, struct sigcontext __user *); - struct ucontextn32 { u32 uc_flags; s32 uc_link; From 9a2036724cd693fef6c7609a856e56fa0d348be9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:56 +0100 Subject: [PATCH 0540/1562] mips: mark local function static if possible These two functions are global but have no extern prototypes or other callers, so it's best to mark them as static, avoiding these warnings: arch/mips/kernel/mips-cm.c:204:13: error: no previous prototype for '__mips_cm_l2sync_phys_base' [-Werror=missing-prototypes] arch/mips/mm/c-r4k.c:1827:12: error: no previous prototype for 'r4k_cache_init_pm' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/kernel/mips-cm.c | 2 +- arch/mips/mm/c-r4k.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c index 3f00788b0871..84b3affb9de8 100644 --- a/arch/mips/kernel/mips-cm.c +++ b/arch/mips/kernel/mips-cm.c @@ -201,7 +201,7 @@ phys_addr_t __mips_cm_phys_base(void) phys_addr_t mips_cm_phys_base(void) __attribute__((weak, alias("__mips_cm_phys_base"))); -phys_addr_t __mips_cm_l2sync_phys_base(void) +static phys_addr_t __mips_cm_l2sync_phys_base(void) { u32 base_reg; diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 187d1c16361c..0619e5296ff3 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1828,7 +1828,7 @@ static struct notifier_block r4k_cache_pm_notifier_block = { .notifier_call = r4k_cache_pm_notifier, }; -int __init r4k_cache_init_pm(void) +static int __init r4k_cache_init_pm(void) { return cpu_pm_register_notifier(&r4k_cache_pm_notifier_block); } From 6fb04df9b9b49891eebbdab6728f92091540e166 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:57 +0100 Subject: [PATCH 0541/1562] mips: move build_tlb_refill_handler() prototype Instead of having a declaration for each caller, have one that is shared with the function definition, which avoids a warning: arch/mips/mm/tlbex.c:2547:6: error: no previous prototype for 'build_tlb_refill_handler' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-8-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/tlbex.h | 1 + arch/mips/mm/tlb-r3k.c | 3 +-- arch/mips/mm/tlb-r4k.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/tlbex.h b/arch/mips/include/asm/tlbex.h index 6d97e23f30ab..24a2d06cc1c3 100644 --- a/arch/mips/include/asm/tlbex.h +++ b/arch/mips/include/asm/tlbex.h @@ -23,6 +23,7 @@ void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep); void build_tlb_write_entry(u32 **p, struct uasm_label **l, struct uasm_reloc **r, enum tlb_write_entry wmode); +void build_tlb_refill_handler(void); extern void handle_tlbl(void); extern char handle_tlbl_end[]; diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c index 53dfa2b9316b..1fb2cf8c8bfa 100644 --- a/arch/mips/mm/tlb-r3k.c +++ b/arch/mips/mm/tlb-r3k.c @@ -23,11 +23,10 @@ #include #include #include +#include #undef DEBUG_TLB -extern void build_tlb_refill_handler(void); - /* CP0 hazard avoidance. */ #define BARRIER \ __asm__ __volatile__( \ diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 93c2d695588a..a542b255019a 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -22,10 +22,9 @@ #include #include #include +#include #include -extern void build_tlb_refill_handler(void); - /* * LOONGSON-2 has a 4 entry itlb which is a subset of jtlb, LOONGSON-3 has * a 4 entry itlb and a 4 entry dtlb which are subsets of jtlb. Unfortunately, From ad6eb1ec6a590168b0f3add844bdf2b8bd422714 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:58 +0100 Subject: [PATCH 0542/1562] mips: move jump_label_apply_nops() declaration to header Instead of an extern declaration in the C file with the caller, move it to an appropriate header, avoiding arch/mips/kernel/jump_label.c:93:6: error: no previous prototype for 'jump_label_apply_nops' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-9-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/jump_label.h | 3 +++ arch/mips/kernel/module.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index c5c6864e64bc..081be98c71ef 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -15,6 +15,9 @@ #include #include +struct module; +extern void jump_label_apply_nops(struct module *mod); + #define JUMP_LABEL_NOP_SIZE 4 #ifdef CONFIG_64BIT diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 0c936cbf20c5..7b2fbaa9cac5 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -20,8 +20,7 @@ #include #include #include - -extern void jump_label_apply_nops(struct module *mod); +#include struct mips_hi16 { struct mips_hi16 *next; From e9f98feb17207addcd66435d8211ccf9c0a563dd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:56:59 +0100 Subject: [PATCH 0543/1562] mips: unhide uasm_in_compat_space_p() declaration uasm_in_compat_space_p() has a conditional declaration but is defined unconditionally because of another local user, which causes a warning: arch/mips/mm/uasm.c:421:5: error: no previous prototype for 'uasm_in_compat_space_p' [-Werror=missing-prototypes] Make the declaration unconditional to avoid this. Link: https://lkml.kernel.org/r/20231204115710.2247097-10-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/uasm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index 296bcf31abb5..b43bfd445252 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -193,9 +193,7 @@ struct uasm_label { void uasm_build_label(struct uasm_label **lab, u32 *addr, int lid); -#ifdef CONFIG_64BIT int uasm_in_compat_space_p(long addr); -#endif int uasm_rel_hi(long val); int uasm_rel_lo(long val); void UASM_i_LA_mostly(u32 **buf, unsigned int rs, long addr); From e021227afb5867738482c2dc0970191772cd0d4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:00 +0100 Subject: [PATCH 0544/1562] mips: fix setup_zero_pages() prototype setup_zero_pages() has a local declaration in a platform specific header, but that is not seen in the file it is defined in: arch/mips/mm/init.c:60:6: error: no previous prototype for 'setup_zero_pages' [-Werror=missing-prototypes] Move it to the corresponding global header and include that where needed. Link: https://lkml.kernel.org/r/20231204115710.2247097-11-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/mach-loongson64/mmzone.h | 1 - arch/mips/include/asm/mmzone.h | 2 ++ arch/mips/mm/init.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mach-loongson64/mmzone.h b/arch/mips/include/asm/mach-loongson64/mmzone.h index ebb1deaa77b9..a3d65d37b8b5 100644 --- a/arch/mips/include/asm/mach-loongson64/mmzone.h +++ b/arch/mips/include/asm/mach-loongson64/mmzone.h @@ -18,7 +18,6 @@ extern struct pglist_data *__node_data[]; #define NODE_DATA(n) (__node_data[n]) -extern void setup_zero_pages(void); extern void __init prom_init_numa_memory(void); #endif /* _ASM_MACH_MMZONE_H */ diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 602a21aee9d4..14226ea42036 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -20,4 +20,6 @@ #define nid_to_addrbase(nid) 0 #endif +extern void setup_zero_pages(void); + #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 5dcb525a8995..c2e0e5aebe90 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include From ec47b986e53e5617e9ed1c74fc3db04983a2d782 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:01 +0100 Subject: [PATCH 0545/1562] mips: fix tlb_init() prototype There are two definitions for tlb_init(), but no global declaration: arch/mips/mm/tlb-r4k.c:552:6: error: no previous prototype for 'tlb_init' [-Werror=missing-prototypes] arch/mips/mm/tlb-r3k.c:244:6: error: no previous prototype for 'tlb_init' [-Werror=missing-prototypes] Move the declaration to asm/setup.h and included it as needed. Link: https://lkml.kernel.org/r/20231204115710.2247097-12-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/setup.h | 1 + arch/mips/kernel/traps.c | 2 -- arch/mips/mm/tlb-r3k.c | 1 + arch/mips/mm/tlb-r4k.c | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h index 8c56b862fd9c..4dce41138bad 100644 --- a/arch/mips/include/asm/setup.h +++ b/arch/mips/include/asm/setup.h @@ -27,5 +27,6 @@ extern unsigned long ebase; extern unsigned int hwrena; extern void per_cpu_trap_init(bool); extern void cpu_cache_init(void); +extern void tlb_init(void); #endif /* __SETUP_H */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 246c6a6b0261..c58c0c3c5b40 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2157,8 +2157,6 @@ void *set_vi_handler(int n, vi_handler_t addr) return set_vi_srs_handler(n, addr, 0); } -extern void tlb_init(void); - /* * Timer interrupt */ diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c index 1fb2cf8c8bfa..f6db65410c65 100644 --- a/arch/mips/mm/tlb-r3k.c +++ b/arch/mips/mm/tlb-r3k.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #undef DEBUG_TLB diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index a542b255019a..44411b20c7ec 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * LOONGSON-2 has a 4 entry itlb which is a subset of jtlb, LOONGSON-3 has From 66445677f01effe40a6fc725021db6bfa1e47aad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:02 +0100 Subject: [PATCH 0546/1562] mips: move cache declarations into header Some of the cache functions are declared only for their callers, e.g. arch/mips/mm/c-r3k.c:28:15: error: no previous prototype for 'r3k_cache_size' [-Werror=missing-prototypes] arch/mips/mm/c-r3k.c:63:15: error: no previous prototype for 'r3k_cache_lsize' [-Werror=missing-prototypes] arch/mips/mm/c-r4k.c:1703:6: error: no previous prototype for 'r4k_cache_init' [-Werror=missing-prototypes] arch/mips/mm/sc-mips.c:255:5: error: no previous prototype for 'mips_sc_init' [-Werror=missing-prototypes] Move all the declarations to asm/cache.h and asm/r4kcache.h where they can be seen by the function definitions. Link: https://lkml.kernel.org/r/20231204115710.2247097-13-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/cache.h | 6 ++++++ arch/mips/include/asm/r4kcache.h | 4 ++++ arch/mips/kernel/cpu-probe.c | 1 - arch/mips/kernel/cpu-r3k-probe.c | 1 - arch/mips/mm/c-r4k.c | 4 ---- arch/mips/mm/cache.c | 6 ------ 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/cache.h b/arch/mips/include/asm/cache.h index 3424a7908c0f..8b08db3fb17a 100644 --- a/arch/mips/include/asm/cache.h +++ b/arch/mips/include/asm/cache.h @@ -17,5 +17,11 @@ #define __read_mostly __section(".data..read_mostly") extern void cache_noop(void); +extern void r3k_cache_init(void); +extern unsigned long r3k_cache_size(unsigned long); +extern unsigned long r3k_cache_lsize(unsigned long); +extern void r4k_cache_init(void); +extern void octeon_cache_init(void); +extern void au1x00_fixup_config_od(void); #endif /* _ASM_CACHE_H */ diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index 431a1c9d53fc..da1cd1bbdbc5 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -24,6 +24,10 @@ #include #include +extern void r5k_sc_init(void); +extern void rm7k_sc_init(void); +extern int mips_sc_init(void); + extern void (*r4k_blast_dcache)(void); extern void (*r4k_blast_icache)(void); diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index b406d8bfb15a..de7460c3a72e 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -179,7 +179,6 @@ void __init check_bugs32(void) static inline int cpu_has_confreg(void) { #ifdef CONFIG_CPU_R3000 - extern unsigned long r3k_cache_size(unsigned long); unsigned long size1, size2; unsigned long cfg = read_c0_conf(); diff --git a/arch/mips/kernel/cpu-r3k-probe.c b/arch/mips/kernel/cpu-r3k-probe.c index be93469c0e0e..0c826f729f75 100644 --- a/arch/mips/kernel/cpu-r3k-probe.c +++ b/arch/mips/kernel/cpu-r3k-probe.c @@ -42,7 +42,6 @@ void __init check_bugs32(void) static inline int cpu_has_confreg(void) { #ifdef CONFIG_CPU_R3000 - extern unsigned long r3k_cache_size(unsigned long); unsigned long size1, size2; unsigned long cfg = read_c0_conf(); diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 0619e5296ff3..b45bf026ee55 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1485,10 +1485,6 @@ static void loongson3_sc_init(void) return; } -extern int r5k_sc_init(void); -extern int rm7k_sc_init(void); -extern int mips_sc_init(void); - static void setup_scache(void) { struct cpuinfo_mips *c = ¤t_cpu_data; diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 7f830634dbe7..e5d19f4a38ba 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -206,19 +206,13 @@ static inline void setup_protection_map(void) void cpu_cache_init(void) { if (cpu_has_3k_cache) { - extern void __weak r3k_cache_init(void); - r3k_cache_init(); } if (cpu_has_4k_cache) { - extern void __weak r4k_cache_init(void); - r4k_cache_init(); } if (cpu_has_octeon_cache) { - extern void __weak octeon_cache_init(void); - octeon_cache_init(); } From 7dc5b89251840ff1071eb47ad3dd83ff2b24a4c6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:03 +0100 Subject: [PATCH 0547/1562] mips: add missing declarations These are three more functions that are only called from assembler and only need a declaration to avoid the -Wmissing-prototypes warnings: arch/mips/kernel/signal.c:904:17: error: no previous prototype for 'do_notify_resume' [-Werror=missing-prototypes] arch/mips/kernel/traps.c:370:6: error: no previous prototype for 'show_registers' [-Werror=missing-prototypes] arch/mips/kernel/smp.c:352:17: error: no previous prototype for 'start_secondary' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-14-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/processor.h | 2 ++ arch/mips/include/asm/signal.h | 1 + arch/mips/include/asm/smp.h | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index ae2cd37a38f0..ca7662cc65a7 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -402,4 +402,6 @@ extern int mips_set_process_fp_mode(struct task_struct *task, #define GET_FP_MODE(task) mips_get_process_fp_mode(task) #define SET_FP_MODE(task,value) mips_set_process_fp_mode(task, value) +void show_registers(struct pt_regs *regs); + #endif /* _ASM_PROCESSOR_H */ diff --git a/arch/mips/include/asm/signal.h b/arch/mips/include/asm/signal.h index 23d6b8015c79..8de81ccef7ad 100644 --- a/arch/mips/include/asm/signal.h +++ b/arch/mips/include/asm/signal.h @@ -31,5 +31,6 @@ extern struct mips_abi mips_abi_32; extern int protected_save_fp_context(void __user *sc); extern int protected_restore_fp_context(void __user *sc); +void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags); #endif /* _ASM_SIGNAL_H */ diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index a40d8c0e4b87..bfb1ec86fb84 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -63,6 +63,8 @@ extern asmlinkage void smp_bootstrap(void); extern void calculate_cpu_foreign_map(void); +asmlinkage void start_secondary(void); + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing From 858c638c2fafb404d16453ee09aa1099f858178d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:04 +0100 Subject: [PATCH 0548/1562] mips: spram: fix missing prototype warning for spram_config arch/mips/kernel/spram.c:194:6: error: no previous prototype for 'spram_config' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-15-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/spram.h | 2 +- arch/mips/kernel/spram.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/spram.h b/arch/mips/include/asm/spram.h index 373f2a5d495d..9f6a2cb1943a 100644 --- a/arch/mips/include/asm/spram.h +++ b/arch/mips/include/asm/spram.h @@ -3,7 +3,7 @@ #define _MIPS_SPRAM_H #if defined(CONFIG_MIPS_SPRAM) -extern __init void spram_config(void); +extern void spram_config(void); #else static inline void spram_config(void) { } #endif /* CONFIG_MIPS_SPRAM */ diff --git a/arch/mips/kernel/spram.c b/arch/mips/kernel/spram.c index d5d96214cce5..71c7e5e27567 100644 --- a/arch/mips/kernel/spram.c +++ b/arch/mips/kernel/spram.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * These definitions are correct for the 24K/34K/74K SPRAM sample From 4666cf018a26d89f2ee1ad6023227caa37f1a799 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:05 +0100 Subject: [PATCH 0549/1562] mips: mt: include asm/mips_mt.h These two functions have a global prototype but the header is not included before the function definitions: arch/mips/kernel/mips-mt.c:50:6: error: no previous prototype for 'mips_mt_regdump' [-Werror=missing-prototypes] arch/mips/kernel/mips-mt.c:159:6: error: no previous prototype for 'mips_mt_set_cpuoptions' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-16-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/kernel/mips-mt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index f88b7919f11f..c07d64438b5b 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -19,6 +19,7 @@ #include #include #include +#include int vpelimit; From a3075dcb1757e641321290ea62197d2f8b185c45 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:07 +0100 Subject: [PATCH 0550/1562] mips: suspend: include linux/suspend.h as needed A couple of functions are defined by the architecture and declared in linux/suspend.h, but mips is lacking the corresponding #include statement before the definition: arch/mips/power/cpu.c:16:6: warning: no previous prototype for 'save_processor_state' [-Wmissing-prototypes] arch/mips/power/cpu.c:26:6: warning: no previous prototype for 'restore_processor_state' [-Wmissing-prototypes] arch/mips/power/cpu.c:36:5: warning: no previous prototype for 'pfn_is_nosave' [-Wmissing-prototypes] arch/mips/power/hibernate.c:6:5: warning: no previous prototype for 'swsusp_arch_resume' [-Wmissing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-18-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/power/cpu.c | 1 + arch/mips/power/hibernate.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/mips/power/cpu.c b/arch/mips/power/cpu.c index a15e29dfc7b3..d8ef7778e535 100644 --- a/arch/mips/power/cpu.c +++ b/arch/mips/power/cpu.c @@ -6,6 +6,7 @@ * Author: Hu Hongbing * Wu Zhangjin */ +#include #include #include #include diff --git a/arch/mips/power/hibernate.c b/arch/mips/power/hibernate.c index 94ab17c3c49d..192879e76c85 100644 --- a/arch/mips/power/hibernate.c +++ b/arch/mips/power/hibernate.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include extern int restore_image(void); From b4fc7a3c37c3cd053f3ccf0553d7837e9d3e810b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:08 +0100 Subject: [PATCH 0551/1562] mips: hide conditionally unused functions A couple of functions are defined unconditionally but have a conditional declaration: arch/mips/mm/tlb-r4k.c:461:12: error: no previous prototype for 'add_temporary_entry' [-Werror=missing-prototypes] arch/mips/mm/pgtable-64.c:92:7: error: no previous prototype for 'mk_pmd' [-Werror=missing-prototypes] arch/mips/mm/pgtable-64.c:101:6: error: no previous prototype for 'set_pmd_at' [-Werror=missing-prototypes] Since there are no callers in these configurations, add the same #ifdef checks around the definitions. Link: https://lkml.kernel.org/r/20231204115710.2247097-19-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/mm/pgtable-64.c | 2 ++ arch/mips/mm/tlb-r4k.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index c76d21f7dffb..1e544827dea9 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -89,6 +89,7 @@ void pud_init(void *addr) } #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; @@ -103,6 +104,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, { *pmdp = pmd; } +#endif void __init pagetable_init(void) { diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 44411b20c7ec..7e2a0011a6fb 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -458,6 +458,7 @@ EXPORT_SYMBOL(has_transparent_hugepage); int temp_tlb_entry; +#ifndef CONFIG_64BIT __init int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, unsigned long entryhi, unsigned long pagemask) { @@ -496,6 +497,7 @@ out: local_irq_restore(flags); return ret; } +#endif static int ntlb; static int __init set_ntlb(char *str) From d1f4b2b875e49dbbc9114bf340ee6871a892d014 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:09 +0100 Subject: [PATCH 0552/1562] mips: smp: fix setup_profiling_timer() prototype The function is unconditionally defined in smp.c but is conditionally declared in a header that is not included here. arch/mips/kernel/smp.c:473:5: error: no previous prototype for 'setup_profiling_timer' [-Werror=missing-prototypes] Add the missing #include and #ifdef to match the declaration. Link: https://lkml.kernel.org/r/20231204115710.2247097-20-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 8fbef537fb88..774e4dcd86d2 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -468,11 +469,13 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) return 0; } +#ifdef CONFIG_PROFILING /* Not really SMP stuff ... */ int setup_profiling_timer(unsigned int multiplier) { return 0; } +#endif static void flush_tlb_all_ipi(void *info) { From 430b6ac059399828ca864979ad3685a3511c4984 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 12:57:10 +0100 Subject: [PATCH 0553/1562] mips: kexec: include linux/reboot.h Two functions are provided for kexec, but the mips implementation is missing the corresponding #include statment: arch/mips/kernel/machine_kexec.c:136:1: error: no previous prototype for 'machine_shutdown' [-Werror=missing-prototypes] arch/mips/kernel/machine_kexec.c:152:1: error: no previous prototype for 'machine_crash_shutdown' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231204115710.2247097-21-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/kernel/machine_kexec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/machine_kexec.c b/arch/mips/kernel/machine_kexec.c index 432bfd3e7f22..4e3579bbd620 100644 --- a/arch/mips/kernel/machine_kexec.c +++ b/arch/mips/kernel/machine_kexec.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include From fd6f52e3fa9b681ec3bee79e3a0935b22082cf64 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:01 +0100 Subject: [PATCH 0554/1562] ida: make 'ida_dump' static Patch series "Treewide: enable -Wmissing-prototypes", v3. At this point, there are five architectures with a number of known regressions: alpha, nios2, mips, sh and sparc. In the previous version of this patch, I had turned off the missing prototype warnings for the 15 architectures that still had issues, but since there are only five left, I think we can leave the rest to the maintainers (Cc'd here) as well. The series is also likely to cause occasional build regressions on linux-next as developers add new code that misses prototypes. Hopefully this should be resolved by the time the patches make it into a release and everyone gets the warnings right away. This patch (of 6): There is no global declaration for ida_dump() and no other callers, so make it static to avoid this warning: lib/test_ida.c:16:6: error: no previous prototype for 'ida_dump' Link: https://lkml.kernel.org/r/20231123110506.707903-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20231123110506.707903-2-arnd@kernel.org Fixes: 8ab8ba38d488 ("ida: Start new test_ida module") Signed-off-by: Arnd Bergmann Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Kees Cook Cc: Palmer Dabbelt Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- lib/test_ida.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_ida.c b/lib/test_ida.c index b06880625961..f946c80ced8b 100644 --- a/lib/test_ida.c +++ b/lib/test_ida.c @@ -13,7 +13,7 @@ static unsigned int tests_run; static unsigned int tests_passed; #ifdef __KERNEL__ -void ida_dump(struct ida *ida) { } +static void ida_dump(struct ida *ida) { } #endif #define IDA_BUG_ON(ida, x) do { \ tests_run++; \ From a9a6c365f3edfd5b9e50f182171bcf96c8bedcbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:02 +0100 Subject: [PATCH 0555/1562] jffs2: mark __jffs2_dbg_superblock_counts() static This function is only called locally and does not need to be global. Since there is no external prototype, gcc warns about the non-static definition: fs/jffs2/debug.c:160:6: error: no previous prototype for '__jffs2_dbg_superblock_counts' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231123110506.707903-3-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Tudor Ambarus Reviewed-by: Zhihao Cheng Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- fs/jffs2/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index 9d26b1b9fc01..0925caab23c4 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -157,7 +157,7 @@ __jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, kfree(buf); } -void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +static void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) { struct jffs2_eraseblock *jeb; uint32_t free = 0, dirty = 0, used = 0, wasted = 0, From b1c3efe07987592c16d5f59ce235e6ddbea65a73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:03 +0100 Subject: [PATCH 0556/1562] sched: fair: move unused stub functions to header These four functions have a normal definition for CONFIG_FAIR_GROUP_SCHED, and empty one that is only referenced when FAIR_GROUP_SCHED is disabled but CGROUP_SCHED is still enabled. If both are turned off, the functions are still defined but the misisng prototype causes a W=1 warning: kernel/sched/fair.c:12544:6: error: no previous prototype for 'free_fair_sched_group' kernel/sched/fair.c:12546:5: error: no previous prototype for 'alloc_fair_sched_group' kernel/sched/fair.c:12553:6: error: no previous prototype for 'online_fair_sched_group' kernel/sched/fair.c:12555:6: error: no previous prototype for 'unregister_fair_sched_group' Move the alternatives into the header as static inline functions with the correct combination of #ifdef checks to avoid the warning without adding even more complexity. [A different patch with the same description got applied by accident and was later reverted, but the original patch is still missing] Link: https://lkml.kernel.org/r/20231123110506.707903-4-arnd@kernel.org Fixes: 7aa55f2a5902 ("sched/fair: Move unused stub functions to header") Signed-off-by: Arnd Bergmann Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 13 ------------- kernel/sched/sched.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..835a40eac462 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13036,19 +13036,6 @@ next_cpu: return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void online_fair_sched_group(struct task_group *tg) { } - -void unregister_fair_sched_group(struct task_group *tg) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..8f5df5250b8d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -436,10 +436,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_FAIR_GROUP_SCHED extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); +#else +static inline void free_fair_sched_group(struct task_group *tg) { } +static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +static inline void online_fair_sched_group(struct task_group *tg) { } +static inline void unregister_fair_sched_group(struct task_group *tg) { } +#endif + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); From 9fcba2e95980704cdca56892481f76a92621095d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:04 +0100 Subject: [PATCH 0557/1562] x86: sta2x11: include header for sta2x11_get_instance() prototype sta2x11_get_instance() is a global function declared in asm/sta2x11.h, but this header is not included before the definition, causing a warning: arch/x86/pci/sta2x11-fixup.c:95:26: error: no previous prototype for 'sta2x11_get_instance' [-Werror=missing-prototypes] Add the missing #include. Link: https://lkml.kernel.org/r/20231123110506.707903-5-arnd@kernel.org Fixes: 83125a3a189e ("x86, platform: Initial support for sta2x11 I/O hub") Signed-off-by: Arnd Bergmann Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- arch/x86/pci/sta2x11-fixup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 7368afc03998..8c8ddc4dcc08 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -14,6 +14,7 @@ #include #include #include +#include #define STA2X11_SWIOTLB_SIZE (4*1024*1024) From 0025aa93d70278e8a13c07c0b308630575f4e805 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:05 +0100 Subject: [PATCH 0558/1562] usb: fsl-mph-dr-of: mark fsl_usb2_mpc5121_init() static This function is only called locally and should always have been static: drivers/usb/host/fsl-mph-dr-of.c:291:5: error: no previous prototype for 'fsl_usb2_mpc5121_init' [-Werror=missing-prototypes] Link: https://lkml.kernel.org/r/20231123110506.707903-6-arnd@kernel.org Fixes: 230f7ede6c2f ("USB: add USB EHCI support for MPC5121 SoC") Acked-by: Greg Kroah-Hartman Signed-off-by: Arnd Bergmann Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- drivers/usb/host/fsl-mph-dr-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c index 8508d37a2aff..6cdc3d805c32 100644 --- a/drivers/usb/host/fsl-mph-dr-of.c +++ b/drivers/usb/host/fsl-mph-dr-of.c @@ -288,7 +288,7 @@ static void fsl_usb2_mph_dr_of_remove(struct platform_device *ofdev) #define PHYCTRL_LSFE (1 << 1) /* Line State Filter Enable */ #define PHYCTRL_PXE (1 << 0) /* PHY oscillator enable */ -int fsl_usb2_mpc5121_init(struct platform_device *pdev) +static int fsl_usb2_mpc5121_init(struct platform_device *pdev) { struct fsl_usb2_platform_data *pdata = dev_get_platdata(&pdev->dev); struct clk *clk; From bfc4372b86085ec947fdcef20cbe40c55066394f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 27 Nov 2023 13:08:57 +1100 Subject: [PATCH 0559/1562] powerpc: pmd_move_must_withdraw() is only needed for CONFIG_TRANSPARENT_HUGEPAGE This is required for the later patch "Makefile.extrawarn: turn on missing-prototypes globally". Link: https://lkml.kernel.org/r/20231127132809.45c2b398@canb.auug.org.au Signed-off-by: Stephen Rothwell Acked-by: Michael Ellerman Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index be229290a6a7..3438ab72c346 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -542,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, set_pte_at(vma->vm_mm, addr, ptep, pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd @@ -563,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } +#endif /* * Does the CPU support tlbie? From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 15 Nov 2023 21:00:27 +0800 Subject: [PATCH 0560/1562] resource: add walk_system_ram_res_rev() This function, being a variant of walk_system_ram_res() introduced in commit 8c86e70acead ("resource: provide new functions to walk through resources"), walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. It will be used in kexec_file code to load kernel, initrd etc when preparing kexec reboot. Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv Signed-off-by: AKASHI Takahiro Signed-off-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/ioport.h | 3 +++ kernel/resource.c | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 14f5cfabbbc8..db7fe25f3370 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -331,6 +331,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 866ef3663a0b..e8a244300e5b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include @@ -429,6 +431,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, func); } +/* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. + */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res, *rams; + int rams_size = 16, i; + unsigned long flags; + int ret = -1; + + /* create a list */ + rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); + if (!rams) + return ret; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + i = 0; + while ((start < end) && + (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { + if (i >= rams_size) { + /* re-alloc */ + struct resource *rams_new; + + rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), + (rams_size + 16) * sizeof(struct resource), + GFP_KERNEL); + if (!rams_new) + goto out; + + rams = rams_new; + rams_size += 16; + } + + rams[i].start = res.start; + rams[i++].end = res.end; + + start = res.end + 1; + } + + /* go reverse */ + for (i--; i >= 0; i--) { + ret = (*func)(&rams[i], arg); + if (ret) + break; + } + +out: + kvfree(rams); + return ret; +} + /* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. From b3ba234171cd0d58df0a13c262210ff8b5fd2830 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 14 Nov 2023 17:16:58 +0800 Subject: [PATCH 0561/1562] kexec_file: load kernel at top of system RAM if required Patch series "kexec_file: Load kernel at top of system RAM if required". Justification: ============== Kexec_load interface has been doing top down searching and loading kernel/initrd/purgtory etc to prepare for kexec reboot. In that way, the benefits are that it avoids to consume and fragment limited low memory which satisfy DMA buffer allocation and big chunk of continuous memory during system init; and avoids to stir with BIOS/FW reserved or occupied areas, or corner case handling/work around/quirk occupied areas when doing system init. By the way, the top-down searching and loading of kexec-ed kernel is done in user space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply ignored. It calls walk_system_ram_res() directly to go through all resources of System RAM bottom up, to find an available memory region, then call locate_mem_hole_callback() to allocate memory in that found memory region from top to down. This is not expected and inconsistent with kexec_load. Implementation =============== In patch 1, introduce a new function walk_system_ram_res_rev() which is a variant of walk_system_ram_res(), it walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. In patch 2, check if kexec_buf.top_down is 'true' in kexec_walk_resources(), if yes, call walk_system_ram_res_rev() to find memory region of system RAM from top to down to load kernel/initrd etc. Background information: ======================= And I ever tried this in the past in a different way, please see below link. In the post, I tried to adjust struct sibling linking code, replace the the singly linked list with list_head so that walk_system_ram_res_rev() can be implemented in a much easier way. Finally I failed. https://lore.kernel.org/all/20180718024944.577-4-bhe@redhat.com/ This time, I picked up the patch from AKASHI Takahiro's old post and made some change to take as the current patch 1: https://lists.infradead.org/pipermail/linux-arm-kernel/2017-September/531456.html This patch (of 2): Kexec_load interface has been doing top down searching and loading kernel/initrd/purgtory etc to prepare for kexec reboot. In that way, the benefits are that it avoids to consume and fragment limited low memory which satisfy DMA buffer allocation and big chunk of continuous memory during system init; and avoids to stir with BIOS/FW reserved or occupied areas, or corner case handling/work around/quirk occupied areas when doing system init. By the way, the top-down searching and loading of kexec-ed kernel is done in user space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply ignored. It calls walk_system_ram_res() directly to go through all resources of System RAM bottom up, to find an available memory region, then call locate_mem_hole_callback() to allocate memory in that found memory region from top to down. This is not expected and inconsistent with kexec_load. Here check if kexec_buf.top_down is 'true' in kexec_walk_resources(), if yes, call the newly added walk_system_ram_res_rev() to find memory region of system RAM from top to down to load kernel/initrd etc. Link: https://lkml.kernel.org/r/20231114091658.228030-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231114091658.228030-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: AKASHI Takahiro Cc: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9a419cd22d4..ba3ef30921b8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -592,6 +592,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); + else if (kbuf->top_down) + return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } From 9d02330abd3ecdacc43fc6b16a5668e7e94b7562 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 23 Nov 2023 16:40:22 +0800 Subject: [PATCH 0562/1562] softlockup: serialized softlockup's log If multiple CPUs trigger softlockup at the same time with 'softlockup_all_cpu_backtrace=0', the softlockup's logs will appear staggeredly in dmesg, which will affect the viewing of the logs for developer. Since the code path for outputting softlockup logs is not a kernel hotspot and the performance requirements for the code are not strict, locks are used to serialize the softlockup log output to improve the readability of the logs. Link: https://lkml.kernel.org/r/20231123084022.10302-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Reviewed-by: Petr Mladek Reviewed-by: Douglas Anderson Cc: Lecopzer Chen Cc: Pingfan Liu Cc: Zefan Li Cc: John Ogness Signed-off-by: Andrew Morton --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5cd6d4e26915..bf30a6fac665 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -448,6 +448,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + static DEFINE_SPINLOCK(watchdog_output_lock); if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -514,6 +515,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + spin_lock(&watchdog_output_lock); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -523,6 +525,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); + spin_unlock(&watchdog_output_lock); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); From 584db20c181f5e28c0386d7987406ace7fbd3e49 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 27 Nov 2023 23:30:20 +0900 Subject: [PATCH 0563/1562] nilfs2: move page release outside of nilfs_delete_entry and nilfs_set_link Patch series "nilfs2: Folio conversions for directory paths". This series applies page->folio conversions to nilfs2 directory operations. This reduces hidden compound_head() calls and also converts deprecated kmap calls to kmap_local in the directory code. Although nilfs2 does not yet support large folios, Matthew has done his best here to include support for large folios, which will be needed for devices with large block sizes. This series corresponds to the second half of the original post [1], but with two complementary patches inserted at the beginning and some adjustments, to prevent a kmap_local constraint violation found during testing with highmem mapping. [1] https://lkml.kernel.org/r/20231106173903.1734114-1-willy@infradead.org I have reviewed all changes and tested this for regular and small block sizes, both on machines with and without highmem mapping. No issues found. This patch (of 17): In a few directory operations, the call to nilfs_put_page() for a page obtained using nilfs_find_entry() or nilfs_dotdot() is hidden in nilfs_set_link() and nilfs_delete_entry(), making it difficult to track page release and preventing change of its call position. By moving nilfs_put_page() out of these functions, this makes the page get/put correspondence clearer and makes it easier to swap nilfs_put_page() calls (and kunmap calls within them) when modifying multiple directory entries simultaneously in nilfs_rename(). Also, update comments for nilfs_set_link() and nilfs_delete_entry() to reflect changes in their behavior. To make nilfs_put_page() visible from namei.c, this moves its definition to nilfs.h and replaces existing equivalents to use it, but the exposure of that definition is temporary and will be removed on a later kmap -> kmap_local conversion. Link: https://lkml.kernel.org/r/20231127143036.2425-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20231127143036.2425-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 11 +---------- fs/nilfs2/namei.c | 13 +++++++------ fs/nilfs2/nilfs.h | 6 ++++++ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index de2073c47651..b9f13bdf8fba 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -64,12 +64,6 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode) return inode->i_sb->s_blocksize; } -static inline void nilfs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. @@ -413,7 +407,6 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) return res; } -/* Releases the page */ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { @@ -428,7 +421,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); - nilfs_put_page(page); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -533,7 +525,7 @@ out_unlock: /* * nilfs_delete_entry deletes a directory entry by merging it with the - * previous entry. Page is up-to-date. Releases the page. + * previous entry. Page is up-to-date. */ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) { @@ -569,7 +561,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) nilfs_commit_chunk(page, mapping, from, to); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: - nilfs_put_page(page); return err; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 2a4e7f4a8102..99255694cbe9 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -280,6 +280,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); + nilfs_put_page(page); if (err) goto out; @@ -386,6 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (!new_de) goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_put_page(new_page); nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); if (dir_de) @@ -409,9 +411,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, inode_set_ctime_current(old_inode); nilfs_delete_entry(old_de, old_page); + nilfs_put_page(old_page); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + nilfs_put_page(dir_page); drop_nlink(old_dir); } nilfs_mark_inode_dirty(old_dir); @@ -421,13 +425,10 @@ static int nilfs_rename(struct mnt_idmap *idmap, return err; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + nilfs_put_page(dir_page); out_old: - kunmap(old_page); - put_page(old_page); + nilfs_put_page(old_page); out: nilfs_transaction_abort(old_dir->i_sb); return err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8046490cd7fe..afd700f5dc4e 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -237,6 +237,12 @@ extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, struct page *, struct inode *); +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + put_page(page); +} + /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); From 8cf57c6df818f58fdad16a909506be213623a88e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 27 Nov 2023 23:30:21 +0900 Subject: [PATCH 0564/1562] nilfs2: eliminate staggered calls to kunmap in nilfs_rename In nilfs_rename(), calls to nilfs_put_page() to release pages obtained with nilfs_find_entry() or nilfs_dotdot() are alternated in the normal path. When replacing the kernel memory mapping method from kmap to kmap_local_{page,folio}, this violates the constraint on the calling order of kunmap_local(). Swap the order of nilfs_put_page calls where the kmap sections of multiple pages overlap so that they are nested, allowing direct replacement of nilfs_put_page() -> unmap_and_put_page(). Without this reordering, that replacement will cause a kernel WARNING in kunmap_local_indexed() on architectures with high memory mapping. Link: https://lkml.kernel.org/r/20231127143036.2425-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/nilfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 99255694cbe9..d179db8074c2 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -411,13 +411,14 @@ static int nilfs_rename(struct mnt_idmap *idmap, inode_set_ctime_current(old_inode); nilfs_delete_entry(old_de, old_page); - nilfs_put_page(old_page); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); nilfs_put_page(dir_page); drop_nlink(old_dir); } + nilfs_put_page(old_page); + nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); From 6bb09fa1b44f8634e7091d6186bcba80edebfce7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:22 +0900 Subject: [PATCH 0565/1562] nilfs2: remove page_address() from nilfs_set_link In preparation for removing kmap from directory handling, use offset_in_page() to calculate 'from'. Matches ext2. Link: https://lkml.kernel.org/r/20231127143036.2425-4-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index b9f13bdf8fba..9c0513245a3b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -410,7 +410,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { - unsigned int from = (char *)de - (char *)page_address(page); + unsigned int from = offset_in_page(de); unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; From 2197f5aed404216ec8035bcf726ad808418fd691 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:23 +0900 Subject: [PATCH 0566/1562] nilfs2: remove page_address() from nilfs_add_link In preparation for removing kmap from directory handling, use offset_in_page() to calculate 'from'. Matches ext2. Link: https://lkml.kernel.org/r/20231127143036.2425-5-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 9c0513245a3b..73f135290288 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -493,7 +493,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) return -EINVAL; got_it: - from = (char *)de - (char *)page_address(page); + from = offset_in_page(de); to = from + rec_len; err = nilfs_prepare_chunk(page, from, to); if (err) From 6af2191f8358fa89061df70bf68a1fd616e49a06 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:24 +0900 Subject: [PATCH 0567/1562] nilfs2: remove page_address() from nilfs_delete_entry In preparation for removing kmap from directory handling, mask the directory entry pointer to discover the start address of the page. Matches ext2. Link: https://lkml.kernel.org/r/20231127143036.2425-6-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 73f135290288..385e47eda99f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -531,7 +531,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - char *kaddr = page_address(page); + char *kaddr = (char *)((unsigned long)dir & PAGE_MASK); unsigned int from, to; struct nilfs_dir_entry *de, *pde = NULL; int err; @@ -551,7 +551,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) de = nilfs_next_entry(de); } if (pde) - from = (char *)pde - (char *)page_address(page); + from = (char *)pde - kaddr; lock_page(page); err = nilfs_prepare_chunk(page, from, to); BUG_ON(err); From 09a46acb3697e50548bb265afa1d79163659dd85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:25 +0900 Subject: [PATCH 0568/1562] nilfs2: return the mapped address from nilfs_get_page() In prepartion for switching from kmap() to kmap_local(), return the kmap address from nilfs_get_page() instead of having the caller look up page_address(). [konishi.ryusuke: fixed a missing blank line after declaration] Link: https://lkml.kernel.org/r/20231127143036.2425-7-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 57 +++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 385e47eda99f..45f75d4c4522 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -180,19 +180,24 @@ fail: return false; } -static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +static void *nilfs_get_page(struct inode *dir, unsigned long n, + struct page **pagep) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); + void *kaddr; - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (!nilfs_check_page(page)) - goto fail; - } + if (IS_ERR(page)) + return page; + + kaddr = kmap(page); + if (unlikely(!PageChecked(page))) { + if (!nilfs_check_page(page)) + goto fail; } - return page; + + *pagep = page; + return kaddr; fail: nilfs_put_page(page); @@ -269,14 +274,14 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct nilfs_dir_entry *de; - struct page *page = nilfs_get_page(inode, n); + struct page *page; - if (IS_ERR(page)) { + kaddr = nilfs_get_page(inode, n, &page); + if (IS_ERR(kaddr)) { nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } - kaddr = page_address(page); de = (struct nilfs_dir_entry *)(kaddr + offset); limit = kaddr + nilfs_last_byte(inode, n) - NILFS_DIR_REC_LEN(1); @@ -339,11 +344,9 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, start = 0; n = start; do { - char *kaddr; + char *kaddr = nilfs_get_page(dir, n, &page); - page = nilfs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); + if (!IS_ERR(kaddr)) { de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { @@ -381,15 +384,11 @@ found: struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) { - struct page *page = nilfs_get_page(dir, 0); - struct nilfs_dir_entry *de = NULL; + struct nilfs_dir_entry *de = nilfs_get_page(dir, 0, p); - if (!IS_ERR(page)) { - de = nilfs_next_entry( - (struct nilfs_dir_entry *)page_address(page)); - *p = page; - } - return de; + if (IS_ERR(de)) + return NULL; + return nilfs_next_entry(de); } ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) @@ -451,12 +450,11 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *dir_end; - page = nilfs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) + kaddr = nilfs_get_page(dir, n, &page); + err = PTR_ERR(kaddr); + if (IS_ERR(kaddr)) goto out; lock_page(page); - kaddr = page_address(page); dir_end = kaddr + nilfs_last_byte(dir, n); de = (struct nilfs_dir_entry *)kaddr; kaddr += PAGE_SIZE - reclen; @@ -618,11 +616,10 @@ int nilfs_empty_dir(struct inode *inode) char *kaddr; struct nilfs_dir_entry *de; - page = nilfs_get_page(inode, i); - if (IS_ERR(page)) + kaddr = nilfs_get_page(inode, i, &page); + if (IS_ERR(kaddr)) continue; - kaddr = page_address(page); de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); From a8e610353bf94c279d0ca6d3711aa84728d80a46 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:26 +0900 Subject: [PATCH 0569/1562] nilfs2: pass the mapped address to nilfs_check_page() Remove another use of page_address() as part of preparing for the kmap to kmap_local transition. Link: https://lkml.kernel.org/r/20231127143036.2425-8-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 45f75d4c4522..01900e84bddf 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -107,12 +107,11 @@ static void nilfs_commit_chunk(struct page *page, unlock_page(page); } -static bool nilfs_check_page(struct page *page) +static bool nilfs_check_page(struct page *page, char *kaddr) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; unsigned int chunk_size = nilfs_chunk_size(dir); - char *kaddr = page_address(page); unsigned int offs, rec_len; unsigned int limit = PAGE_SIZE; struct nilfs_dir_entry *p; @@ -192,7 +191,7 @@ static void *nilfs_get_page(struct inode *dir, unsigned long n, kaddr = kmap(page); if (unlikely(!PageChecked(page))) { - if (!nilfs_check_page(page)) + if (!nilfs_check_page(page, kaddr)) goto fail; } From 9b77f66f992733069543638afe591f94e1d30291 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:27 +0900 Subject: [PATCH 0570/1562] nilfs2: switch to kmap_local for directory handling Match ext2 by using kmap_local() instead of kmap(). This is more efficient. Also use unmap_and_put_page() instead of duplicating it as a nilfs function. [konishi.ryusuke: followed the change of page release helper call sites] Link: https://lkml.kernel.org/r/20231127143036.2425-9-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 27 +++++++++++++-------------- fs/nilfs2/namei.c | 12 ++++++------ fs/nilfs2/nilfs.h | 6 ------ 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 01900e84bddf..89e8a248e571 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -189,7 +189,7 @@ static void *nilfs_get_page(struct inode *dir, unsigned long n, if (IS_ERR(page)) return page; - kaddr = kmap(page); + kaddr = kmap_local_page(page); if (unlikely(!PageChecked(page))) { if (!nilfs_check_page(page, kaddr)) goto fail; @@ -199,7 +199,7 @@ static void *nilfs_get_page(struct inode *dir, unsigned long n, return kaddr; fail: - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); return ERR_PTR(-EIO); } @@ -287,7 +287,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { nilfs_error(sb, "zero-length directory entry"); - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); return -EIO; } if (de->inode) { @@ -300,13 +300,13 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), t)) { - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); return 0; } } ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); } return 0; } @@ -352,14 +352,14 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); goto out; } if (nilfs_match(namelen, name, de)) goto found; de = nilfs_next_entry(de); } - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); } if (++n >= npages) n = 0; @@ -399,8 +399,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) de = nilfs_find_entry(dir, qstr, &page); if (de) { res = le64_to_cpu(de->inode); - kunmap(page); - put_page(page); + unmap_and_put_page(page, de); } return res; } @@ -484,7 +483,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) de = (struct nilfs_dir_entry *)((char *)de + rec_len); } unlock_page(page); - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; @@ -512,7 +511,7 @@ got_it: nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: - nilfs_put_page(page); + unmap_and_put_page(page, de); out: return err; out_unlock: @@ -609,10 +608,10 @@ fail: int nilfs_empty_dir(struct inode *inode) { struct page *page = NULL; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; struct nilfs_dir_entry *de; kaddr = nilfs_get_page(inode, i, &page); @@ -644,12 +643,12 @@ int nilfs_empty_dir(struct inode *inode) } de = nilfs_next_entry(de); } - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - nilfs_put_page(page); + unmap_and_put_page(page, kaddr); return 0; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index d179db8074c2..c08b1bf9fa7b 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -280,7 +280,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); - nilfs_put_page(page); + unmap_and_put_page(page, de); if (err) goto out; @@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (!new_de) goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); - nilfs_put_page(new_page); + unmap_and_put_page(new_page, new_de); nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); if (dir_de) @@ -414,10 +414,10 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); - nilfs_put_page(dir_page); + unmap_and_put_page(dir_page, dir_de); drop_nlink(old_dir); } - nilfs_put_page(old_page); + unmap_and_put_page(old_page, old_de); nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); @@ -427,9 +427,9 @@ static int nilfs_rename(struct mnt_idmap *idmap, out_dir: if (dir_de) - nilfs_put_page(dir_page); + unmap_and_put_page(dir_page, dir_de); out_old: - nilfs_put_page(old_page); + unmap_and_put_page(old_page, old_de); out: nilfs_transaction_abort(old_dir->i_sb); return err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index afd700f5dc4e..8046490cd7fe 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -237,12 +237,6 @@ extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, struct page *, struct inode *); -static inline void nilfs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); From 75ad5db662b24584bc640d043802bc194dab9014 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:28 +0900 Subject: [PATCH 0571/1562] nilfs2: add nilfs_get_folio() Convert nilfs_get_page() to be a wrapper. Also convert nilfs_check_page() to nilfs_check_folio(). Link: https://lkml.kernel.org/r/20231127143036.2425-10-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 67 ++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 89e8a248e571..5c9ca3f0d7f4 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -107,18 +107,18 @@ static void nilfs_commit_chunk(struct page *page, unlock_page(page); } -static bool nilfs_check_page(struct page *page, char *kaddr) +static bool nilfs_check_folio(struct folio *folio, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; unsigned int chunk_size = nilfs_chunk_size(dir); - unsigned int offs, rec_len; - unsigned int limit = PAGE_SIZE; + size_t offs, rec_len; + size_t limit = folio_size(folio); struct nilfs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = dir->i_size - folio_pos(folio); if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -140,7 +140,7 @@ static bool nilfs_check_page(struct page *page, char *kaddr) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ @@ -163,8 +163,8 @@ Espan: error = "directory entry across blocks"; bad_entry: nilfs_error(sb, - "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index << PAGE_SHIFT) + offs, + "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", + dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; @@ -172,35 +172,46 @@ Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); nilfs_error(sb, "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", - dir->i_ino, (page->index << PAGE_SHIFT) + offs, + dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: - SetPageError(page); + folio_set_error(folio); return false; } +static void *nilfs_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) +{ + struct address_space *mapping = dir->i_mapping; + struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *kaddr; + + if (IS_ERR(folio)) + return folio; + + kaddr = kmap_local_folio(folio, 0); + if (unlikely(!folio_test_checked(folio))) { + if (!nilfs_check_folio(folio, kaddr)) + goto fail; + } + + *foliop = folio; + return kaddr; + +fail: + folio_release_kmap(folio, kaddr); + return ERR_PTR(-EIO); +} + static void *nilfs_get_page(struct inode *dir, unsigned long n, struct page **pagep) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - void *kaddr; + struct folio *folio; + void *kaddr = nilfs_get_folio(dir, n, &folio); - if (IS_ERR(page)) - return page; - - kaddr = kmap_local_page(page); - if (unlikely(!PageChecked(page))) { - if (!nilfs_check_page(page, kaddr)) - goto fail; - } - - *pagep = page; + if (!IS_ERR(kaddr)) + *pagep = &folio->page; return kaddr; - -fail: - unmap_and_put_page(page, kaddr); - return ERR_PTR(-EIO); } /* From b37b2bec46bf11bbf20b3de22a45260292325cee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:29 +0900 Subject: [PATCH 0572/1562] nilfs2: convert nilfs_readdir to use a folio Use the new folio APIs to remove calls to compound_head(). Link: https://lkml.kernel.org/r/20231127143036.2425-11-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 5c9ca3f0d7f4..c7b046589877 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -284,9 +284,9 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct nilfs_dir_entry *de; - struct page *page; + struct folio *folio; - kaddr = nilfs_get_page(inode, n, &page); + kaddr = nilfs_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) { nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; @@ -298,7 +298,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { nilfs_error(sb, "zero-length directory entry"); - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return -EIO; } if (de->inode) { @@ -311,13 +311,13 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), t)) { - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } } ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } From a4bf041e44d571837d8c1d2da890aa0b65f76639 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:30 +0900 Subject: [PATCH 0573/1562] nilfs2: convert nilfs_find_entry to use a folio Use the new folio APIs to remove calls to compound_head(). [konishi.ryusuke: resolved a conflict due to style warning correction] Link: https://lkml.kernel.org/r/20231127143036.2425-12-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index c7b046589877..a79726182867 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -339,7 +339,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; + struct folio *folio = NULL; struct nilfs_inode_info *ei = NILFS_I(dir); struct nilfs_dir_entry *de; @@ -354,7 +354,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, start = 0; n = start; do { - char *kaddr = nilfs_get_page(dir, n, &page); + char *kaddr = nilfs_get_folio(dir, n, &folio); if (!IS_ERR(kaddr)) { de = (struct nilfs_dir_entry *)kaddr; @@ -363,18 +363,18 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); goto out; } if (nilfs_match(namelen, name, de)) goto found; de = nilfs_next_entry(de); } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } if (++n >= npages) n = 0; - /* next page is past the blocks we've got */ + /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, "dir %lu size %lld exceeds block count %llu", @@ -387,7 +387,7 @@ out: return NULL; found: - *res_page = page; + *res_page = &folio->page; ei->i_dir_start_lookup = n; return de; } From 6f133c97e5ced9a2adc983683684a06df27bb2c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:31 +0900 Subject: [PATCH 0574/1562] nilfs2: convert nilfs_rename() to use folios This involves converting nilfs_find_entry(), nilfs_dotdot(), nilfs_set_link(), nilfs_delete_entry() and nilfs_do_unlink() to use folios as well. [konishi.ryusuke: followed the change of page release helper call sites] Link: https://lkml.kernel.org/r/20231127143036.2425-13-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 68 ++++++++++++++++++++++------------------------- fs/nilfs2/namei.c | 36 ++++++++++++------------- fs/nilfs2/nilfs.h | 20 +++++++------- 3 files changed, 60 insertions(+), 64 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index a79726182867..2a759598801b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -323,38 +323,35 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) } /* - * nilfs_find_entry() + * nilfs_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the page in which the entry was found, and the entry itself - * (as a parameter - res_dir). Page is returned mapped and unlocked. - * Entry is guaranteed to be valid. + * Finds an entry in the specified directory with the wanted name. It + * returns the folio in which the entry was found, and the entry itself. + * The folio is mapped and unlocked. When the caller is finished with + * the entry, it should call folio_release_kmap(). + * + * On failure, returns NULL and the caller should ignore foliop. */ -struct nilfs_dir_entry * -nilfs_find_entry(struct inode *dir, const struct qstr *qstr, - struct page **res_page) +struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, + const struct qstr *qstr, struct folio **foliop) { const unsigned char *name = qstr->name; int namelen = qstr->len; unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct folio *folio = NULL; struct nilfs_inode_info *ei = NILFS_I(dir); struct nilfs_dir_entry *de; if (npages == 0) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ei->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr = nilfs_get_folio(dir, n, &folio); + char *kaddr = nilfs_get_folio(dir, n, foliop); if (!IS_ERR(kaddr)) { de = (struct nilfs_dir_entry *)kaddr; @@ -363,14 +360,14 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); - folio_release_kmap(folio, kaddr); + folio_release_kmap(*foliop, kaddr); goto out; } if (nilfs_match(namelen, name, de)) goto found; de = nilfs_next_entry(de); } - folio_release_kmap(folio, kaddr); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; @@ -387,14 +384,13 @@ out: return NULL; found: - *res_page = &folio->page; ei->i_dir_start_lookup = n; return de; } -struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) { - struct nilfs_dir_entry *de = nilfs_get_page(dir, 0, p); + struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop); if (IS_ERR(de)) return NULL; @@ -405,30 +401,30 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct nilfs_dir_entry *de; - struct page *page; + struct folio *folio; - de = nilfs_find_entry(dir, qstr, &page); + de = nilfs_find_entry(dir, qstr, &folio); if (de) { res = le64_to_cpu(de->inode); - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, - struct page *page, struct inode *inode) + struct folio *folio, struct inode *inode) { - unsigned int from = offset_in_page(de); - unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); - struct address_space *mapping = page->mapping; + size_t from = offset_in_folio(folio, de); + size_t to = from + nilfs_rec_len_from_disk(de->rec_len); + struct address_space *mapping = folio->mapping; int err; - lock_page(page); - err = nilfs_prepare_chunk(page, from, to); + folio_lock(folio); + err = nilfs_prepare_chunk(&folio->page, from, to); BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(&folio->page, mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -532,14 +528,14 @@ out_unlock: /* * nilfs_delete_entry deletes a directory entry by merging it with the - * previous entry. Page is up-to-date. + * previous entry. Folio is up-to-date. */ -int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; - char *kaddr = (char *)((unsigned long)dir & PAGE_MASK); - unsigned int from, to; + char *kaddr = (char *)((unsigned long)dir & ~(folio_size(folio) - 1)); + size_t from, to; struct nilfs_dir_entry *de, *pde = NULL; int err; @@ -559,13 +555,13 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) } if (pde) from = (char *)pde - kaddr; - lock_page(page); - err = nilfs_prepare_chunk(page, from, to); + folio_lock(folio); + err = nilfs_prepare_chunk(&folio->page, from, to); BUG_ON(err); if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; - nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(&folio->page, mapping, from, to); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: return err; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c08b1bf9fa7b..959bd9fb3d81 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -260,11 +260,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode; struct nilfs_dir_entry *de; - struct page *page; + struct folio *folio; int err; err = -ENOENT; - de = nilfs_find_entry(dir, &dentry->d_name, &page); + de = nilfs_find_entry(dir, &dentry->d_name, &folio); if (!de) goto out; @@ -279,8 +279,8 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } - err = nilfs_delete_entry(de, page); - unmap_and_put_page(page, de); + err = nilfs_delete_entry(de, folio); + folio_release_kmap(folio, de); if (err) goto out; @@ -348,9 +348,9 @@ static int nilfs_rename(struct mnt_idmap *idmap, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; + struct folio *dir_folio = NULL; struct nilfs_dir_entry *dir_de = NULL; - struct page *old_page; + struct folio *old_folio; struct nilfs_dir_entry *old_de; struct nilfs_transaction_info ti; int err; @@ -363,19 +363,19 @@ static int nilfs_rename(struct mnt_idmap *idmap, return err; err = -ENOENT; - old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = nilfs_dotdot(old_inode, &dir_page); + dir_de = nilfs_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct nilfs_dir_entry *new_de; err = -ENOTEMPTY; @@ -383,11 +383,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - nilfs_set_link(new_dir, new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + nilfs_set_link(new_dir, new_de, new_folio, old_inode); + folio_release_kmap(new_folio, new_de); nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); if (dir_de) @@ -410,14 +410,14 @@ static int nilfs_rename(struct mnt_idmap *idmap, */ inode_set_ctime_current(old_inode); - nilfs_delete_entry(old_de, old_page); + nilfs_delete_entry(old_de, old_folio); if (dir_de) { - nilfs_set_link(old_inode, dir_de, dir_page, new_dir); - unmap_and_put_page(dir_page, dir_de); + nilfs_set_link(old_inode, dir_de, dir_folio, new_dir); + folio_release_kmap(dir_folio, dir_de); drop_nlink(old_dir); } - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); @@ -427,9 +427,9 @@ static int nilfs_rename(struct mnt_idmap *idmap, out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: nilfs_transaction_abort(old_dir->i_sb); return err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8046490cd7fe..98cffaf0ac12 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -226,16 +226,16 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) } /* dir.c */ -extern int nilfs_add_link(struct dentry *, struct inode *); -extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); -extern int nilfs_make_empty(struct inode *, struct inode *); -extern struct nilfs_dir_entry * -nilfs_find_entry(struct inode *, const struct qstr *, struct page **); -extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); -extern int nilfs_empty_dir(struct inode *); -extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); -extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, - struct page *, struct inode *); +int nilfs_add_link(struct dentry *, struct inode *); +ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_make_empty(struct inode *, struct inode *); +struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, + struct folio **); +int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *); +int nilfs_empty_dir(struct inode *); +struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **); +void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct folio *, struct inode *); /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); From f59bb60f7d56a0f93570dfb6d221b62495c63ead Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:32 +0900 Subject: [PATCH 0575/1562] nilfs2: convert nilfs_add_link() to use a folio Remove six calls to compound_head() by using the folio API. Link: https://lkml.kernel.org/r/20231127143036.2425-14-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 2a759598801b..8d8c42e34148 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -439,30 +439,28 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) unsigned int chunk_size = nilfs_chunk_size(dir); unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; - unsigned int from, to; + size_t from, to; int err; /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. */ for (n = 0; n <= npages; n++) { + char *kaddr = nilfs_get_folio(dir, n, &folio); char *dir_end; - kaddr = nilfs_get_page(dir, n, &page); - err = PTR_ERR(kaddr); if (IS_ERR(kaddr)) - goto out; - lock_page(page); + return PTR_ERR(kaddr); + folio_lock(folio); dir_end = kaddr + nilfs_last_byte(dir, n); de = (struct nilfs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -489,16 +487,16 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; de = (struct nilfs_dir_entry *)((char *)de + rec_len); } - unlock_page(page); - unmap_and_put_page(page, kaddr); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - from = offset_in_page(de); + from = offset_in_folio(folio, de); to = from + rec_len; - err = nilfs_prepare_chunk(page, from, to); + err = nilfs_prepare_chunk(&folio->page, from, to); if (err) goto out_unlock; if (de->inode) { @@ -513,16 +511,15 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(page, page->mapping, from, to); + nilfs_commit_chunk(&folio->page, folio->mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: - unmap_and_put_page(page, de); -out: + folio_release_kmap(folio, de); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } From 18f03ddf4db8cecfc6337d7a6775545fdbdc1713 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:33 +0900 Subject: [PATCH 0576/1562] nilfs2: convert nilfs_empty_dir() to use a folio Remove three calls to compound_head() by using the folio API. Link: https://lkml.kernel.org/r/20231127143036.2425-15-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 8d8c42e34148..919936d9ec27 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -203,17 +203,6 @@ fail: return ERR_PTR(-EIO); } -static void *nilfs_get_page(struct inode *dir, unsigned long n, - struct page **pagep) -{ - struct folio *folio; - void *kaddr = nilfs_get_folio(dir, n, &folio); - - if (!IS_ERR(kaddr)) - *pagep = &folio->page; - return kaddr; -} - /* * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. * @@ -611,14 +600,14 @@ fail: */ int nilfs_empty_dir(struct inode *inode) { - struct page *page = NULL; + struct folio *folio = NULL; char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { struct nilfs_dir_entry *de; - kaddr = nilfs_get_page(inode, i, &page); + kaddr = nilfs_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; @@ -647,12 +636,12 @@ int nilfs_empty_dir(struct inode *inode) } de = nilfs_next_entry(de); } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } From 0743230fff17f729a56c35869e20a5f090a8fdc2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:34 +0900 Subject: [PATCH 0577/1562] nilfs2: convert nilfs_make_empty() to use a folio Remove two calls to compound_head() and switch from kmap_atomic to kmap_local. Link: https://lkml.kernel.org/r/20231127143036.2425-16-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 919936d9ec27..ff0a009a292f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -559,21 +559,21 @@ out: int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct folio *folio = filemap_grab_folio(mapping, 0); unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = nilfs_prepare_chunk(page, 0, chunk_size); + err = nilfs_prepare_chunk(&folio->page, 0, chunk_size); if (unlikely(err)) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); memset(kaddr, 0, chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; @@ -588,10 +588,10 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); - kunmap_atomic(kaddr); - nilfs_commit_chunk(page, mapping, 0, chunk_size); + kunmap_local(kaddr); + nilfs_commit_chunk(&folio->page, mapping, 0, chunk_size); fail: - put_page(page); + folio_put(folio); return err; } From 9bff5f980eb78b04627d6d8f69869d9fb8aa6ff7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:35 +0900 Subject: [PATCH 0578/1562] nilfs2: convert nilfs_prepare_chunk() and nilfs_commit_chunk() to folios All callers now have a folio, so convert these two functions. Saves one call to compound_head() in unlock_page(). [konishi.ryusuke: resolved conflicts in nilfs_{set_link,delete_entry}] Link: https://lkml.kernel.org/r/20231127143036.2425-17-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index ff0a009a292f..bc846b904b68 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -78,33 +78,32 @@ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int nilfs_prepare_chunk(struct page *page, unsigned int from, +static int nilfs_prepare_chunk(struct folio *folio, unsigned int from, unsigned int to) { - loff_t pos = page_offset(page) + from; + loff_t pos = folio_pos(folio) + from; - return __block_write_begin(page, pos, to - from, nilfs_get_block); + return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block); } -static void nilfs_commit_chunk(struct page *page, - struct address_space *mapping, - unsigned int from, unsigned int to) +static void nilfs_commit_chunk(struct folio *folio, + struct address_space *mapping, size_t from, size_t to) { struct inode *dir = mapping->host; - loff_t pos = page_offset(page) + from; - unsigned int len = to - from; - unsigned int nr_dirty, copied; + loff_t pos = folio_pos(folio) + from; + size_t copied, len = to - from; + unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(page, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); err = nilfs_set_file_dirty(dir, nr_dirty); WARN_ON(err); /* do not happen */ - unlock_page(page); + folio_unlock(folio); } static bool nilfs_check_folio(struct folio *folio, char *kaddr) @@ -409,11 +408,11 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, int err; folio_lock(folio); - err = nilfs_prepare_chunk(&folio->page, from, to); + err = nilfs_prepare_chunk(folio, from, to); BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(&folio->page, mapping, from, to); + nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -485,7 +484,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) got_it: from = offset_in_folio(folio, de); to = from + rec_len; - err = nilfs_prepare_chunk(&folio->page, from, to); + err = nilfs_prepare_chunk(folio, from, to); if (err) goto out_unlock; if (de->inode) { @@ -500,7 +499,7 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(&folio->page, folio->mapping, from, to); + nilfs_commit_chunk(folio, folio->mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ @@ -542,12 +541,12 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio) if (pde) from = (char *)pde - kaddr; folio_lock(folio); - err = nilfs_prepare_chunk(&folio->page, from, to); + err = nilfs_prepare_chunk(folio, from, to); BUG_ON(err); if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; - nilfs_commit_chunk(&folio->page, mapping, from, to); + nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: return err; @@ -568,7 +567,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) if (IS_ERR(folio)) return PTR_ERR(folio); - err = nilfs_prepare_chunk(&folio->page, 0, chunk_size); + err = nilfs_prepare_chunk(folio, 0, chunk_size); if (unlikely(err)) { folio_unlock(folio); goto fail; @@ -589,7 +588,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); kunmap_local(kaddr); - nilfs_commit_chunk(&folio->page, mapping, 0, chunk_size); + nilfs_commit_chunk(folio, mapping, 0, chunk_size); fail: folio_put(folio); return err; From b4f19e3bce903712e347ce7f88d0c4f6e43277f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Nov 2023 23:30:36 +0900 Subject: [PATCH 0579/1562] nilfs2: convert nilfs_page_bug() to nilfs_folio_bug() All callers have a folio now, so convert it. Link: https://lkml.kernel.org/r/20231127143036.2425-18-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 4 ++-- fs/nilfs2/page.c | 25 +++++++++++++------------ fs/nilfs2/page.h | 6 +++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 1204dd06ead8..0131d83b912d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -190,7 +190,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, retry: /* BUG_ON(oldkey != obh->b_folio->index); */ if (unlikely(oldkey != ofolio->index)) - NILFS_PAGE_BUG(&ofolio->page, + NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); @@ -246,7 +246,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ ofolio = obh->b_folio; if (unlikely(oldkey != ofolio->index)) - NILFS_PAGE_BUG(&ofolio->page, + NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 94e11bcee05b..5c2eba1987bd 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -150,29 +150,30 @@ bool nilfs_folio_buffers_clean(struct folio *folio) return true; } -void nilfs_page_bug(struct page *page) +void nilfs_folio_bug(struct folio *folio) { + struct buffer_head *bh, *head; struct address_space *m; unsigned long ino; - if (unlikely(!page)) { - printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + if (unlikely(!folio)) { + printk(KERN_CRIT "NILFS_FOLIO_BUG(NULL)\n"); return; } - m = page->mapping; + m = folio->mapping; ino = m ? m->host->i_ino : 0; - printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", - page, page_ref_count(page), - (unsigned long long)page->index, page->flags, m, ino); + folio, folio_ref_count(folio), + (unsigned long long)folio->index, folio->flags, m, ino); - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { int i = 0; - bh = head = page_buffers(page); + bh = head; do { printk(KERN_CRIT " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", @@ -258,7 +259,7 @@ repeat: folio_lock(folio); if (unlikely(!folio_test_dirty(folio))) - NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); + NILFS_FOLIO_BUG(folio, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); if (unlikely(IS_ERR(dfolio))) { @@ -268,7 +269,7 @@ repeat: break; } if (unlikely(!folio_buffers(folio))) - NILFS_PAGE_BUG(&folio->page, + NILFS_FOLIO_BUG(folio, "found empty page in dat page cache"); nilfs_copy_folio(dfolio, folio, true); diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 968b311d265b..7e1a2c455a10 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -37,7 +37,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); bool nilfs_folio_buffers_clean(struct folio *); -void nilfs_page_bug(struct page *); +void nilfs_folio_bug(struct folio *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); @@ -49,7 +49,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); -#define NILFS_PAGE_BUG(page, m, a...) \ - do { nilfs_page_bug(page); BUG(); } while (0) +#define NILFS_FOLIO_BUG(folio, m, a...) \ + do { nilfs_folio_bug(folio); BUG(); } while (0) #endif /* _NILFS_PAGE_H */ From 125e9987a2d9016f78d0a020cec7d55fd0f29501 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Wed, 29 Nov 2023 14:51:39 +0800 Subject: [PATCH 0580/1562] scripts/gdb/stackdepot: rename pool_index to pools_num After stackdepot evicting support patchset[1], we rename pool_index to pools_num. To avoid from the below issue, we rename consistently in gdb scripts. Python Exception : No symbol "pool_index" in current context. Error occurred in Python: No symbol "pool_index" in current context. [1] https://lore.kernel.org/linux-mm/cover.1700502145.git.andreyknvl@google.com/ Link: https://lkml.kernel.org/r/20231129065142.13375-3-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Reviewed-by: Florian Fainelli Cc: Andrey Konovalov Cc: AngeloGioacchino Del Regno Cc: Chinwen Chang Cc: Matthias Brugger Cc: Oleg Nesterov Cc: Qun-Wei Lin Signed-off-by: Andrew Morton --- scripts/gdb/linux/stackdepot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gdb/linux/stackdepot.py b/scripts/gdb/linux/stackdepot.py index 047d329a6a12..0281d9de4b7c 100644 --- a/scripts/gdb/linux/stackdepot.py +++ b/scripts/gdb/linux/stackdepot.py @@ -25,10 +25,10 @@ def stack_depot_fetch(handle): handle_parts_t = gdb.lookup_type("union handle_parts") parts = handle.cast(handle_parts_t) offset = parts['offset'] << DEPOT_STACK_ALIGN - pool_index_cached = gdb.parse_and_eval('pool_index') + pools_num = gdb.parse_and_eval('pools_num') - if parts['pool_index'] > pool_index_cached: - gdb.write("pool index %d out of bounds (%d) for stack id 0x%08x\n" % (parts['pool_index'], pool_index_cached, handle)) + if parts['pool_index'] > pools_num: + gdb.write("pool index %d out of bounds (%d) for stack id 0x%08x\n" % (parts['pool_index'], pools_num, handle)) return gdb.Value(0), 0 stack_pools = gdb.parse_and_eval('stack_pools') From e52ec6a2db2e01e6a8cdfbe4fee1f89f57cdf723 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Mon, 27 Nov 2023 15:04:03 +0800 Subject: [PATCH 0581/1562] scripts/gdb: remove exception handling and refine print format 1. When we crash on a page, we want to check what happened on this page instead of skipping this page by try-except block. Thus, removing the try-except block. 2. Remove redundant comma and print the task name properly. Link: https://lkml.kernel.org/r/20231127070404.4192-4-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Cc: Andrey Konovalov Cc: AngeloGioacchino Del Regno Cc: Chinwen Chang Cc: Matthias Brugger Cc: Oleg Nesterov Cc: Qun-Wei Lin Signed-off-by: Andrew Morton --- scripts/gdb/linux/page_owner.py | 58 ++++++++++++++------------------- scripts/gdb/linux/slab.py | 3 +- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/scripts/gdb/linux/page_owner.py b/scripts/gdb/linux/page_owner.py index 844fd5d0c912..8e713a09cfe7 100644 --- a/scripts/gdb/linux/page_owner.py +++ b/scripts/gdb/linux/page_owner.py @@ -122,27 +122,24 @@ class DumpPageOwner(gdb.Command): if not (page_ext['flags'] & (1 << PAGE_EXT_OWNER_ALLOCATED)): gdb.write("page_owner is not allocated\n") - try: - page_owner = self.get_page_owner(page_ext) - gdb.write("Page last allocated via order %d, gfp_mask: 0x%x, pid: %d, tgid: %d (%s), ts %u ns, free_ts %u ns\n" %\ - (page_owner["order"], page_owner["gfp_mask"],\ - page_owner["pid"], page_owner["tgid"], page_owner["comm"],\ - page_owner["ts_nsec"], page_owner["free_ts_nsec"])) - gdb.write("PFN: %d, Flags: 0x%x\n" % (pfn, page['flags'])) - if page_owner["handle"] == 0: - gdb.write('page_owner allocation stack trace missing\n') - else: - stackdepot.stack_depot_print(page_owner["handle"]) + page_owner = self.get_page_owner(page_ext) + gdb.write("Page last allocated via order %d, gfp_mask: 0x%x, pid: %d, tgid: %d (%s), ts %u ns, free_ts %u ns\n" %\ + (page_owner["order"], page_owner["gfp_mask"],\ + page_owner["pid"], page_owner["tgid"], page_owner["comm"].string(),\ + page_owner["ts_nsec"], page_owner["free_ts_nsec"])) + gdb.write("PFN: %d, Flags: 0x%x\n" % (pfn, page['flags'])) + if page_owner["handle"] == 0: + gdb.write('page_owner allocation stack trace missing\n') + else: + stackdepot.stack_depot_print(page_owner["handle"]) - if page_owner["free_handle"] == 0: - gdb.write('page_owner free stack trace missing\n') - else: - gdb.write('page last free stack trace:\n') - stackdepot.stack_depot_print(page_owner["free_handle"]) - if page_owner['last_migrate_reason'] != -1: - gdb.write('page has been migrated, last migrate reason: %s\n' % self.migrate_reason_names[page_owner['last_migrate_reason']]) - except: - gdb.write("\n") + if page_owner["free_handle"] == 0: + gdb.write('page_owner free stack trace missing\n') + else: + gdb.write('page last free stack trace:\n') + stackdepot.stack_depot_print(page_owner["free_handle"]) + if page_owner['last_migrate_reason'] != -1: + gdb.write('page has been migrated, last migrate reason: %s\n' % self.migrate_reason_names[page_owner['last_migrate_reason']]) def read_page_owner(self): pfn = self.min_pfn @@ -173,18 +170,13 @@ class DumpPageOwner(gdb.Command): pfn += 1 continue - try: - page_owner = self.get_page_owner(page_ext) - gdb.write("Page allocated via order %d, gfp_mask: 0x%x, pid: %d, tgid: %d (%s), ts %u ns, free_ts %u ns\n" %\ - (page_owner["order"], page_owner["gfp_mask"],\ - page_owner["pid"], page_owner["tgid"], page_owner["comm"],\ - page_owner["ts_nsec"], page_owner["free_ts_nsec"])) - gdb.write("PFN: %d, Flags: 0x%x\n" % (pfn, page['flags'])) - stackdepot.stack_depot_print(page_owner["handle"]) - pfn += (1 << page_owner["order"]) - continue - except: - gdb.write("\n") - pfn += 1 + page_owner = self.get_page_owner(page_ext) + gdb.write("Page allocated via order %d, gfp_mask: 0x%x, pid: %d, tgid: %d (%s), ts %u ns, free_ts %u ns\n" %\ + (page_owner["order"], page_owner["gfp_mask"],\ + page_owner["pid"], page_owner["tgid"], page_owner["comm"].string(),\ + page_owner["ts_nsec"], page_owner["free_ts_nsec"])) + gdb.write("PFN: %d, Flags: 0x%x\n" % (pfn, page['flags'])) + stackdepot.stack_depot_print(page_owner["handle"]) + pfn += (1 << page_owner["order"]) DumpPageOwner() diff --git a/scripts/gdb/linux/slab.py b/scripts/gdb/linux/slab.py index f012ba38c7d9..0e2d93867fe2 100644 --- a/scripts/gdb/linux/slab.py +++ b/scripts/gdb/linux/slab.py @@ -228,8 +228,7 @@ def slabtrace(alloc, cache_name): nr_cpu = gdb.parse_and_eval('__num_online_cpus')['counter'] if nr_cpu > 1: gdb.write(" cpus=") - for i in loc['cpus']: - gdb.write("%d," % i) + gdb.write(','.join(str(cpu) for cpu in loc['cpus'])) gdb.write("\n") if constants.LX_CONFIG_STACKDEPOT: if loc['handle']: From f2a2d85a9374b771474aeefe4b27cc725f0cfcb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Dec 2023 22:52:11 +0100 Subject: [PATCH 0582/1562] platform/x86: asus-wmi: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/639b9ffc18422fe59125893bd7909e8a73cffb72.1701726190.git.u.kleine-koenig@pengutronix.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-wmi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 6a79f16233ab..ca3e64c37f1e 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -4598,7 +4598,7 @@ fail_platform: return err; } -static int asus_wmi_remove(struct platform_device *device) +static void asus_wmi_remove(struct platform_device *device) { struct asus_wmi *asus; @@ -4619,7 +4619,6 @@ static int asus_wmi_remove(struct platform_device *device) platform_profile_remove(); kfree(asus); - return 0; } /* Platform driver - hibernate/resume callbacks *******************************/ @@ -4741,7 +4740,7 @@ int __init_or_module asus_wmi_register_driver(struct asus_wmi_driver *driver) return -EBUSY; platform_driver = &driver->platform_driver; - platform_driver->remove = asus_wmi_remove; + platform_driver->remove_new = asus_wmi_remove; platform_driver->driver.owner = driver->owner; platform_driver->driver.name = driver->name; platform_driver->driver.pm = &asus_pm_ops; From 3df692169e8486fc3dd91fcd5ea81c27a0bac033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Dec 2023 22:52:12 +0100 Subject: [PATCH 0583/1562] platform/x86: hp-wmi: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/a6b074b7ee37f3682da4b3f39ea40af97add64c2.1701726190.git.u.kleine-koenig@pengutronix.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/hp/hp-wmi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 8ebb7be52ee7..e536604225c5 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -1478,7 +1478,7 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) return 0; } -static int __exit hp_wmi_bios_remove(struct platform_device *device) +static void __exit hp_wmi_bios_remove(struct platform_device *device) { int i; @@ -1502,8 +1502,6 @@ static int __exit hp_wmi_bios_remove(struct platform_device *device) if (platform_profile_support) platform_profile_remove(); - - return 0; } static int hp_wmi_resume_handler(struct device *device) @@ -1560,7 +1558,7 @@ static struct platform_driver hp_wmi_driver __refdata = { .pm = &hp_wmi_pm_ops, .dev_groups = hp_wmi_groups, }, - .remove = __exit_p(hp_wmi_bios_remove), + .remove_new = __exit_p(hp_wmi_bios_remove), }; static umode_t hp_wmi_hwmon_is_visible(const void *data, From 7973be94724464222ae0b1860a25be04ab7b0132 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Dec 2023 18:52:38 +0200 Subject: [PATCH 0584/1562] clk: x86: lpss-atom: Drop unneeded 'extern' in the header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'extern' for the functions is not needed, drop it. Signed-off-by: Andy Shevchenko Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231208165238.3309058-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/clk-lpss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h index 41df326583f9..7f132029316a 100644 --- a/include/linux/platform_data/x86/clk-lpss.h +++ b/include/linux/platform_data/x86/clk-lpss.h @@ -15,6 +15,6 @@ struct lpss_clk_data { struct clk *clk; }; -extern int lpss_atom_clk_init(void); +int lpss_atom_clk_init(void); #endif /* __CLK_LPSS_H */ From b87434f2e6fe81362d2ac57f3aba45ba89a11399 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:36 -0800 Subject: [PATCH 0585/1562] platform/x86/intel/tpmi: Don't create devices for disabled features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If some TPMI features are disabled, don't create auxiliary devices. In this way feature drivers will not load. While creating auxiliary devices, call tpmi_read_feature_status() to check feature state and return if the feature is disabled without creating a device. Signed-off-by: Srinivas Pandruvada Reviewed-by: Hans de Goede Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-2-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/tpmi.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c index 311abcac894a..09972e2b8a06 100644 --- a/drivers/platform/x86/intel/tpmi.c +++ b/drivers/platform/x86/intel/tpmi.c @@ -598,9 +598,21 @@ static int tpmi_create_device(struct intel_tpmi_info *tpmi_info, struct intel_vsec_device *vsec_dev = tpmi_info->vsec_dev; char feature_id_name[TPMI_FEATURE_NAME_LEN]; struct intel_vsec_device *feature_vsec_dev; + struct tpmi_feature_state feature_state; struct resource *res, *tmp; const char *name; - int i; + int i, ret; + + ret = tpmi_read_feature_status(tpmi_info, pfs->pfs_header.tpmi_id, &feature_state); + if (ret) + return ret; + + /* + * If not enabled, continue to look at other features in the PFS, so return -EOPNOTSUPP. + * This will not cause failure of loading of this driver. + */ + if (!feature_state.enabled) + return -EOPNOTSUPP; name = intel_tpmi_name(pfs->pfs_header.tpmi_id); if (!name) From 72dd14d241e1c6e241fc5b265746c59f306c6aa3 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:37 -0800 Subject: [PATCH 0586/1562] platform/x86/intel/tpmi: Modify external interface to get read/write state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify the external interface tpmi_get_feature_status() to get read and write blocked instead of locked and disabled. Since auxiliary device is not created when disabled, no use of returning disabled state. Also locked state is not useful as feature driver can't use locked state in a meaningful way. Using read and write state, feature driver can decide which operations to restrict for that feature. Signed-off-by: Srinivas Pandruvada Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/tpmi.c | 8 ++++---- include/linux/intel_tpmi.h | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c index 09972e2b8a06..92c5c3a90d53 100644 --- a/drivers/platform/x86/intel/tpmi.c +++ b/drivers/platform/x86/intel/tpmi.c @@ -345,8 +345,8 @@ err_unlock: return ret; } -int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, - int *locked, int *disabled) +int tpmi_get_feature_status(struct auxiliary_device *auxdev, + int feature_id, bool *read_blocked, bool *write_blocked) { struct intel_vsec_device *intel_vsec_dev = dev_to_ivdev(auxdev->dev.parent); struct intel_tpmi_info *tpmi_info = auxiliary_get_drvdata(&intel_vsec_dev->auxdev); @@ -357,8 +357,8 @@ int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, if (ret) return ret; - *locked = feature_state.locked; - *disabled = !feature_state.enabled; + *read_blocked = feature_state.read_blocked; + *write_blocked = feature_state.write_blocked; return 0; } diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index ee07393445f9..4f89c5bd8663 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -32,7 +32,6 @@ struct intel_tpmi_plat_info { struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); - -int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, int *locked, - int *disabled); +int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked, + bool *write_blocked); #endif From 046d7be6210e7f870e53eb38fd410237e9d1d88f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:38 -0800 Subject: [PATCH 0587/1562] platform/x86/intel/tpmi: Move TPMI ID definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move TPMI ID definitions to common include file. In this way other feature drivers don't have to redefine. Signed-off-by: Srinivas Pandruvada Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-4-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/tpmi.c | 13 ------------- include/linux/intel_tpmi.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c index 92c5c3a90d53..e73cdea67fff 100644 --- a/drivers/platform/x86/intel/tpmi.c +++ b/drivers/platform/x86/intel/tpmi.c @@ -170,19 +170,6 @@ struct tpmi_feature_state { u32 locked:1; } __packed; -/* - * List of supported TMPI IDs. - * Some TMPI IDs are not used by Linux, so the numbers are not consecutive. - */ -enum intel_tpmi_id { - TPMI_ID_RAPL = 0, /* Running Average Power Limit */ - TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */ - TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */ - TPMI_ID_SST = 5, /* Speed Select Technology */ - TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */ - TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ -}; - /* * The size from hardware is in u32 units. This size is from a trusted hardware, * but better to verify for pre silicon platforms. Set size to 0, when invalid. diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 4f89c5bd8663..a3529b962be6 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -12,6 +12,19 @@ #define TPMI_MINOR_VERSION(val) FIELD_GET(GENMASK(4, 0), val) #define TPMI_MAJOR_VERSION(val) FIELD_GET(GENMASK(7, 5), val) +/* + * List of supported TMPI IDs. + * Some TMPI IDs are not used by Linux, so the numbers are not consecutive. + */ +enum intel_tpmi_id { + TPMI_ID_RAPL = 0, /* Running Average Power Limit */ + TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */ + TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */ + TPMI_ID_SST = 5, /* Speed Select Technology */ + TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */ + TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ +}; + /** * struct intel_tpmi_plat_info - Platform information for a TPMI device instance * @package_id: CPU Package id From 8bed9ff7dbcce4d1a436f7839be48c6fd5fac0ce Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:39 -0800 Subject: [PATCH 0588/1562] platform/x86: ISST: Process read/write blocked feature status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a feature is read blocked, don't continue to read SST information and register with SST core. When the feature is write blocked, continue to offer read interface for SST parameters, but don't allow any operation to change state. A state change results from SST level change, feature change or class of service change. Signed-off-by: Srinivas Pandruvada Reviewed-by: Hans de Goede Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-5-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- .../intel/speed_select_if/isst_tpmi_core.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c index 0b6d2c864437..2662fbbddf0c 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c @@ -234,6 +234,7 @@ struct perf_level { * @saved_clos_configs: Save SST-CP CLOS configuration to store restore for suspend/resume * @saved_clos_assocs: Save SST-CP CLOS association to store restore for suspend/resume * @saved_pp_control: Save SST-PP control information to store restore for suspend/resume + * @write_blocked: Write operation is blocked, so can't change SST state * * This structure is used store complete SST information for a power_domain. This information * is used to read/write request for any SST IOCTL. Each physical CPU package can have multiple @@ -259,6 +260,7 @@ struct tpmi_per_power_domain_info { u64 saved_clos_configs[4]; u64 saved_clos_assocs[4]; u64 saved_pp_control; + bool write_blocked; }; /** @@ -515,6 +517,9 @@ static long isst_if_clos_param(void __user *argp) return -EINVAL; if (clos_param.get_set) { + if (power_domain_info->write_blocked) + return -EPERM; + _write_cp_info("clos.min_freq", clos_param.min_freq_mhz, (SST_CLOS_CONFIG_0_OFFSET + clos_param.clos * SST_REG_SIZE), SST_CLOS_CONFIG_MIN_START, SST_CLOS_CONFIG_MIN_WIDTH, @@ -602,6 +607,9 @@ static long isst_if_clos_assoc(void __user *argp) power_domain_info = &sst_inst->power_domain_info[punit_id]; + if (assoc_cmds.get_set && power_domain_info->write_blocked) + return -EPERM; + offset = SST_CLOS_ASSOC_0_OFFSET + (punit_cpu_no / SST_CLOS_ASSOC_CPUS_PER_REG) * SST_REG_SIZE; shift = punit_cpu_no % SST_CLOS_ASSOC_CPUS_PER_REG; @@ -752,6 +760,9 @@ static int isst_if_set_perf_level(void __user *argp) if (!power_domain_info) return -EINVAL; + if (power_domain_info->write_blocked) + return -EPERM; + if (!(power_domain_info->pp_header.allowed_level_mask & BIT(perf_level.level))) return -EINVAL; @@ -809,6 +820,9 @@ static int isst_if_set_perf_feature(void __user *argp) if (!power_domain_info) return -EINVAL; + if (power_domain_info->write_blocked) + return -EPERM; + _write_pp_info("perf_feature", perf_feature.feature, SST_PP_CONTROL_OFFSET, SST_PP_FEATURE_STATE_START, SST_PP_FEATURE_STATE_WIDTH, SST_MUL_FACTOR_NONE) @@ -1257,11 +1271,21 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, int tpmi_sst_dev_add(struct auxiliary_device *auxdev) { + bool read_blocked = 0, write_blocked = 0; struct intel_tpmi_plat_info *plat_info; struct tpmi_sst_struct *tpmi_sst; int i, ret, pkg = 0, inst = 0; int num_resources; + ret = tpmi_get_feature_status(auxdev, TPMI_ID_SST, &read_blocked, &write_blocked); + if (ret) + dev_info(&auxdev->dev, "Can't read feature status: ignoring read/write blocked status\n"); + + if (read_blocked) { + dev_info(&auxdev->dev, "Firmware has blocked reads, exiting\n"); + return -ENODEV; + } + plat_info = tpmi_get_platform_data(auxdev); if (!plat_info) { dev_err(&auxdev->dev, "No platform info\n"); @@ -1306,6 +1330,7 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev) tpmi_sst->power_domain_info[i].package_id = pkg; tpmi_sst->power_domain_info[i].power_domain_id = i; tpmi_sst->power_domain_info[i].auxdev = auxdev; + tpmi_sst->power_domain_info[i].write_blocked = write_blocked; tpmi_sst->power_domain_info[i].sst_base = devm_ioremap_resource(&auxdev->dev, res); if (IS_ERR(tpmi_sst->power_domain_info[i].sst_base)) return PTR_ERR(tpmi_sst->power_domain_info[i].sst_base); From b06458d1b1cbb99635c7bb4f9a4f4c4cef2ed984 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:40 -0800 Subject: [PATCH 0589/1562] platform/x86/intel-uncore-freq: Process read/write blocked feature status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a feature is read blocked, don't continue to read uncore information and register with uncore core. When the feature is write blocked, continue to offer read interface but block setting uncore limits. Signed-off-by: Srinivas Pandruvada Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-6-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- .../uncore-frequency/uncore-frequency-tpmi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c index 4fb790552c47..bd75d61ff8a6 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c @@ -66,6 +66,7 @@ struct tpmi_uncore_struct { int min_ratio; struct tpmi_uncore_power_domain_info *pd_info; struct tpmi_uncore_cluster_info root_cluster; + bool write_blocked; }; #define UNCORE_GENMASK_MIN_RATIO GENMASK_ULL(21, 15) @@ -157,6 +158,9 @@ static int uncore_write_control_freq(struct uncore_data *data, unsigned int inpu cluster_info = container_of(data, struct tpmi_uncore_cluster_info, uncore_data); uncore_root = cluster_info->uncore_root; + if (uncore_root->write_blocked) + return -EPERM; + /* Update each cluster in a package */ if (cluster_info->root_domain) { struct tpmi_uncore_struct *uncore_root = cluster_info->uncore_root; @@ -233,11 +237,21 @@ static void remove_cluster_entries(struct tpmi_uncore_struct *tpmi_uncore) static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id) { + bool read_blocked = 0, write_blocked = 0; struct intel_tpmi_plat_info *plat_info; struct tpmi_uncore_struct *tpmi_uncore; int ret, i, pkg = 0; int num_resources; + ret = tpmi_get_feature_status(auxdev, TPMI_ID_UNCORE, &read_blocked, &write_blocked); + if (ret) + dev_info(&auxdev->dev, "Can't read feature status: ignoring blocked status\n"); + + if (read_blocked) { + dev_info(&auxdev->dev, "Firmware has blocked reads, exiting\n"); + return -ENODEV; + } + /* Get number of power domains, which is equal to number of resources */ num_resources = tpmi_get_resource_count(auxdev); if (!num_resources) @@ -266,6 +280,7 @@ static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_ } tpmi_uncore->power_domain_count = num_resources; + tpmi_uncore->write_blocked = write_blocked; /* Get the package ID from the TPMI core */ plat_info = tpmi_get_platform_data(auxdev); From 682b43a049c898d060bb1bd7af4cac0d91b3594d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 8 Dec 2023 15:48:45 +0200 Subject: [PATCH 0590/1562] platform/x86: ips: Remove unused debug code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unused debug code inside #if 0 ... #endif. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231208134845.3900-1-ilpo.jarvinen@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel_ips.c | 33 -------------------------------- 1 file changed, 33 deletions(-) diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c index 4dfdbfca6841..e26e7e14c44c 100644 --- a/drivers/platform/x86/intel_ips.c +++ b/drivers/platform/x86/intel_ips.c @@ -1105,39 +1105,6 @@ static int ips_monitor(void *data) return 0; } -#if 0 -#define THM_DUMPW(reg) \ - { \ - u16 val = thm_readw(reg); \ - dev_dbg(ips->dev, #reg ": 0x%04x\n", val); \ - } -#define THM_DUMPL(reg) \ - { \ - u32 val = thm_readl(reg); \ - dev_dbg(ips->dev, #reg ": 0x%08x\n", val); \ - } -#define THM_DUMPQ(reg) \ - { \ - u64 val = thm_readq(reg); \ - dev_dbg(ips->dev, #reg ": 0x%016x\n", val); \ - } - -static void dump_thermal_info(struct ips_driver *ips) -{ - u16 ptl; - - ptl = thm_readw(THM_PTL); - dev_dbg(ips->dev, "Processor temp limit: %d\n", ptl); - - THM_DUMPW(THM_CTA); - THM_DUMPW(THM_TRC); - THM_DUMPW(THM_CTV1); - THM_DUMPL(THM_STS); - THM_DUMPW(THM_PTV); - THM_DUMPQ(THM_MGTV); -} -#endif - /** * ips_irq_handler - handle temperature triggers and other IPS events * @irq: irq number From 0e8d2444168dd519fea501599d150e62718ed2fe Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Tue, 7 Nov 2023 14:40:56 +0900 Subject: [PATCH 0591/1562] efivarfs: force RO when remounting if SetVariable is not supported If SetVariable at runtime is not supported by the firmware we never assign a callback for that function. At the same time mount the efivarfs as RO so no one can call that. However, we never check the permission flags when someone remounts the filesystem as RW. As a result this leads to a crash looking like this: $ mount -o remount,rw /sys/firmware/efi/efivars $ efi-updatevar -f PK.auth PK [ 303.279166] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 303.280482] Mem abort info: [ 303.280854] ESR = 0x0000000086000004 [ 303.281338] EC = 0x21: IABT (current EL), IL = 32 bits [ 303.282016] SET = 0, FnV = 0 [ 303.282414] EA = 0, S1PTW = 0 [ 303.282821] FSC = 0x04: level 0 translation fault [ 303.283771] user pgtable: 4k pages, 48-bit VAs, pgdp=000000004258c000 [ 303.284913] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 303.286076] Internal error: Oops: 0000000086000004 [#1] PREEMPT SMP [ 303.286936] Modules linked in: qrtr tpm_tis tpm_tis_core crct10dif_ce arm_smccc_trng rng_core drm fuse ip_tables x_tables ipv6 [ 303.288586] CPU: 1 PID: 755 Comm: efi-updatevar Not tainted 6.3.0-rc1-00108-gc7d0c4695c68 #1 [ 303.289748] Hardware name: Unknown Unknown Product/Unknown Product, BIOS 2023.04-00627-g88336918701d 04/01/2023 [ 303.291150] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 303.292123] pc : 0x0 [ 303.292443] lr : efivar_set_variable_locked+0x74/0xec [ 303.293156] sp : ffff800008673c10 [ 303.293619] x29: ffff800008673c10 x28: ffff0000037e8000 x27: 0000000000000000 [ 303.294592] x26: 0000000000000800 x25: ffff000002467400 x24: 0000000000000027 [ 303.295572] x23: ffffd49ea9832000 x22: ffff0000020c9800 x21: ffff000002467000 [ 303.296566] x20: 0000000000000001 x19: 00000000000007fc x18: 0000000000000000 [ 303.297531] x17: 0000000000000000 x16: 0000000000000000 x15: 0000aaaac807ab54 [ 303.298495] x14: ed37489f673633c0 x13: 71c45c606de13f80 x12: 47464259e219acf4 [ 303.299453] x11: ffff000002af7b01 x10: 0000000000000003 x9 : 0000000000000002 [ 303.300431] x8 : 0000000000000010 x7 : ffffd49ea8973230 x6 : 0000000000a85201 [ 303.301412] x5 : 0000000000000000 x4 : ffff0000020c9800 x3 : 00000000000007fc [ 303.302370] x2 : 0000000000000027 x1 : ffff000002467400 x0 : ffff000002467000 [ 303.303341] Call trace: [ 303.303679] 0x0 [ 303.303938] efivar_entry_set_get_size+0x98/0x16c [ 303.304585] efivarfs_file_write+0xd0/0x1a4 [ 303.305148] vfs_write+0xc4/0x2e4 [ 303.305601] ksys_write+0x70/0x104 [ 303.306073] __arm64_sys_write+0x1c/0x28 [ 303.306622] invoke_syscall+0x48/0x114 [ 303.307156] el0_svc_common.constprop.0+0x44/0xec [ 303.307803] do_el0_svc+0x38/0x98 [ 303.308268] el0_svc+0x2c/0x84 [ 303.308702] el0t_64_sync_handler+0xf4/0x120 [ 303.309293] el0t_64_sync+0x190/0x194 [ 303.309794] Code: ???????? ???????? ???????? ???????? (????????) [ 303.310612] ---[ end trace 0000000000000000 ]--- Fix this by adding a .reconfigure() function to the fs operations which we can use to check the requested flags and deny anything that's not RO if the firmware doesn't implement SetVariable at runtime. Fixes: f88814cc2578 ("efi/efivars: Expose RT service availability via efivars abstraction") Signed-off-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 77240953a92e..869537f1a550 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internal.h" @@ -333,9 +334,20 @@ static int efivarfs_get_tree(struct fs_context *fc) return get_tree_single(fc, efivarfs_fill_super); } +static int efivarfs_reconfigure(struct fs_context *fc) +{ + if (!efivar_supports_writes() && !(fc->sb_flags & SB_RDONLY)) { + pr_err("Firmware does not support SetVariableRT. Can not remount with rw\n"); + return -EINVAL; + } + + return 0; +} + static const struct fs_context_operations efivarfs_context_ops = { .get_tree = efivarfs_get_tree, .parse_param = efivarfs_parse_param, + .reconfigure = efivarfs_reconfigure, }; static int efivarfs_init_fs_context(struct fs_context *fc) From d28076ddda34d13aee675fbed52e3275af00f64d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 17:39:27 +0100 Subject: [PATCH 0592/1562] efivarfs: Move efivar availability check into FS context init Instead of checking whether or not EFI variables are available when creating the superblock, check it one step earlier, when initializing the FS context for the mount. This way, no FS context will be created at all, and we can drop the second check at .kill_sb() time entirely. Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 869537f1a550..d4530e4eac06 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -296,9 +296,6 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) struct dentry *root; int err; - if (!efivar_is_available()) - return -EOPNOTSUPP; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -354,6 +351,9 @@ static int efivarfs_init_fs_context(struct fs_context *fc) { struct efivarfs_fs_info *sfi; + if (!efivar_is_available()) + return -EOPNOTSUPP; + sfi = kzalloc(sizeof(*sfi), GFP_KERNEL); if (!sfi) return -ENOMEM; @@ -370,9 +370,6 @@ static void efivarfs_kill_sb(struct super_block *sb) { kill_litter_super(sb); - if (!efivar_is_available()) - return; - /* Remove all entries and destroy */ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); } From 547713d502f7b4b8efccd409cff84d731a23853b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 17:39:28 +0100 Subject: [PATCH 0593/1562] efivarfs: Free s_fs_info on unmount Now that we allocate a s_fs_info struct on fs context creation, we should ensure that we free it again when the superblock goes away. Fixes: 5329aa5101f7 ("efivarfs: Add uid/gid mount options") Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index d4530e4eac06..f954b1fab915 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -368,10 +368,13 @@ static int efivarfs_init_fs_context(struct fs_context *fc) static void efivarfs_kill_sb(struct super_block *sb) { + struct efivarfs_fs_info *sfi = sb->s_fs_info; + kill_litter_super(sb); /* Remove all entries and destroy */ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); + kfree(sfi); } static struct file_system_type efivarfs_type = { From cdb46a8aefbf7fd36772bb206aaaf7e45d7cf8f6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 17:39:29 +0100 Subject: [PATCH 0594/1562] efivarfs: Move efivarfs list into superblock s_fs_info syzbot reports issues with concurrent fsopen()/fsconfig() invocations on efivarfs, which are the result of the fact that the efivarfs list (which caches the names and GUIDs of existing EFI variables) is a global structure. In normal use, these issues are unlikely to trigger, even in the presence of multiple mounts of efivarfs, but the execution pattern used by the syzkaller reproducer may result in multiple instances of the superblock that share the global efivarfs list, and this causes list corruption when the list is reinitialized by one user while another is traversing it. So let's move the list head into the superblock s_fs_info field, so that it will never be shared between distinct instances of the superblock. In the common case, there will still be a single instance of this list, but in the artificial syzkaller case, no list corruption can occur any longer. Reported-by: syzbot+1902c359bfcaf39c46f2@syzkaller.appspotmail.com Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 3 ++- fs/efivarfs/internal.h | 6 +++--- fs/efivarfs/super.c | 19 ++++++++++--------- fs/efivarfs/vars.c | 5 +++-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 91290fe4a70b..586446e02ef7 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -77,6 +77,7 @@ bool efivarfs_valid_name(const char *str, int len) static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { + struct efivarfs_fs_info *info = dir->i_sb->s_fs_info; struct inode *inode = NULL; struct efivar_entry *var; int namelen, i = 0, err = 0; @@ -118,7 +119,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_private = var; kmemleak_ignore(var); - err = efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &info->efivarfs_list); if (err) goto out; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index c66647f5c0bd..1dc0ccce3cc3 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -16,6 +16,7 @@ struct efivarfs_mount_opts { struct efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; + struct list_head efivarfs_list; }; struct efi_variable { @@ -33,7 +34,8 @@ struct efivar_entry { struct kobject kobj; }; -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, + struct list_head *), void *data, bool duplicates, struct list_head *head); int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); @@ -64,6 +66,4 @@ extern struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable); -extern struct list_head efivarfs_list; - #endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index f954b1fab915..cee325b5bbdd 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -19,8 +19,6 @@ #include "internal.h" -LIST_HEAD(efivarfs_list); - static void efivarfs_evict_inode(struct inode *inode) { clear_inode(inode); @@ -167,7 +165,8 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) } static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, - unsigned long name_size, void *data) + unsigned long name_size, void *data, + struct list_head *list) { struct super_block *sb = (struct super_block *)data; struct efivar_entry *entry; @@ -222,7 +221,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, } __efivar_entry_get(entry, NULL, &size, NULL); - __efivar_entry_add(entry, &efivarfs_list); + __efivar_entry_add(entry, list); /* copied by the above to local storage in the dentry. */ kfree(name); @@ -292,6 +291,7 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct efivarfs_fs_info *sfi = sb->s_fs_info; struct inode *inode = NULL; struct dentry *root; int err; @@ -317,11 +317,10 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (!root) return -ENOMEM; - INIT_LIST_HEAD(&efivarfs_list); - - err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, + &sfi->efivarfs_list); if (err) - efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); + efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); return err; } @@ -358,6 +357,8 @@ static int efivarfs_init_fs_context(struct fs_context *fc) if (!sfi) return -ENOMEM; + INIT_LIST_HEAD(&sfi->efivarfs_list); + sfi->mount_opts.uid = GLOBAL_ROOT_UID; sfi->mount_opts.gid = GLOBAL_ROOT_GID; @@ -373,7 +374,7 @@ static void efivarfs_kill_sb(struct super_block *sb) kill_litter_super(sb); /* Remove all entries and destroy */ - efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); + efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); kfree(sfi); } diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 9e4f47808bd5..114ff0fd4e55 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -369,7 +369,8 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * * Returns 0 on success, or a kernel error code on failure. */ -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, + struct list_head *), void *data, bool duplicates, struct list_head *head) { unsigned long variable_name_size = 1024; @@ -420,7 +421,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), status = EFI_NOT_FOUND; } else { err = func(variable_name, vendor_guid, - variable_name_size, data); + variable_name_size, data, head); if (err) status = EFI_NOT_FOUND; } From 6bb3703aa52c9b5bb9716cbeae7350247b675209 Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:52 +0900 Subject: [PATCH 0595/1562] efi: expose efivar generic ops register function This is a preparation for supporting efivar operations provided by other than efi subsystem. Both register and unregister functions are exposed so that non-efi subsystem can revert the efi generic operation. Acked-by: Sumit Garg Co-developed-by: Ilias Apalodimas Signed-off-by: Ilias Apalodimas Signed-off-by: Masahisa Kojima Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 12 ++++++++++++ include/linux/efi.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 9d3910d1abe1..32a67c61c3b8 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -231,6 +231,18 @@ static void generic_ops_unregister(void) efivars_unregister(&generic_efivars); } +void efivars_generic_ops_register(void) +{ + generic_ops_register(); +} +EXPORT_SYMBOL_GPL(efivars_generic_ops_register); + +void efivars_generic_ops_unregister(void) +{ + generic_ops_unregister(); +} +EXPORT_SYMBOL_GPL(efivars_generic_ops_unregister); + #ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS #define EFIVAR_SSDT_NAME_MAX 16UL static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata; diff --git a/include/linux/efi.h b/include/linux/efi.h index 9cc5bf32f6f2..1b2f50efb98c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1348,4 +1348,7 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table) umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n); +void efivars_generic_ops_register(void); +void efivars_generic_ops_unregister(void); + #endif /* _LINUX_EFI_H */ From 1f71f37fbbd065b3326d9b7d8bb5ae688cd653d0 Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:53 +0900 Subject: [PATCH 0596/1562] efi: Add EFI_ACCESS_DENIED status code This commit adds the EFI_ACCESS_DENIED status code. Acked-by: Sumit Garg Co-developed-by: Ilias Apalodimas Signed-off-by: Ilias Apalodimas Signed-off-by: Masahisa Kojima Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/efi.h b/include/linux/efi.h index 1b2f50efb98c..3668aa204c47 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -40,6 +40,7 @@ struct screen_info; #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_ACCESS_DENIED (15 | (1UL << (BITS_PER_LONG-1))) #define EFI_TIMEOUT (18 | (1UL << (BITS_PER_LONG-1))) #define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) From c44b6be62e8dd4ee0a308c36a70620613e6fc55f Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:54 +0900 Subject: [PATCH 0597/1562] efi: Add tee-based EFI variable driver When the flash is not owned by the non-secure world, accessing the EFI variables is straight-forward and done via EFI Runtime Variable Services. In this case, critical variables for system integrity and security are normally stored in the dedicated secure storage and can only be manipulated directly from the secure world. Usually, small embedded devices don't have the special dedicated secure storage. The eMMC device with an RPMB partition is becoming more common, and we can use this RPMB partition to store the EFI Variables. The eMMC device is typically owned by the non-secure world (Linux in our case). There is an existing solution utilizing eMMC RPMB partition for EFI Variables, it is implemented by interacting with TEE (OP-TEE in this case), StandaloneMM (as EFI Variable Service Pseudo TA), eMMC driver and tee-supplicant. The last piece is the tee-based variable access driver to interact with TEE and StandaloneMM. So let's add the kernel functions needed. This feature is implemented as a kernel module. StMM PTA has TA_FLAG_DEVICE_ENUM_SUPP flag when registered to OP-TEE so that this tee_stmm_efi module is probed after tee-supplicant starts, since "SetVariable" EFI Runtime Variable Service requires to interact with tee-supplicant. Acked-by: Sumit Garg Co-developed-by: Ilias Apalodimas Signed-off-by: Ilias Apalodimas Signed-off-by: Masahisa Kojima Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 15 + drivers/firmware/efi/Makefile | 1 + drivers/firmware/efi/stmm/mm_communication.h | 236 +++++++ drivers/firmware/efi/stmm/tee_stmm_efi.c | 616 +++++++++++++++++++ 4 files changed, 868 insertions(+) create mode 100644 drivers/firmware/efi/stmm/mm_communication.h create mode 100644 drivers/firmware/efi/stmm/tee_stmm_efi.c diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index cb374b2da9b7..72f2537d90ca 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -301,3 +301,18 @@ config UEFI_CPER_X86 bool depends on UEFI_CPER && X86 default y + +config TEE_STMM_EFI + tristate "TEE-based EFI runtime variable service driver" + depends on EFI && OPTEE + help + Select this config option if TEE is compiled to include StandAloneMM + as a separate secure partition. It has the ability to check and store + EFI variables on an RPMB or any other non-volatile medium used by + StandAloneMM. + + Enabling this will change the EFI runtime services from the firmware + provided functions to TEE calls. + + To compile this driver as a module, choose M here: the module + will be called tee_stmm_efi. diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index e489fefd23da..a2d0009560d0 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -42,3 +42,4 @@ obj-$(CONFIG_EFI_EARLYCON) += earlycon.o obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o obj-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o +obj-$(CONFIG_TEE_STMM_EFI) += stmm/tee_stmm_efi.o diff --git a/drivers/firmware/efi/stmm/mm_communication.h b/drivers/firmware/efi/stmm/mm_communication.h new file mode 100644 index 000000000000..52a1f32cd1eb --- /dev/null +++ b/drivers/firmware/efi/stmm/mm_communication.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Headers for EFI variable service via StandAloneMM, EDK2 application running + * in OP-TEE. Most of the structs and defines resemble the EDK2 naming. + * + * Copyright (c) 2017, Intel Corporation. All rights reserved. + * Copyright (C) 2020 Linaro Ltd. + */ + +#ifndef _MM_COMMUNICATION_H_ +#define _MM_COMMUNICATION_H_ + +/* + * Interface to the pseudo Trusted Application (TA), which provides a + * communication channel with the Standalone MM (Management Mode) + * Secure Partition running at Secure-EL0 + */ + +#define PTA_STMM_CMD_COMMUNICATE 0 + +/* + * Defined in OP-TEE, this UUID is used to identify the pseudo-TA. + * OP-TEE is using big endian GUIDs while UEFI uses little endian ones + */ +#define PTA_STMM_UUID \ + UUID_INIT(0xed32d533, 0x99e6, 0x4209, \ + 0x9c, 0xc0, 0x2d, 0x72, 0xcd, 0xd9, 0x98, 0xa7) + +#define EFI_MM_VARIABLE_GUID \ + EFI_GUID(0xed32d533, 0x99e6, 0x4209, \ + 0x9c, 0xc0, 0x2d, 0x72, 0xcd, 0xd9, 0x98, 0xa7) + +/** + * struct efi_mm_communicate_header - Header used for SMM variable communication + + * @header_guid: header use for disambiguation of content + * @message_len: length of the message. Does not include the size of the + * header + * @data: payload of the message + * + * Defined in the PI spec as EFI_MM_COMMUNICATE_HEADER. + * To avoid confusion in interpreting frames, the communication buffer should + * always begin with efi_mm_communicate_header. + */ +struct efi_mm_communicate_header { + efi_guid_t header_guid; + size_t message_len; + u8 data[]; +} __packed; + +#define MM_COMMUNICATE_HEADER_SIZE \ + (sizeof(struct efi_mm_communicate_header)) + +/* SPM return error codes */ +#define ARM_SVC_SPM_RET_SUCCESS 0 +#define ARM_SVC_SPM_RET_NOT_SUPPORTED -1 +#define ARM_SVC_SPM_RET_INVALID_PARAMS -2 +#define ARM_SVC_SPM_RET_DENIED -3 +#define ARM_SVC_SPM_RET_NO_MEMORY -5 + +#define SMM_VARIABLE_FUNCTION_GET_VARIABLE 1 +/* + * The payload for this function is + * SMM_VARIABLE_COMMUNICATE_GET_NEXT_VARIABLE_NAME. + */ +#define SMM_VARIABLE_FUNCTION_GET_NEXT_VARIABLE_NAME 2 +/* + * The payload for this function is SMM_VARIABLE_COMMUNICATE_ACCESS_VARIABLE. + */ +#define SMM_VARIABLE_FUNCTION_SET_VARIABLE 3 +/* + * The payload for this function is + * SMM_VARIABLE_COMMUNICATE_QUERY_VARIABLE_INFO. + */ +#define SMM_VARIABLE_FUNCTION_QUERY_VARIABLE_INFO 4 +/* + * It is a notify event, no extra payload for this function. + */ +#define SMM_VARIABLE_FUNCTION_READY_TO_BOOT 5 +/* + * It is a notify event, no extra payload for this function. + */ +#define SMM_VARIABLE_FUNCTION_EXIT_BOOT_SERVICE 6 +/* + * The payload for this function is VARIABLE_INFO_ENTRY. + * The GUID in EFI_SMM_COMMUNICATE_HEADER is gEfiSmmVariableProtocolGuid. + */ +#define SMM_VARIABLE_FUNCTION_GET_STATISTICS 7 +/* + * The payload for this function is SMM_VARIABLE_COMMUNICATE_LOCK_VARIABLE + */ +#define SMM_VARIABLE_FUNCTION_LOCK_VARIABLE 8 + +#define SMM_VARIABLE_FUNCTION_VAR_CHECK_VARIABLE_PROPERTY_SET 9 + +#define SMM_VARIABLE_FUNCTION_VAR_CHECK_VARIABLE_PROPERTY_GET 10 + +#define SMM_VARIABLE_FUNCTION_GET_PAYLOAD_SIZE 11 +/* + * The payload for this function is + * SMM_VARIABLE_COMMUNICATE_RUNTIME_VARIABLE_CACHE_CONTEXT + */ +#define SMM_VARIABLE_FUNCTION_INIT_RUNTIME_VARIABLE_CACHE_CONTEXT 12 + +#define SMM_VARIABLE_FUNCTION_SYNC_RUNTIME_CACHE 13 +/* + * The payload for this function is + * SMM_VARIABLE_COMMUNICATE_GET_RUNTIME_CACHE_INFO + */ +#define SMM_VARIABLE_FUNCTION_GET_RUNTIME_CACHE_INFO 14 + +/** + * struct smm_variable_communicate_header - Used for SMM variable communication + + * @function: function to call in Smm. + * @ret_status: return status + * @data: payload + */ +struct smm_variable_communicate_header { + size_t function; + efi_status_t ret_status; + u8 data[]; +}; + +#define MM_VARIABLE_COMMUNICATE_SIZE \ + (sizeof(struct smm_variable_communicate_header)) + +/** + * struct smm_variable_access - Used to communicate with StMM by + * SetVariable and GetVariable. + + * @guid: vendor GUID + * @data_size: size of EFI variable data + * @name_size: size of EFI name + * @attr: attributes + * @name: variable name + * + */ +struct smm_variable_access { + efi_guid_t guid; + size_t data_size; + size_t name_size; + u32 attr; + u16 name[]; +}; + +#define MM_VARIABLE_ACCESS_HEADER_SIZE \ + (sizeof(struct smm_variable_access)) +/** + * struct smm_variable_payload_size - Used to get the max allowed + * payload used in StMM. + * + * @size: size to fill in + * + */ +struct smm_variable_payload_size { + size_t size; +}; + +/** + * struct smm_variable_getnext - Used to communicate with StMM for + * GetNextVariableName. + * + * @guid: vendor GUID + * @name_size: size of the name of the variable + * @name: variable name + * + */ +struct smm_variable_getnext { + efi_guid_t guid; + size_t name_size; + u16 name[]; +}; + +#define MM_VARIABLE_GET_NEXT_HEADER_SIZE \ + (sizeof(struct smm_variable_getnext)) + +/** + * struct smm_variable_query_info - Used to communicate with StMM for + * QueryVariableInfo. + * + * @max_variable_storage: max available storage + * @remaining_variable_storage: remaining available storage + * @max_variable_size: max variable supported size + * @attr: attributes to query storage for + * + */ +struct smm_variable_query_info { + u64 max_variable_storage; + u64 remaining_variable_storage; + u64 max_variable_size; + u32 attr; +}; + +#define VAR_CHECK_VARIABLE_PROPERTY_REVISION 0x0001 +#define VAR_CHECK_VARIABLE_PROPERTY_READ_ONLY BIT(0) +/** + * struct var_check_property - Used to store variable properties in StMM + * + * @revision: magic revision number for variable property checking + * @property: properties mask for the variable used in StMM. + * Currently RO flag is supported + * @attributes: variable attributes used in StMM checking when properties + * for a variable are enabled + * @minsize: minimum allowed size for variable payload checked against + * smm_variable_access->datasize in StMM + * @maxsize: maximum allowed size for variable payload checked against + * smm_variable_access->datasize in StMM + * + */ +struct var_check_property { + u16 revision; + u16 property; + u32 attributes; + size_t minsize; + size_t maxsize; +}; + +/** + * struct smm_variable_var_check_property - Used to communicate variable + * properties with StMM + * + * @guid: vendor GUID + * @name_size: size of EFI name + * @property: variable properties struct + * @name: variable name + * + */ +struct smm_variable_var_check_property { + efi_guid_t guid; + size_t name_size; + struct var_check_property property; + u16 name[]; +}; + +#endif /* _MM_COMMUNICATION_H_ */ diff --git a/drivers/firmware/efi/stmm/tee_stmm_efi.c b/drivers/firmware/efi/stmm/tee_stmm_efi.c new file mode 100644 index 000000000000..f741ca279052 --- /dev/null +++ b/drivers/firmware/efi/stmm/tee_stmm_efi.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * EFI variable service via TEE + * + * Copyright (C) 2022 Linaro + */ + +#include +#include +#include +#include +#include +#include +#include "mm_communication.h" + +static struct efivars tee_efivars; +static struct efivar_operations tee_efivar_ops; + +static size_t max_buffer_size; /* comm + var + func + data */ +static size_t max_payload_size; /* func + data */ + +struct tee_stmm_efi_private { + struct tee_context *ctx; + u32 session; + struct device *dev; +}; + +static struct tee_stmm_efi_private pvt_data; + +/* UUID of the stmm PTA */ +static const struct tee_client_device_id tee_stmm_efi_id_table[] = { + {PTA_STMM_UUID}, + {} +}; + +static int tee_ctx_match(struct tee_ioctl_version_data *ver, const void *data) +{ + /* currently only OP-TEE is supported as a communication path */ + if (ver->impl_id == TEE_IMPL_ID_OPTEE) + return 1; + else + return 0; +} + +/** + * tee_mm_communicate() - Pass a buffer to StandaloneMM running in TEE + * + * @comm_buf: locally allocated communication buffer + * @dsize: buffer size + * Return: status code + */ +static efi_status_t tee_mm_communicate(void *comm_buf, size_t dsize) +{ + size_t buf_size; + struct efi_mm_communicate_header *mm_hdr; + struct tee_ioctl_invoke_arg arg; + struct tee_param param[4]; + struct tee_shm *shm = NULL; + int rc; + + if (!comm_buf) + return EFI_INVALID_PARAMETER; + + mm_hdr = (struct efi_mm_communicate_header *)comm_buf; + buf_size = mm_hdr->message_len + sizeof(efi_guid_t) + sizeof(size_t); + + if (dsize != buf_size) + return EFI_INVALID_PARAMETER; + + shm = tee_shm_register_kernel_buf(pvt_data.ctx, comm_buf, buf_size); + if (IS_ERR(shm)) { + dev_err(pvt_data.dev, "Unable to register shared memory\n"); + return EFI_UNSUPPORTED; + } + + memset(&arg, 0, sizeof(arg)); + arg.func = PTA_STMM_CMD_COMMUNICATE; + arg.session = pvt_data.session; + arg.num_params = 4; + + memset(param, 0, sizeof(param)); + param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT; + param[0].u.memref.size = buf_size; + param[0].u.memref.shm = shm; + param[1].attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT; + param[2].attr = TEE_IOCTL_PARAM_ATTR_TYPE_NONE; + param[3].attr = TEE_IOCTL_PARAM_ATTR_TYPE_NONE; + + rc = tee_client_invoke_func(pvt_data.ctx, &arg, param); + tee_shm_free(shm); + + if (rc < 0 || arg.ret != 0) { + dev_err(pvt_data.dev, + "PTA_STMM_CMD_COMMUNICATE invoke error: 0x%x\n", arg.ret); + return EFI_DEVICE_ERROR; + } + + switch (param[1].u.value.a) { + case ARM_SVC_SPM_RET_SUCCESS: + return EFI_SUCCESS; + + case ARM_SVC_SPM_RET_INVALID_PARAMS: + return EFI_INVALID_PARAMETER; + + case ARM_SVC_SPM_RET_DENIED: + return EFI_ACCESS_DENIED; + + case ARM_SVC_SPM_RET_NO_MEMORY: + return EFI_OUT_OF_RESOURCES; + + default: + return EFI_ACCESS_DENIED; + } +} + +/** + * mm_communicate() - Adjust the communication buffer to StandAlonneMM and send + * it to TEE + * + * @comm_buf: locally allocated communication buffer, buffer should + * be enough big to have some headers and payload + * @payload_size: payload size + * Return: status code + */ +static efi_status_t mm_communicate(u8 *comm_buf, size_t payload_size) +{ + size_t dsize; + efi_status_t ret; + struct efi_mm_communicate_header *mm_hdr; + struct smm_variable_communicate_header *var_hdr; + + dsize = payload_size + MM_COMMUNICATE_HEADER_SIZE + + MM_VARIABLE_COMMUNICATE_SIZE; + mm_hdr = (struct efi_mm_communicate_header *)comm_buf; + var_hdr = (struct smm_variable_communicate_header *)mm_hdr->data; + + ret = tee_mm_communicate(comm_buf, dsize); + if (ret != EFI_SUCCESS) { + dev_err(pvt_data.dev, "%s failed!\n", __func__); + return ret; + } + + return var_hdr->ret_status; +} + +/** + * setup_mm_hdr() - Allocate a buffer for StandAloneMM and initialize the + * header data. + * + * @dptr: pointer address to store allocated buffer + * @payload_size: payload size + * @func: standAloneMM function number + * @ret: EFI return code + * Return: pointer to corresponding StandAloneMM function buffer or NULL + */ +static void *setup_mm_hdr(u8 **dptr, size_t payload_size, size_t func, + efi_status_t *ret) +{ + const efi_guid_t mm_var_guid = EFI_MM_VARIABLE_GUID; + struct efi_mm_communicate_header *mm_hdr; + struct smm_variable_communicate_header *var_hdr; + u8 *comm_buf; + + /* In the init function we initialize max_buffer_size with + * get_max_payload(). So skip the test if max_buffer_size is initialized + * StandAloneMM will perform similar checks and drop the buffer if it's + * too long + */ + if (max_buffer_size && + max_buffer_size < (MM_COMMUNICATE_HEADER_SIZE + + MM_VARIABLE_COMMUNICATE_SIZE + payload_size)) { + *ret = EFI_INVALID_PARAMETER; + return NULL; + } + + comm_buf = kzalloc(MM_COMMUNICATE_HEADER_SIZE + + MM_VARIABLE_COMMUNICATE_SIZE + payload_size, + GFP_KERNEL); + if (!comm_buf) { + *ret = EFI_OUT_OF_RESOURCES; + return NULL; + } + + mm_hdr = (struct efi_mm_communicate_header *)comm_buf; + memcpy(&mm_hdr->header_guid, &mm_var_guid, sizeof(mm_hdr->header_guid)); + mm_hdr->message_len = MM_VARIABLE_COMMUNICATE_SIZE + payload_size; + + var_hdr = (struct smm_variable_communicate_header *)mm_hdr->data; + var_hdr->function = func; + if (dptr) + *dptr = comm_buf; + *ret = EFI_SUCCESS; + + return var_hdr->data; +} + +/** + * get_max_payload() - Get variable payload size from StandAloneMM. + * + * @size: size of the variable in storage + * Return: status code + */ +static efi_status_t get_max_payload(size_t *size) +{ + struct smm_variable_payload_size *var_payload = NULL; + size_t payload_size; + u8 *comm_buf = NULL; + efi_status_t ret; + + if (!size) + return EFI_INVALID_PARAMETER; + + payload_size = sizeof(*var_payload); + var_payload = setup_mm_hdr(&comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_GET_PAYLOAD_SIZE, + &ret); + if (!var_payload) + return EFI_OUT_OF_RESOURCES; + + ret = mm_communicate(comm_buf, payload_size); + if (ret != EFI_SUCCESS) + goto out; + + /* Make sure the buffer is big enough for storing variables */ + if (var_payload->size < MM_VARIABLE_ACCESS_HEADER_SIZE + 0x20) { + ret = EFI_DEVICE_ERROR; + goto out; + } + *size = var_payload->size; + /* + * There seems to be a bug in EDK2 miscalculating the boundaries and + * size checks, so deduct 2 more bytes to fulfill this requirement. Fix + * it up here to ensure backwards compatibility with older versions + * (cf. StandaloneMmPkg/Drivers/StandaloneMmCpu/AArch64/EventHandle.c. + * sizeof (EFI_MM_COMMUNICATE_HEADER) instead the size minus the + * flexible array member). + * + * size is guaranteed to be > 2 due to checks on the beginning. + */ + *size -= 2; +out: + kfree(comm_buf); + return ret; +} + +static efi_status_t get_property_int(u16 *name, size_t name_size, + const efi_guid_t *vendor, + struct var_check_property *var_property) +{ + struct smm_variable_var_check_property *smm_property; + size_t payload_size; + u8 *comm_buf = NULL; + efi_status_t ret; + + memset(var_property, 0, sizeof(*var_property)); + payload_size = sizeof(*smm_property) + name_size; + if (payload_size > max_payload_size) + return EFI_INVALID_PARAMETER; + + smm_property = setup_mm_hdr( + &comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_VAR_CHECK_VARIABLE_PROPERTY_GET, &ret); + if (!smm_property) + return EFI_OUT_OF_RESOURCES; + + memcpy(&smm_property->guid, vendor, sizeof(smm_property->guid)); + smm_property->name_size = name_size; + memcpy(smm_property->name, name, name_size); + + ret = mm_communicate(comm_buf, payload_size); + /* + * Currently only R/O property is supported in StMM. + * Variables that are not set to R/O will not set the property in StMM + * and the call will return EFI_NOT_FOUND. We are setting the + * properties to 0x0 so checking against that is enough for the + * EFI_NOT_FOUND case. + */ + if (ret == EFI_NOT_FOUND) + ret = EFI_SUCCESS; + if (ret != EFI_SUCCESS) + goto out; + memcpy(var_property, &smm_property->property, sizeof(*var_property)); + +out: + kfree(comm_buf); + return ret; +} + +static efi_status_t tee_get_variable(u16 *name, efi_guid_t *vendor, + u32 *attributes, unsigned long *data_size, + void *data) +{ + struct var_check_property var_property; + struct smm_variable_access *var_acc; + size_t payload_size; + size_t name_size; + size_t tmp_dsize; + u8 *comm_buf = NULL; + efi_status_t ret; + + if (!name || !vendor || !data_size) + return EFI_INVALID_PARAMETER; + + name_size = (ucs2_strnlen(name, EFI_VAR_NAME_LEN) + 1) * sizeof(u16); + if (name_size > max_payload_size - MM_VARIABLE_ACCESS_HEADER_SIZE) + return EFI_INVALID_PARAMETER; + + /* Trim output buffer size */ + tmp_dsize = *data_size; + if (name_size + tmp_dsize > + max_payload_size - MM_VARIABLE_ACCESS_HEADER_SIZE) { + tmp_dsize = max_payload_size - MM_VARIABLE_ACCESS_HEADER_SIZE - + name_size; + } + + payload_size = MM_VARIABLE_ACCESS_HEADER_SIZE + name_size + tmp_dsize; + var_acc = setup_mm_hdr(&comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_GET_VARIABLE, &ret); + if (!var_acc) + return EFI_OUT_OF_RESOURCES; + + /* Fill in contents */ + memcpy(&var_acc->guid, vendor, sizeof(var_acc->guid)); + var_acc->data_size = tmp_dsize; + var_acc->name_size = name_size; + var_acc->attr = attributes ? *attributes : 0; + memcpy(var_acc->name, name, name_size); + + ret = mm_communicate(comm_buf, payload_size); + if (ret == EFI_SUCCESS || ret == EFI_BUFFER_TOO_SMALL) + /* Update with reported data size for trimmed case */ + *data_size = var_acc->data_size; + if (ret != EFI_SUCCESS) + goto out; + + ret = get_property_int(name, name_size, vendor, &var_property); + if (ret != EFI_SUCCESS) + goto out; + + if (attributes) + *attributes = var_acc->attr; + + if (!data) { + ret = EFI_INVALID_PARAMETER; + goto out; + } + memcpy(data, (u8 *)var_acc->name + var_acc->name_size, + var_acc->data_size); +out: + kfree(comm_buf); + return ret; +} + +static efi_status_t tee_get_next_variable(unsigned long *name_size, + efi_char16_t *name, efi_guid_t *guid) +{ + struct smm_variable_getnext *var_getnext; + size_t payload_size; + size_t out_name_size; + size_t in_name_size; + u8 *comm_buf = NULL; + efi_status_t ret; + + if (!name_size || !name || !guid) + return EFI_INVALID_PARAMETER; + + out_name_size = *name_size; + in_name_size = (ucs2_strnlen(name, EFI_VAR_NAME_LEN) + 1) * sizeof(u16); + + if (out_name_size < in_name_size) + return EFI_INVALID_PARAMETER; + + if (in_name_size > max_payload_size - MM_VARIABLE_GET_NEXT_HEADER_SIZE) + return EFI_INVALID_PARAMETER; + + /* Trim output buffer size */ + if (out_name_size > max_payload_size - MM_VARIABLE_GET_NEXT_HEADER_SIZE) + out_name_size = + max_payload_size - MM_VARIABLE_GET_NEXT_HEADER_SIZE; + + payload_size = MM_VARIABLE_GET_NEXT_HEADER_SIZE + out_name_size; + var_getnext = setup_mm_hdr(&comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_GET_NEXT_VARIABLE_NAME, + &ret); + if (!var_getnext) + return EFI_OUT_OF_RESOURCES; + + /* Fill in contents */ + memcpy(&var_getnext->guid, guid, sizeof(var_getnext->guid)); + var_getnext->name_size = out_name_size; + memcpy(var_getnext->name, name, in_name_size); + memset((u8 *)var_getnext->name + in_name_size, 0x0, + out_name_size - in_name_size); + + ret = mm_communicate(comm_buf, payload_size); + if (ret == EFI_SUCCESS || ret == EFI_BUFFER_TOO_SMALL) { + /* Update with reported data size for trimmed case */ + *name_size = var_getnext->name_size; + } + if (ret != EFI_SUCCESS) + goto out; + + memcpy(guid, &var_getnext->guid, sizeof(*guid)); + memcpy(name, var_getnext->name, var_getnext->name_size); + +out: + kfree(comm_buf); + return ret; +} + +static efi_status_t tee_set_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 attributes, unsigned long data_size, + void *data) +{ + efi_status_t ret; + struct var_check_property var_property; + struct smm_variable_access *var_acc; + size_t payload_size; + size_t name_size; + u8 *comm_buf = NULL; + + if (!name || name[0] == 0 || !vendor) + return EFI_INVALID_PARAMETER; + + if (data_size > 0 && !data) + return EFI_INVALID_PARAMETER; + + /* Check payload size */ + name_size = (ucs2_strnlen(name, EFI_VAR_NAME_LEN) + 1) * sizeof(u16); + payload_size = MM_VARIABLE_ACCESS_HEADER_SIZE + name_size + data_size; + if (payload_size > max_payload_size) + return EFI_INVALID_PARAMETER; + + /* + * Allocate the buffer early, before switching to RW (if needed) + * so we won't need to account for any failures in reading/setting + * the properties, if the allocation fails + */ + var_acc = setup_mm_hdr(&comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_SET_VARIABLE, &ret); + if (!var_acc) + return EFI_OUT_OF_RESOURCES; + + /* + * The API has the ability to override RO flags. If no RO check was + * requested switch the variable to RW for the duration of this call + */ + ret = get_property_int(name, name_size, vendor, &var_property); + if (ret != EFI_SUCCESS) { + dev_err(pvt_data.dev, "Getting variable property failed\n"); + goto out; + } + + if (var_property.property & VAR_CHECK_VARIABLE_PROPERTY_READ_ONLY) { + ret = EFI_WRITE_PROTECTED; + goto out; + } + + /* Fill in contents */ + memcpy(&var_acc->guid, vendor, sizeof(var_acc->guid)); + var_acc->data_size = data_size; + var_acc->name_size = name_size; + var_acc->attr = attributes; + memcpy(var_acc->name, name, name_size); + memcpy((u8 *)var_acc->name + name_size, data, data_size); + + ret = mm_communicate(comm_buf, payload_size); + dev_dbg(pvt_data.dev, "Set Variable %s %d %lx\n", __FILE__, __LINE__, ret); +out: + kfree(comm_buf); + return ret; +} + +static efi_status_t tee_set_variable_nonblocking(efi_char16_t *name, + efi_guid_t *vendor, + u32 attributes, + unsigned long data_size, + void *data) +{ + return EFI_UNSUPPORTED; +} + +static efi_status_t tee_query_variable_info(u32 attributes, + u64 *max_variable_storage_size, + u64 *remain_variable_storage_size, + u64 *max_variable_size) +{ + struct smm_variable_query_info *mm_query_info; + size_t payload_size; + efi_status_t ret; + u8 *comm_buf; + + payload_size = sizeof(*mm_query_info); + mm_query_info = setup_mm_hdr(&comm_buf, payload_size, + SMM_VARIABLE_FUNCTION_QUERY_VARIABLE_INFO, + &ret); + if (!mm_query_info) + return EFI_OUT_OF_RESOURCES; + + mm_query_info->attr = attributes; + ret = mm_communicate(comm_buf, payload_size); + if (ret != EFI_SUCCESS) + goto out; + *max_variable_storage_size = mm_query_info->max_variable_storage; + *remain_variable_storage_size = + mm_query_info->remaining_variable_storage; + *max_variable_size = mm_query_info->max_variable_size; + +out: + kfree(comm_buf); + return ret; +} + +static void tee_stmm_efi_close_context(void *data) +{ + tee_client_close_context(pvt_data.ctx); +} + +static void tee_stmm_efi_close_session(void *data) +{ + tee_client_close_session(pvt_data.ctx, pvt_data.session); +} + +static void tee_stmm_restore_efivars_generic_ops(void) +{ + efivars_unregister(&tee_efivars); + efivars_generic_ops_register(); +} + +static int tee_stmm_efi_probe(struct device *dev) +{ + struct tee_ioctl_open_session_arg sess_arg; + efi_status_t ret; + int rc; + + pvt_data.ctx = tee_client_open_context(NULL, tee_ctx_match, NULL, NULL); + if (IS_ERR(pvt_data.ctx)) + return -ENODEV; + + rc = devm_add_action_or_reset(dev, tee_stmm_efi_close_context, NULL); + if (rc) + return rc; + + /* Open session with StMM PTA */ + memset(&sess_arg, 0, sizeof(sess_arg)); + export_uuid(sess_arg.uuid, &tee_stmm_efi_id_table[0].uuid); + rc = tee_client_open_session(pvt_data.ctx, &sess_arg, NULL); + if ((rc < 0) || (sess_arg.ret != 0)) { + dev_err(dev, "tee_client_open_session failed, err: %x\n", + sess_arg.ret); + return -EINVAL; + } + pvt_data.session = sess_arg.session; + pvt_data.dev = dev; + rc = devm_add_action_or_reset(dev, tee_stmm_efi_close_session, NULL); + if (rc) + return rc; + + ret = get_max_payload(&max_payload_size); + if (ret != EFI_SUCCESS) + return -EIO; + + max_buffer_size = MM_COMMUNICATE_HEADER_SIZE + + MM_VARIABLE_COMMUNICATE_SIZE + + max_payload_size; + + tee_efivar_ops.get_variable = tee_get_variable; + tee_efivar_ops.get_next_variable = tee_get_next_variable; + tee_efivar_ops.set_variable = tee_set_variable; + tee_efivar_ops.set_variable_nonblocking = tee_set_variable_nonblocking; + tee_efivar_ops.query_variable_store = efi_query_variable_store; + tee_efivar_ops.query_variable_info = tee_query_variable_info; + + efivars_generic_ops_unregister(); + pr_info("Using TEE-based EFI runtime variable services\n"); + efivars_register(&tee_efivars, &tee_efivar_ops); + + return 0; +} + +static int tee_stmm_efi_remove(struct device *dev) +{ + tee_stmm_restore_efivars_generic_ops(); + + return 0; +} + +MODULE_DEVICE_TABLE(tee, tee_stmm_efi_id_table); + +static struct tee_client_driver tee_stmm_efi_driver = { + .id_table = tee_stmm_efi_id_table, + .driver = { + .name = "tee-stmm-efi", + .bus = &tee_bus_type, + .probe = tee_stmm_efi_probe, + .remove = tee_stmm_efi_remove, + }, +}; + +static int __init tee_stmm_efi_mod_init(void) +{ + return driver_register(&tee_stmm_efi_driver.driver); +} + +static void __exit tee_stmm_efi_mod_exit(void) +{ + driver_unregister(&tee_stmm_efi_driver.driver); +} + +module_init(tee_stmm_efi_mod_init); +module_exit(tee_stmm_efi_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ilias Apalodimas "); +MODULE_AUTHOR("Masahisa Kojima "); +MODULE_DESCRIPTION("TEE based EFI runtime variable service driver"); From 94f7f6182c72ba642c1f20111681f9cc8621c95f Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:55 +0900 Subject: [PATCH 0598/1562] efivarfs: automatically update super block flag efivar operation is updated when the tee_stmm_efi module is probed. tee_stmm_efi module supports SetVariable runtime service, but user needs to manually remount the efivarfs as RW to enable the write access if the previous efivar operation does not support SetVariable and efivarfs is mounted as read-only. This commit notifies the update of efivar operation to efivarfs subsystem, then drops SB_RDONLY flag if the efivar operation supports SetVariable. Signed-off-by: Masahisa Kojima [ardb: use per-superblock instance of the notifier block] Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 6 ++++++ drivers/firmware/efi/vars.c | 8 ++++++++ fs/efivarfs/internal.h | 2 ++ fs/efivarfs/super.c | 27 +++++++++++++++++++++++++++ include/linux/efi.h | 8 ++++++++ 5 files changed, 51 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 32a67c61c3b8..4fcda50acfa4 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -187,6 +188,9 @@ static const struct attribute_group efi_subsys_attr_group = { .is_visible = efi_attr_is_visible, }; +struct blocking_notifier_head efivar_ops_nh; +EXPORT_SYMBOL_GPL(efivar_ops_nh); + static struct efivars generic_efivars; static struct efivar_operations generic_ops; @@ -431,6 +435,8 @@ static int __init efisubsys_init(void) platform_device_register_simple("efivars", 0, NULL, 0); } + BLOCKING_INIT_NOTIFIER_HEAD(&efivar_ops_nh); + error = sysfs_create_group(efi_kobj, &efi_subsys_attr_group); if (error) { pr_err("efi: Sysfs attribute export failed with error %d.\n", diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index e9dc7116daf1..f654e6f6af87 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -63,6 +63,7 @@ int efivars_register(struct efivars *efivars, const struct efivar_operations *ops) { int rv; + int event; if (down_interruptible(&efivars_lock)) return -EINTR; @@ -77,6 +78,13 @@ int efivars_register(struct efivars *efivars, __efivars = efivars; + if (efivar_supports_writes()) + event = EFIVAR_OPS_RDWR; + else + event = EFIVAR_OPS_RDONLY; + + blocking_notifier_call_chain(&efivar_ops_nh, event, NULL); + pr_info("Registered efivars operations\n"); rv = 0; out: diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 1dc0ccce3cc3..169252e6dc46 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -17,6 +17,8 @@ struct efivarfs_mount_opts { struct efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; struct list_head efivarfs_list; + struct super_block *sb; + struct notifier_block nb; }; struct efi_variable { diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index cee325b5bbdd..6038dd39367a 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -15,10 +15,30 @@ #include #include #include +#include #include #include "internal.h" +static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event, + void *data) +{ + struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, nb); + + switch (event) { + case EFIVAR_OPS_RDONLY: + sfi->sb->s_flags |= SB_RDONLY; + break; + case EFIVAR_OPS_RDWR: + sfi->sb->s_flags &= ~SB_RDONLY; + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + static void efivarfs_evict_inode(struct inode *inode) { clear_inode(inode); @@ -317,6 +337,12 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (!root) return -ENOMEM; + sfi->sb = sb; + sfi->nb.notifier_call = efivarfs_ops_notifier; + err = blocking_notifier_chain_register(&efivar_ops_nh, &sfi->nb); + if (err) + return err; + err = efivar_init(efivarfs_callback, (void *)sb, true, &sfi->efivarfs_list); if (err) @@ -371,6 +397,7 @@ static void efivarfs_kill_sb(struct super_block *sb) { struct efivarfs_fs_info *sfi = sb->s_fs_info; + blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); kill_litter_super(sb); /* Remove all entries and destroy */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 3668aa204c47..c74f47711f0b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1349,6 +1349,14 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table) umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n); +/* + * efivar ops event type + */ +#define EFIVAR_OPS_RDONLY 0 +#define EFIVAR_OPS_RDWR 1 + +extern struct blocking_notifier_head efivar_ops_nh; + void efivars_generic_ops_register(void); void efivars_generic_ops_unregister(void); From f763fd73b181c7c5117e0d8a4da3dd4237737b86 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:39 +0100 Subject: [PATCH 0599/1562] platform/x86: wmi: Remove debug_dump_wdg module param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functionality of dumping WDG entries is better provided by userspace tools like "fwts wmi", which also does not suffer from garbled printk output caused by pr_cont(). Reviewed-by: Ilpo Järvinen Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231210202443.646427-2-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 4f94e4b117f1..e8019bc19b4f 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -90,11 +90,6 @@ module_param(debug_event, bool, 0444); MODULE_PARM_DESC(debug_event, "Log WMI Events [0/1]"); -static bool debug_dump_wdg; -module_param(debug_dump_wdg, bool, 0444); -MODULE_PARM_DESC(debug_dump_wdg, - "Dump available WMI interfaces [0/1]"); - static const struct acpi_device_id wmi_device_ids[] = { {"PNP0C14", 0}, {"pnp0c14", 0}, @@ -597,29 +592,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct } EXPORT_SYMBOL_GPL(wmidev_block_set); -static void wmi_dump_wdg(const struct guid_block *g) -{ - pr_info("%pUL:\n", &g->guid); - if (g->flags & ACPI_WMI_EVENT) - pr_info("\tnotify_id: 0x%02X\n", g->notify_id); - else - pr_info("\tobject_id: %2pE\n", g->object_id); - pr_info("\tinstance_count: %d\n", g->instance_count); - pr_info("\tflags: %#x", g->flags); - if (g->flags) { - if (g->flags & ACPI_WMI_EXPENSIVE) - pr_cont(" ACPI_WMI_EXPENSIVE"); - if (g->flags & ACPI_WMI_METHOD) - pr_cont(" ACPI_WMI_METHOD"); - if (g->flags & ACPI_WMI_STRING) - pr_cont(" ACPI_WMI_STRING"); - if (g->flags & ACPI_WMI_EVENT) - pr_cont(" ACPI_WMI_EVENT"); - } - pr_cont("\n"); - -} - static void wmi_notify_debug(u32 value, void *context) { struct acpi_buffer response = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -1343,9 +1315,6 @@ static int parse_wdg(struct device *wmi_bus_dev, struct platform_device *pdev) total = obj->buffer.length / sizeof(struct guid_block); for (i = 0; i < total; i++) { - if (debug_dump_wdg) - wmi_dump_wdg(&gblock[i]); - if (!gblock[i].instance_count) { dev_info(wmi_bus_dev, FW_INFO "%pUL has zero instances\n", &gblock[i].guid); continue; From ed72a2b50b755327f411f8199c4120d8bc7687cf Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:40 +0100 Subject: [PATCH 0600/1562] platform/x86: wmi: Remove debug_event module param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users can already listen to ACPI WMI events through the ACPI netlink interface. The old wmi_notify_debug() interface also uses the deprecated GUID-based interface. Remove it to make the event handling code more readable. Reviewed-by: Ilpo Järvinen Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231210202443.646427-3-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 74 ++++---------------------------------- 1 file changed, 7 insertions(+), 67 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index e8019bc19b4f..7df5b5ee7983 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -85,11 +85,6 @@ struct wmi_block { #define ACPI_WMI_STRING BIT(2) /* GUID takes & returns a string */ #define ACPI_WMI_EVENT BIT(3) /* GUID is an event */ -static bool debug_event; -module_param(debug_event, bool, 0444); -MODULE_PARM_DESC(debug_event, - "Log WMI Events [0/1]"); - static const struct acpi_device_id wmi_device_ids[] = { {"PNP0C14", 0}, {"pnp0c14", 0}, @@ -592,42 +587,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct } EXPORT_SYMBOL_GPL(wmidev_block_set); -static void wmi_notify_debug(u32 value, void *context) -{ - struct acpi_buffer response = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *obj; - acpi_status status; - - status = wmi_get_event_data(value, &response); - if (status != AE_OK) { - pr_info("bad event status 0x%x\n", status); - return; - } - - obj = response.pointer; - if (!obj) - return; - - pr_info("DEBUG: event 0x%02X ", value); - switch (obj->type) { - case ACPI_TYPE_BUFFER: - pr_cont("BUFFER_TYPE - length %u\n", obj->buffer.length); - break; - case ACPI_TYPE_STRING: - pr_cont("STRING_TYPE - %s\n", obj->string.pointer); - break; - case ACPI_TYPE_INTEGER: - pr_cont("INTEGER_TYPE - %llu\n", obj->integer.value); - break; - case ACPI_TYPE_PACKAGE: - pr_cont("PACKAGE_TYPE - %u elements\n", obj->package.count); - break; - default: - pr_cont("object type 0x%X\n", obj->type); - } - kfree(obj); -} - /** * wmi_install_notify_handler - Register handler for WMI events (deprecated) * @guid: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba @@ -656,8 +615,7 @@ acpi_status wmi_install_notify_handler(const char *guid, acpi_status wmi_status; if (guid_equal(&block->gblock.guid, &guid_input)) { - if (block->handler && - block->handler != wmi_notify_debug) + if (block->handler) return AE_ALREADY_ACQUIRED; block->handler = handler; @@ -698,22 +656,14 @@ acpi_status wmi_remove_notify_handler(const char *guid) acpi_status wmi_status; if (guid_equal(&block->gblock.guid, &guid_input)) { - if (!block->handler || - block->handler == wmi_notify_debug) + if (!block->handler) return AE_NULL_ENTRY; - if (debug_event) { - block->handler = wmi_notify_debug; - status = AE_OK; - } else { - wmi_status = wmi_method_enable(block, false); - block->handler = NULL; - block->handler_data = NULL; - if ((wmi_status != AE_OK) || - ((wmi_status == AE_OK) && - (status == AE_NOT_EXIST))) - status = wmi_status; - } + wmi_status = wmi_method_enable(block, false); + block->handler = NULL; + block->handler_data = NULL; + if (wmi_status != AE_OK || (wmi_status == AE_OK && status == AE_NOT_EXIST)) + status = wmi_status; } } @@ -1340,17 +1290,10 @@ static int parse_wdg(struct device *wmi_bus_dev, struct platform_device *pdev) list_add_tail(&wblock->list, &wmi_block_list); - if (debug_event) { - wblock->handler = wmi_notify_debug; - wmi_method_enable(wblock, true); - } - retval = wmi_add_device(pdev, &wblock->dev); if (retval) { dev_err(wmi_bus_dev, "failed to register %pUL\n", &wblock->gblock.guid); - if (debug_event) - wmi_method_enable(wblock, false); list_del(&wblock->list); put_device(&wblock->dev.dev); @@ -1445,9 +1388,6 @@ static void acpi_wmi_notify_handler(acpi_handle handle, u32 event, wblock->handler(event, wblock->handler_data); } - if (debug_event) - pr_info("DEBUG: GUID %pUL event 0x%02X\n", &wblock->gblock.guid, event); - acpi_bus_generate_netlink_event( wblock->acpi_device->pnp.device_class, dev_name(&wblock->dev.dev), From ba358964cb8f24f28d543c6ce18a4af64961ab62 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:41 +0100 Subject: [PATCH 0601/1562] platform/x86: dell-smbios-wmi: Use devm_get_free_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use devres version of __get_free_pages() to simplify the error handling code. Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231210202443.646427-4-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/dell-smbios-wmi.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/dell/dell-smbios-wmi.c b/drivers/platform/x86/dell/dell-smbios-wmi.c index 931cc50136de..7eb7c61bb27d 100644 --- a/drivers/platform/x86/dell/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell/dell-smbios-wmi.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -183,7 +184,7 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) return ret; count = get_order(priv->req_buf_size); - priv->buf = (void *)__get_free_pages(GFP_KERNEL, count); + priv->buf = (void *)devm_get_free_pages(&wdev->dev, GFP_KERNEL, count); if (!priv->buf) return -ENOMEM; @@ -191,7 +192,7 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) wdev->dev.id = 1; ret = dell_smbios_register_device(&wdev->dev, &dell_smbios_wmi_call); if (ret) - goto fail_register; + return ret; priv->wdev = wdev; dev_set_drvdata(&wdev->dev, priv); @@ -200,24 +201,17 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) mutex_unlock(&list_mutex); return 0; - -fail_register: - free_pages((unsigned long)priv->buf, count); - return ret; } static void dell_smbios_wmi_remove(struct wmi_device *wdev) { struct wmi_smbios_priv *priv = dev_get_drvdata(&wdev->dev); - int count; mutex_lock(&call_mutex); mutex_lock(&list_mutex); list_del(&priv->list); mutex_unlock(&list_mutex); dell_smbios_unregister_device(&wdev->dev); - count = get_order(priv->req_buf_size); - free_pages((unsigned long)priv->buf, count); mutex_unlock(&call_mutex); } From 93885e85a77f25a1de57d47572a00633717fe1d4 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:42 +0100 Subject: [PATCH 0602/1562] platform/x86: dell-smbios-wmi: Stop using WMI chardev The WMI chardev API will be removed in the near future. Reimplement the necessary bits used by this driver so that userspace software depending on it does no break. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231210202443.646427-5-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/dell-smbios-wmi.c | 159 ++++++++++++++------ 1 file changed, 116 insertions(+), 43 deletions(-) diff --git a/drivers/platform/x86/dell/dell-smbios-wmi.c b/drivers/platform/x86/dell/dell-smbios-wmi.c index 7eb7c61bb27d..ae9012549560 100644 --- a/drivers/platform/x86/dell/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell/dell-smbios-wmi.c @@ -8,11 +8,14 @@ #include #include +#include #include +#include #include #include #include #include +#include #include "dell-smbios.h" #include "dell-wmi-descriptor.h" @@ -33,7 +36,8 @@ struct wmi_smbios_priv { struct list_head list; struct wmi_device *wdev; struct device *child; - u32 req_buf_size; + u64 req_buf_size; + struct miscdevice char_dev; }; static LIST_HEAD(wmi_list); @@ -109,48 +113,115 @@ out_wmi_call: return ret; } -static long dell_smbios_wmi_filter(struct wmi_device *wdev, unsigned int cmd, - struct wmi_ioctl_buffer *arg) +static int dell_smbios_wmi_open(struct inode *inode, struct file *filp) { struct wmi_smbios_priv *priv; - int ret = 0; - switch (cmd) { - case DELL_WMI_SMBIOS_CMD: - mutex_lock(&call_mutex); - priv = dev_get_drvdata(&wdev->dev); - if (!priv) { - ret = -ENODEV; - goto fail_smbios_cmd; - } - memcpy(priv->buf, arg, priv->req_buf_size); - if (dell_smbios_call_filter(&wdev->dev, &priv->buf->std)) { - dev_err(&wdev->dev, "Invalid call %d/%d:%8x\n", - priv->buf->std.cmd_class, - priv->buf->std.cmd_select, - priv->buf->std.input[0]); - ret = -EFAULT; - goto fail_smbios_cmd; - } - ret = run_smbios_call(priv->wdev); - if (ret) - goto fail_smbios_cmd; - memcpy(arg, priv->buf, priv->req_buf_size); -fail_smbios_cmd: - mutex_unlock(&call_mutex); - break; - default: - ret = -ENOIOCTLCMD; + priv = container_of(filp->private_data, struct wmi_smbios_priv, char_dev); + filp->private_data = priv; + + return nonseekable_open(inode, filp); +} + +static ssize_t dell_smbios_wmi_read(struct file *filp, char __user *buffer, size_t length, + loff_t *offset) +{ + struct wmi_smbios_priv *priv = filp->private_data; + + return simple_read_from_buffer(buffer, length, offset, &priv->req_buf_size, + sizeof(priv->req_buf_size)); +} + +static long dell_smbios_wmi_do_ioctl(struct wmi_smbios_priv *priv, + struct dell_wmi_smbios_buffer __user *arg) +{ + long ret; + + if (get_user(priv->buf->length, &arg->length)) + return -EFAULT; + + if (priv->buf->length < priv->req_buf_size) + return -EINVAL; + + /* if it's too big, warn, driver will only use what is needed */ + if (priv->buf->length > priv->req_buf_size) + dev_err(&priv->wdev->dev, "Buffer %llu is bigger than required %llu\n", + priv->buf->length, priv->req_buf_size); + + if (copy_from_user(priv->buf, arg, priv->req_buf_size)) + return -EFAULT; + + if (dell_smbios_call_filter(&priv->wdev->dev, &priv->buf->std)) { + dev_err(&priv->wdev->dev, "Invalid call %d/%d:%8x\n", + priv->buf->std.cmd_class, + priv->buf->std.cmd_select, + priv->buf->std.input[0]); + + return -EINVAL; } + + ret = run_smbios_call(priv->wdev); + if (ret) + return ret; + + if (copy_to_user(arg, priv->buf, priv->req_buf_size)) + return -EFAULT; + + return 0; +} + +static long dell_smbios_wmi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct dell_wmi_smbios_buffer __user *input = (struct dell_wmi_smbios_buffer __user *)arg; + struct wmi_smbios_priv *priv = filp->private_data; + long ret; + + if (cmd != DELL_WMI_SMBIOS_CMD) + return -ENOIOCTLCMD; + + mutex_lock(&call_mutex); + ret = dell_smbios_wmi_do_ioctl(priv, input); + mutex_unlock(&call_mutex); + return ret; } +static const struct file_operations dell_smbios_wmi_fops = { + .owner = THIS_MODULE, + .open = dell_smbios_wmi_open, + .read = dell_smbios_wmi_read, + .unlocked_ioctl = dell_smbios_wmi_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static void dell_smbios_wmi_unregister_chardev(void *data) +{ + struct miscdevice *char_dev = data; + + misc_deregister(char_dev); +} + +static int dell_smbios_wmi_register_chardev(struct wmi_smbios_priv *priv) +{ + int ret; + + priv->char_dev.minor = MISC_DYNAMIC_MINOR; + priv->char_dev.name = "wmi/dell-smbios"; + priv->char_dev.fops = &dell_smbios_wmi_fops; + priv->char_dev.mode = 0444; + + ret = misc_register(&priv->char_dev); + if (ret < 0) + return ret; + + return devm_add_action_or_reset(&priv->wdev->dev, dell_smbios_wmi_unregister_chardev, + &priv->char_dev); +} + static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) { - struct wmi_driver *wdriver = - container_of(wdev->dev.driver, struct wmi_driver, driver); struct wmi_smbios_priv *priv; - u32 hotfix; + u32 buffer_size, hotfix; int count; int ret; @@ -163,39 +234,42 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) if (!priv) return -ENOMEM; + priv->wdev = wdev; + dev_set_drvdata(&wdev->dev, priv); + /* WMI buffer size will be either 4k or 32k depending on machine */ - if (!dell_wmi_get_size(&priv->req_buf_size)) + if (!dell_wmi_get_size(&buffer_size)) return -EPROBE_DEFER; + priv->req_buf_size = buffer_size; + /* some SMBIOS calls fail unless BIOS contains hotfix */ if (!dell_wmi_get_hotfix(&hotfix)) return -EPROBE_DEFER; - if (!hotfix) { + + if (!hotfix) dev_warn(&wdev->dev, "WMI SMBIOS userspace interface not supported(%u), try upgrading to a newer BIOS\n", hotfix); - wdriver->filter_callback = NULL; - } /* add in the length object we will use internally with ioctl */ priv->req_buf_size += sizeof(u64); - ret = set_required_buffer_size(wdev, priv->req_buf_size); - if (ret) - return ret; count = get_order(priv->req_buf_size); priv->buf = (void *)devm_get_free_pages(&wdev->dev, GFP_KERNEL, count); if (!priv->buf) return -ENOMEM; + ret = dell_smbios_wmi_register_chardev(priv); + if (ret) + return ret; + /* ID is used by dell-smbios to set priority of drivers */ wdev->dev.id = 1; ret = dell_smbios_register_device(&wdev->dev, &dell_smbios_wmi_call); if (ret) return ret; - priv->wdev = wdev; - dev_set_drvdata(&wdev->dev, priv); mutex_lock(&list_mutex); list_add_tail(&priv->list, &wmi_list); mutex_unlock(&list_mutex); @@ -250,7 +324,6 @@ static struct wmi_driver dell_smbios_wmi_driver = { .probe = dell_smbios_wmi_probe, .remove = dell_smbios_wmi_remove, .id_table = dell_smbios_wmi_id_table, - .filter_callback = dell_smbios_wmi_filter, }; int init_dell_smbios_wmi(void) From 704af3a40747e395b67892127943e6ffd5e2b642 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:43 +0100 Subject: [PATCH 0603/1562] platform/x86: wmi: Remove chardev interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The design of the WMI chardev interface is broken: - it assumes that WMI drivers are not instantiated twice - it offers next to no abstractions, the WMI driver gets a raw byte buffer - it is only used by a single driver, something which is unlikely to change Since the only user (dell-smbios-wmi) has been migrated to his own ioctl interface, remove it. Reviewed-by: Ilpo Järvinen Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231210202443.646427-6-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 180 ++----------------------------------- include/linux/wmi.h | 8 -- 2 files changed, 5 insertions(+), 183 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 7df5b5ee7983..7303702290e5 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -23,17 +23,14 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include -#include MODULE_AUTHOR("Carlos Corbacho"); MODULE_DESCRIPTION("ACPI-WMI Mapping Driver"); @@ -66,12 +63,9 @@ struct wmi_block { struct wmi_device dev; struct list_head list; struct guid_block gblock; - struct miscdevice char_dev; - struct mutex char_mutex; struct acpi_device *acpi_device; wmi_notify_handler handler; void *handler_data; - u64 req_buf_size; unsigned long flags; }; @@ -256,26 +250,6 @@ static void wmi_device_put(struct wmi_device *wdev) * Exported WMI functions */ -/** - * set_required_buffer_size - Sets the buffer size needed for performing IOCTL - * @wdev: A wmi bus device from a driver - * @length: Required buffer size - * - * Allocates memory needed for buffer, stores the buffer size in that memory. - * - * Return: 0 on success or a negative error code for failure. - */ -int set_required_buffer_size(struct wmi_device *wdev, u64 length) -{ - struct wmi_block *wblock; - - wblock = container_of(wdev, struct wmi_block, dev); - wblock->req_buf_size = length; - - return 0; -} -EXPORT_SYMBOL_GPL(set_required_buffer_size); - /** * wmi_instance_count - Get number of WMI object instances * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba @@ -884,111 +858,12 @@ static int wmi_dev_match(struct device *dev, struct device_driver *driver) return 0; } -static int wmi_char_open(struct inode *inode, struct file *filp) -{ - /* - * The miscdevice already stores a pointer to itself - * inside filp->private_data - */ - struct wmi_block *wblock = container_of(filp->private_data, struct wmi_block, char_dev); - - filp->private_data = wblock; - - return nonseekable_open(inode, filp); -} - -static ssize_t wmi_char_read(struct file *filp, char __user *buffer, - size_t length, loff_t *offset) -{ - struct wmi_block *wblock = filp->private_data; - - return simple_read_from_buffer(buffer, length, offset, - &wblock->req_buf_size, - sizeof(wblock->req_buf_size)); -} - -static long wmi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct wmi_ioctl_buffer __user *input = - (struct wmi_ioctl_buffer __user *) arg; - struct wmi_block *wblock = filp->private_data; - struct wmi_ioctl_buffer *buf; - struct wmi_driver *wdriver; - int ret; - - if (_IOC_TYPE(cmd) != WMI_IOC) - return -ENOTTY; - - /* make sure we're not calling a higher instance than exists*/ - if (_IOC_NR(cmd) >= wblock->gblock.instance_count) - return -EINVAL; - - mutex_lock(&wblock->char_mutex); - buf = wblock->handler_data; - if (get_user(buf->length, &input->length)) { - dev_dbg(&wblock->dev.dev, "Read length from user failed\n"); - ret = -EFAULT; - goto out_ioctl; - } - /* if it's too small, abort */ - if (buf->length < wblock->req_buf_size) { - dev_err(&wblock->dev.dev, - "Buffer %lld too small, need at least %lld\n", - buf->length, wblock->req_buf_size); - ret = -EINVAL; - goto out_ioctl; - } - /* if it's too big, warn, driver will only use what is needed */ - if (buf->length > wblock->req_buf_size) - dev_warn(&wblock->dev.dev, - "Buffer %lld is bigger than required %lld\n", - buf->length, wblock->req_buf_size); - - /* copy the structure from userspace */ - if (copy_from_user(buf, input, wblock->req_buf_size)) { - dev_dbg(&wblock->dev.dev, "Copy %llu from user failed\n", - wblock->req_buf_size); - ret = -EFAULT; - goto out_ioctl; - } - - /* let the driver do any filtering and do the call */ - wdriver = drv_to_wdrv(wblock->dev.dev.driver); - if (!try_module_get(wdriver->driver.owner)) { - ret = -EBUSY; - goto out_ioctl; - } - ret = wdriver->filter_callback(&wblock->dev, cmd, buf); - module_put(wdriver->driver.owner); - if (ret) - goto out_ioctl; - - /* return the result (only up to our internal buffer size) */ - if (copy_to_user(input, buf, wblock->req_buf_size)) { - dev_dbg(&wblock->dev.dev, "Copy %llu to user failed\n", - wblock->req_buf_size); - ret = -EFAULT; - } - -out_ioctl: - mutex_unlock(&wblock->char_mutex); - return ret; -} - -static const struct file_operations wmi_fops = { - .owner = THIS_MODULE, - .read = wmi_char_read, - .open = wmi_char_open, - .unlocked_ioctl = wmi_ioctl, - .compat_ioctl = compat_ptr_ioctl, -}; static int wmi_dev_probe(struct device *dev) { struct wmi_block *wblock = dev_to_wblock(dev); struct wmi_driver *wdriver = drv_to_wdrv(dev->driver); int ret = 0; - char *buf; if (ACPI_FAILURE(wmi_method_enable(wblock, true))) dev_warn(dev, "failed to enable device -- probing anyway\n"); @@ -996,55 +871,17 @@ static int wmi_dev_probe(struct device *dev) if (wdriver->probe) { ret = wdriver->probe(dev_to_wdev(dev), find_guid_context(wblock, wdriver)); - if (ret != 0) - goto probe_failure; - } + if (!ret) { + if (ACPI_FAILURE(wmi_method_enable(wblock, false))) + dev_warn(dev, "Failed to disable device\n"); - /* driver wants a character device made */ - if (wdriver->filter_callback) { - /* check that required buffer size declared by driver or MOF */ - if (!wblock->req_buf_size) { - dev_err(&wblock->dev.dev, - "Required buffer size not set\n"); - ret = -EINVAL; - goto probe_failure; - } - - wblock->handler_data = kmalloc(wblock->req_buf_size, - GFP_KERNEL); - if (!wblock->handler_data) { - ret = -ENOMEM; - goto probe_failure; - } - - buf = kasprintf(GFP_KERNEL, "wmi/%s", wdriver->driver.name); - if (!buf) { - ret = -ENOMEM; - goto probe_string_failure; - } - wblock->char_dev.minor = MISC_DYNAMIC_MINOR; - wblock->char_dev.name = buf; - wblock->char_dev.fops = &wmi_fops; - wblock->char_dev.mode = 0444; - ret = misc_register(&wblock->char_dev); - if (ret) { - dev_warn(dev, "failed to register char dev: %d\n", ret); - ret = -ENOMEM; - goto probe_misc_failure; + return ret; } } set_bit(WMI_PROBED, &wblock->flags); - return 0; -probe_misc_failure: - kfree(buf); -probe_string_failure: - kfree(wblock->handler_data); -probe_failure: - if (ACPI_FAILURE(wmi_method_enable(wblock, false))) - dev_warn(dev, "failed to disable device\n"); - return ret; + return 0; } static void wmi_dev_remove(struct device *dev) @@ -1054,12 +891,6 @@ static void wmi_dev_remove(struct device *dev) clear_bit(WMI_PROBED, &wblock->flags); - if (wdriver->filter_callback) { - misc_deregister(&wblock->char_dev); - kfree(wblock->char_dev.name); - kfree(wblock->handler_data); - } - if (wdriver->remove) wdriver->remove(dev_to_wdev(dev)); @@ -1131,7 +962,6 @@ static int wmi_create_device(struct device *wmi_bus_dev, if (wblock->gblock.flags & ACPI_WMI_METHOD) { wblock->dev.dev.type = &wmi_type_method; - mutex_init(&wblock->char_mutex); goto out_init; } diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 8a643c39fcce..50f7f1e4fd4f 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -11,7 +11,6 @@ #include #include #include -#include /** * struct wmi_device - WMI device structure @@ -47,8 +46,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct u8 wmidev_instance_count(struct wmi_device *wdev); -extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); - /** * struct wmi_driver - WMI driver structure * @driver: Driver model structure @@ -57,11 +54,8 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); * @probe: Callback for device binding * @remove: Callback for device unbinding * @notify: Callback for receiving WMI events - * @filter_callback: Callback for filtering device IOCTLs * * This represents WMI drivers which handle WMI devices. - * @filter_callback is only necessary for drivers which - * want to set up a WMI IOCTL interface. */ struct wmi_driver { struct device_driver driver; @@ -71,8 +65,6 @@ struct wmi_driver { int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); - long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, - struct wmi_ioctl_buffer *arg); }; extern int __must_check __wmi_driver_register(struct wmi_driver *driver, From 2128f3cca5a2e7ab4d1ffb16c0e0431c3a0106a1 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 11 Dec 2023 18:06:22 +0800 Subject: [PATCH 0604/1562] Documentation/driver-api: Add document about WBRF mechanism Add documentation about AMD's Wifi band RFI mitigation (WBRF) mechanism explaining the theory and how it is used. Signed-off-by: Ma Jun Reviewed-by: Hans de Goede Reviewed-by: Mario Limonciello Signed-off-by: Hans de Goede --- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/wbrf.rst | 78 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 Documentation/driver-api/wbrf.rst diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f549a68951d7..8bc4ebe7a36f 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -115,6 +115,7 @@ available subsections can be seen below. hte/index wmi dpll + wbrf .. only:: subproject and html diff --git a/Documentation/driver-api/wbrf.rst b/Documentation/driver-api/wbrf.rst new file mode 100644 index 000000000000..f48bfa029813 --- /dev/null +++ b/Documentation/driver-api/wbrf.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +================================= +WBRF - Wifi Band RFI Mitigations +================================= + +Due to electrical and mechanical constraints in certain platform designs +there may be likely interference of relatively high-powered harmonics of +the GPU memory clocks with local radio module frequency bands used by +certain Wifi bands. + +To mitigate possible RFI interference producers can advertise the +frequencies in use and consumers can use this information to avoid using +these frequencies for sensitive features. + +When a platform is known to have this issue with any contained devices, +the platform designer will advertise the availability of this feature via +ACPI devices with a device specific method (_DSM). +* Producers with this _DSM will be able to advertise the frequencies in use. +* Consumers with this _DSM will be able to register for notifications of +frequencies in use. + +Some general terms +================== + +Producer: such component who can produce high-powered radio frequency +Consumer: such component who can adjust its in-use frequency in +response to the radio frequencies of other components to mitigate the +possible RFI. + +To make the mechanism function, those producers should notify active use +of their particular frequencies so that other consumers can make relative +internal adjustments as necessary to avoid this resonance. + +ACPI interface +============== + +Although initially used by for wifi + dGPU use cases, the ACPI interface +can be scaled to any type of device that a platform designer discovers +can cause interference. + +The GUID used for the _DSM is 7B7656CF-DC3D-4C1C-83E9-66E721DE3070. + +3 functions are available in this _DSM: + +* 0: discover # of functions available +* 1: record RF bands in use +* 2: retrieve RF bands in use + +Driver programming interface +============================ + +.. kernel-doc:: drivers/platform/x86/amd/wbrf.c + +Sample Usage +============= + +The expected flow for the producers: +1. During probe, call `acpi_amd_wbrf_supported_producer` to check if WBRF +can be enabled for the device. +2. On using some frequency band, call `acpi_amd_wbrf_add_remove` with 'add' +param to get other consumers properly notified. +3. Or on stopping using some frequency band, call +`acpi_amd_wbrf_add_remove` with 'remove' param to get other consumers notified. + +The expected flow for the consumers: +1. During probe, call `acpi_amd_wbrf_supported_consumer` to check if WBRF +can be enabled for the device. +2. Call `amd_wbrf_register_notifier` to register for notification +of frequency band change(add or remove) from other producers. +3. Call the `amd_wbrf_retrieve_freq_band` initally to retrieve +current active frequency bands considering some producers may broadcast +such information before the consumer is up. +4. On receiving a notification for frequency band change, run +`amd_wbrf_retrieve_freq_band` again to retrieve the latest +active frequency bands. +5. During driver cleanup, call `amd_wbrf_unregister_notifier` to +unregister the notifier. From 58e82a62669da52e688f4a8b89922c1839bf1001 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 11 Dec 2023 18:06:23 +0800 Subject: [PATCH 0605/1562] platform/x86/amd: Add support for AMD ACPI based Wifi band RFI mitigation feature Due to electrical and mechanical constraints in certain platform designs there may be likely interference of relatively high-powered harmonics of the (G-)DDR memory clocks with local radio module frequency bands used by Wifi 6/6e/7. To mitigate this, AMD has introduced a mechanism that devices can use to notify active use of particular frequencies so that other devices can make relative internal adjustments as necessary to avoid this resonance. Co-developed-by: Evan Quan Signed-off-by: Evan Quan Signed-off-by: Ma Jun Reviewed-by: Mario Limonciello Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/Kconfig | 14 ++ drivers/platform/x86/amd/Makefile | 1 + drivers/platform/x86/amd/wbrf.c | 317 ++++++++++++++++++++++++++++++ include/linux/acpi_amd_wbrf.h | 91 +++++++++ 4 files changed, 423 insertions(+) create mode 100644 drivers/platform/x86/amd/wbrf.c create mode 100644 include/linux/acpi_amd_wbrf.h diff --git a/drivers/platform/x86/amd/Kconfig b/drivers/platform/x86/amd/Kconfig index 55f3a2fc6aec..54753213cc61 100644 --- a/drivers/platform/x86/amd/Kconfig +++ b/drivers/platform/x86/amd/Kconfig @@ -18,3 +18,17 @@ config AMD_HSMP If you choose to compile this driver as a module the module will be called amd_hsmp. + +config AMD_WBRF + bool "AMD Wifi RF Band mitigations (WBRF)" + depends on ACPI + help + WBRF(Wifi Band RFI mitigation) mechanism allows Wifi drivers + to notify the frequencies they are using so that other hardware + can be reconfigured to avoid harmonic conflicts. + + AMD provides an ACPI based mechanism to support WBRF on platform with + appropriate underlying support. + + This mechanism will only be activated on platforms that advertise a + need for it. diff --git a/drivers/platform/x86/amd/Makefile b/drivers/platform/x86/amd/Makefile index f04932b7a7d1..dcec0a46f8af 100644 --- a/drivers/platform/x86/amd/Makefile +++ b/drivers/platform/x86/amd/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_AMD_PMC) += pmc/ amd_hsmp-y := hsmp.o obj-$(CONFIG_AMD_HSMP) += amd_hsmp.o obj-$(CONFIG_AMD_PMF) += pmf/ +obj-$(CONFIG_AMD_WBRF) += wbrf.o diff --git a/drivers/platform/x86/amd/wbrf.c b/drivers/platform/x86/amd/wbrf.c new file mode 100644 index 000000000000..dd197b3aebe0 --- /dev/null +++ b/drivers/platform/x86/amd/wbrf.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Wifi Frequency Band Manage Interface + * Copyright (C) 2023 Advanced Micro Devices + */ + +#include +#include + +/* + * Functions bit vector for WBRF method + * + * Bit 0: WBRF supported. + * Bit 1: Function 1 (Add / Remove frequency) is supported. + * Bit 2: Function 2 (Get frequency list) is supported. + */ +#define WBRF_ENABLED 0x0 +#define WBRF_RECORD 0x1 +#define WBRF_RETRIEVE 0x2 + +#define WBRF_REVISION 0x1 + +/* + * The data structure used for WBRF_RETRIEVE is not naturally aligned. + * And unfortunately the design has been settled down. + */ +struct amd_wbrf_ranges_out { + u32 num_of_ranges; + struct freq_band_range band_list[MAX_NUM_OF_WBRF_RANGES]; +} __packed; + +static const guid_t wifi_acpi_dsm_guid = + GUID_INIT(0x7b7656cf, 0xdc3d, 0x4c1c, + 0x83, 0xe9, 0x66, 0xe7, 0x21, 0xde, 0x30, 0x70); + +/* + * Used to notify consumer (amdgpu driver currently) about + * the wifi frequency is change. + */ +static BLOCKING_NOTIFIER_HEAD(wbrf_chain_head); + +static int wbrf_record(struct acpi_device *adev, uint8_t action, struct wbrf_ranges_in_out *in) +{ + union acpi_object argv4; + union acpi_object *tmp; + union acpi_object *obj; + u32 num_of_ranges = 0; + u32 num_of_elements; + u32 arg_idx = 0; + int ret; + u32 i; + + if (!in) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(in->band_list); i++) { + if (in->band_list[i].start && in->band_list[i].end) + num_of_ranges++; + } + + /* + * The num_of_ranges value in the "in" object supplied by + * the caller is required to be equal to the number of + * entries in the band_list array in there. + */ + if (num_of_ranges != in->num_of_ranges) + return -EINVAL; + + /* + * Every input frequency band comes with two end points(start/end) + * and each is accounted as an element. Meanwhile the range count + * and action type are accounted as an element each. + * So, the total element count = 2 * num_of_ranges + 1 + 1. + */ + num_of_elements = 2 * num_of_ranges + 2; + + tmp = kcalloc(num_of_elements, sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + argv4.package.type = ACPI_TYPE_PACKAGE; + argv4.package.count = num_of_elements; + argv4.package.elements = tmp; + + /* save the number of ranges*/ + tmp[0].integer.type = ACPI_TYPE_INTEGER; + tmp[0].integer.value = num_of_ranges; + + /* save the action(WBRF_RECORD_ADD/REMOVE/RETRIEVE) */ + tmp[1].integer.type = ACPI_TYPE_INTEGER; + tmp[1].integer.value = action; + + arg_idx = 2; + for (i = 0; i < ARRAY_SIZE(in->band_list); i++) { + if (!in->band_list[i].start || !in->band_list[i].end) + continue; + + tmp[arg_idx].integer.type = ACPI_TYPE_INTEGER; + tmp[arg_idx++].integer.value = in->band_list[i].start; + tmp[arg_idx].integer.type = ACPI_TYPE_INTEGER; + tmp[arg_idx++].integer.value = in->band_list[i].end; + } + + obj = acpi_evaluate_dsm(adev->handle, &wifi_acpi_dsm_guid, + WBRF_REVISION, WBRF_RECORD, &argv4); + + if (!obj) + return -EINVAL; + + if (obj->type != ACPI_TYPE_INTEGER) { + ret = -EINVAL; + goto out; + } + + ret = obj->integer.value; + if (ret) + ret = -EINVAL; + +out: + ACPI_FREE(obj); + kfree(tmp); + + return ret; +} + +/** + * acpi_amd_wbrf_add_remove - add or remove the frequency band the device is using + * + * @dev: device pointer + * @action: remove or add the frequency band into bios + * @in: input structure containing the frequency band the device is using + * + * Broadcast to other consumers the frequency band the device starts + * to use. Underneath the surface the information is cached into an + * internal buffer first. Then a notification is sent to all those + * registered consumers. So then they can retrieve that buffer to + * know the latest active frequency bands. Consumers that haven't + * yet been registered can retrieve the information from the cache + * when they register. + * + * Return: + * 0 for success add/remove wifi frequency band. + * Returns a negative error code for failure. + */ +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in) +{ + struct acpi_device *adev; + int ret; + + adev = ACPI_COMPANION(dev); + if (!adev) + return -ENODEV; + + ret = wbrf_record(adev, action, in); + if (ret) + return ret; + + blocking_notifier_call_chain(&wbrf_chain_head, WBRF_CHANGED, NULL); + + return 0; +} +EXPORT_SYMBOL_GPL(acpi_amd_wbrf_add_remove); + +/** + * acpi_amd_wbrf_supported_producer - determine if the WBRF can be enabled + * for the device as a producer + * + * @dev: device pointer + * + * Check if the platform equipped with necessary implementations to + * support WBRF for the device as a producer. + * + * Return: + * true if WBRF is supported, otherwise returns false + */ +bool acpi_amd_wbrf_supported_producer(struct device *dev) +{ + struct acpi_device *adev; + + adev = ACPI_COMPANION(dev); + if (!adev) + return false; + + return acpi_check_dsm(adev->handle, &wifi_acpi_dsm_guid, + WBRF_REVISION, BIT(WBRF_RECORD)); +} +EXPORT_SYMBOL_GPL(acpi_amd_wbrf_supported_producer); + +/** + * acpi_amd_wbrf_supported_consumer - determine if the WBRF can be enabled + * for the device as a consumer + * + * @dev: device pointer + * + * Determine if the platform equipped with necessary implementations to + * support WBRF for the device as a consumer. + * + * Return: + * true if WBRF is supported, otherwise returns false. + */ +bool acpi_amd_wbrf_supported_consumer(struct device *dev) +{ + struct acpi_device *adev; + + adev = ACPI_COMPANION(dev); + if (!adev) + return false; + + return acpi_check_dsm(adev->handle, &wifi_acpi_dsm_guid, + WBRF_REVISION, BIT(WBRF_RETRIEVE)); +} +EXPORT_SYMBOL_GPL(acpi_amd_wbrf_supported_consumer); + +/** + * amd_wbrf_retrieve_freq_band - retrieve current active frequency bands + * + * @dev: device pointer + * @out: output structure containing all the active frequency bands + * + * Retrieve the current active frequency bands which were broadcasted + * by other producers. The consumer who calls this API should take + * proper actions if any of the frequency band may cause RFI with its + * own frequency band used. + * + * Return: + * 0 for getting wifi freq band successfully. + * Returns a negative error code for failure. + */ +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out) +{ + struct amd_wbrf_ranges_out acpi_out = {0}; + struct acpi_device *adev; + union acpi_object *obj; + union acpi_object param; + int ret = 0; + + adev = ACPI_COMPANION(dev); + if (!adev) + return -ENODEV; + + param.type = ACPI_TYPE_STRING; + param.string.length = 0; + param.string.pointer = NULL; + + obj = acpi_evaluate_dsm(adev->handle, &wifi_acpi_dsm_guid, + WBRF_REVISION, WBRF_RETRIEVE, ¶m); + if (!obj) + return -EINVAL; + + /* + * The return buffer is with variable length and the format below: + * number_of_entries(1 DWORD): Number of entries + * start_freq of 1st entry(1 QWORD): Start frequency of the 1st entry + * end_freq of 1st entry(1 QWORD): End frequency of the 1st entry + * ... + * ... + * start_freq of the last entry(1 QWORD) + * end_freq of the last entry(1 QWORD) + * + * Thus the buffer length is determined by the number of entries. + * - For zero entry scenario, the buffer length will be 4 bytes. + * - For one entry scenario, the buffer length will be 20 bytes. + */ + if (obj->buffer.length > sizeof(acpi_out) || obj->buffer.length < 4) { + dev_err(dev, "Wrong sized WBRT information"); + ret = -EINVAL; + goto out; + } + memcpy(&acpi_out, obj->buffer.pointer, obj->buffer.length); + + out->num_of_ranges = acpi_out.num_of_ranges; + memcpy(out->band_list, acpi_out.band_list, sizeof(acpi_out.band_list)); + +out: + ACPI_FREE(obj); + return ret; +} +EXPORT_SYMBOL_GPL(amd_wbrf_retrieve_freq_band); + +/** + * amd_wbrf_register_notifier - register for notifications of frequency + * band update + * + * @nb: driver notifier block + * + * The consumer should register itself via this API so that it can get + * notified on the frequency band updates from other producers. + * + * Return: + * 0 for registering a consumer driver successfully. + * Returns a negative error code for failure. + */ +int amd_wbrf_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&wbrf_chain_head, nb); +} +EXPORT_SYMBOL_GPL(amd_wbrf_register_notifier); + +/** + * amd_wbrf_unregister_notifier - unregister for notifications of + * frequency band update + * + * @nb: driver notifier block + * + * The consumer should call this API when it is longer interested with + * the frequency band updates from other producers. Usually, this should + * be performed during driver cleanup. + * + * Return: + * 0 for unregistering a consumer driver. + * Returns a negative error code for failure. + */ +int amd_wbrf_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&wbrf_chain_head, nb); +} +EXPORT_SYMBOL_GPL(amd_wbrf_unregister_notifier); diff --git a/include/linux/acpi_amd_wbrf.h b/include/linux/acpi_amd_wbrf.h new file mode 100644 index 000000000000..898f31d536d4 --- /dev/null +++ b/include/linux/acpi_amd_wbrf.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Wifi Band Exclusion Interface (AMD ACPI Implementation) + * Copyright (C) 2023 Advanced Micro Devices + */ + +#ifndef _ACPI_AMD_WBRF_H +#define _ACPI_AMD_WBRF_H + +#include +#include + +/* The maximum number of frequency band ranges */ +#define MAX_NUM_OF_WBRF_RANGES 11 + +/* Record actions */ +#define WBRF_RECORD_ADD 0x0 +#define WBRF_RECORD_REMOVE 0x1 + +/** + * struct freq_band_range - Wifi frequency band range definition + * @start: start frequency point (in Hz) + * @end: end frequency point (in Hz) + */ +struct freq_band_range { + u64 start; + u64 end; +}; + +/** + * struct wbrf_ranges_in_out - wbrf ranges info + * @num_of_ranges: total number of band ranges in this struct + * @band_list: array of Wifi band ranges + */ +struct wbrf_ranges_in_out { + u64 num_of_ranges; + struct freq_band_range band_list[MAX_NUM_OF_WBRF_RANGES]; +}; + +/** + * enum wbrf_notifier_actions - wbrf notifier actions index + * @WBRF_CHANGED: there was some frequency band updates. The consumers + * should retrieve the latest active frequency bands. + */ +enum wbrf_notifier_actions { + WBRF_CHANGED, +}; + +#if IS_ENABLED(CONFIG_AMD_WBRF) +bool acpi_amd_wbrf_supported_producer(struct device *dev); +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in); +bool acpi_amd_wbrf_supported_consumer(struct device *dev); +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out); +int amd_wbrf_register_notifier(struct notifier_block *nb); +int amd_wbrf_unregister_notifier(struct notifier_block *nb); +#else +static inline +bool acpi_amd_wbrf_supported_consumer(struct device *dev) +{ + return false; +} + +static inline +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in) +{ + return -ENODEV; +} + +static inline +bool acpi_amd_wbrf_supported_producer(struct device *dev) +{ + return false; +} +static inline +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out) +{ + return -ENODEV; +} +static inline +int amd_wbrf_register_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +static inline +int amd_wbrf_unregister_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_WBRF */ + +#endif /* _ACPI_AMD_WBRF_H */ From d1c371035c8204112d84266e6bde7537f25448f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 10 Dec 2023 10:50:28 +0800 Subject: [PATCH 0606/1562] quota: convert dquot_claim_space_nodirty() to return void dquot_claim_space_nodirty() always return zero, let's convert it to return void, then, its caller can get rid of handling failure case. Signed-off-by: Chao Yu Signed-off-by: Jan Kara Message-Id: <20231210025028.3262900-1-chao@kernel.org> --- fs/quota/dquot.c | 6 +++--- include/linux/quotaops.h | 15 +++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 58b5de081b57..44ff2813ae51 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1787,7 +1787,7 @@ EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ -int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot **dquots; int cnt, index; @@ -1797,7 +1797,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); - return 0; + return; } dquots = i_dquot(inode); @@ -1822,7 +1822,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); - return 0; + return; } EXPORT_SYMBOL(dquot_claim_space_nodirty); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 4fa4ef0a173a..06cc8888199e 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -74,7 +74,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); -int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); +void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); @@ -257,10 +257,9 @@ static inline void __dquot_free_space(struct inode *inode, qsize_t number, inode_sub_bytes(inode, number); } -static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); - return 0; } static inline int dquot_reclaim_space_nodirty(struct inode *inode, @@ -358,14 +357,10 @@ static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } -static inline int dquot_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { - int ret; - - ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); - if (!ret) - mark_inode_dirty_sync(inode); - return ret; + dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) From 03560ff08d2839d7381f18576b329a2eee5cfb37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:16 +0100 Subject: [PATCH 0607/1562] regulator: arizona-ldo1: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/76c7af01e2c8b3ab6585a04bc3f0d163fbb7fdf7.1701778038.git.u.kleine-koenig@pengutronix.de Acked-by: Charles Keepax Signed-off-by: Mark Brown --- drivers/regulator/arizona-ldo1.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/regulator/arizona-ldo1.c b/drivers/regulator/arizona-ldo1.c index b465c0010665..4b54068d4f59 100644 --- a/drivers/regulator/arizona-ldo1.c +++ b/drivers/regulator/arizona-ldo1.c @@ -339,14 +339,12 @@ static int arizona_ldo1_probe(struct platform_device *pdev) return ret; } -static int arizona_ldo1_remove(struct platform_device *pdev) +static void arizona_ldo1_remove(struct platform_device *pdev) { struct arizona_ldo1 *ldo1 = platform_get_drvdata(pdev); if (ldo1->ena_gpiod) gpiod_put(ldo1->ena_gpiod); - - return 0; } static int madera_ldo1_probe(struct platform_device *pdev) @@ -377,7 +375,7 @@ static int madera_ldo1_probe(struct platform_device *pdev) static struct platform_driver arizona_ldo1_driver = { .probe = arizona_ldo1_probe, - .remove = arizona_ldo1_remove, + .remove_new = arizona_ldo1_remove, .driver = { .name = "arizona-ldo1", .probe_type = PROBE_FORCE_SYNCHRONOUS, @@ -386,7 +384,7 @@ static struct platform_driver arizona_ldo1_driver = { static struct platform_driver madera_ldo1_driver = { .probe = madera_ldo1_probe, - .remove = arizona_ldo1_remove, + .remove_new = arizona_ldo1_remove, .driver = { .name = "madera-ldo1", .probe_type = PROBE_FORCE_SYNCHRONOUS, From cddda6f5f47f7cb13191b7753bc3882940b6f325 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:17 +0100 Subject: [PATCH 0608/1562] regulator: bd9571mwv: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/639e796b36815a219ff1172cc758ba7378211d74.1701778038.git.u.kleine-koenig@pengutronix.de Reviewed-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- drivers/regulator/bd9571mwv-regulator.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/bd9571mwv-regulator.c b/drivers/regulator/bd9571mwv-regulator.c index d469481d8442..c7ceba56e7dc 100644 --- a/drivers/regulator/bd9571mwv-regulator.c +++ b/drivers/regulator/bd9571mwv-regulator.c @@ -260,10 +260,9 @@ static const struct dev_pm_ops bd9571mwv_pm = { SET_SYSTEM_SLEEP_PM_OPS(bd9571mwv_suspend, bd9571mwv_resume) }; -static int bd9571mwv_regulator_remove(struct platform_device *pdev) +static void bd9571mwv_regulator_remove(struct platform_device *pdev) { device_remove_file(&pdev->dev, &dev_attr_backup_mode); - return 0; } #define DEV_PM_OPS &bd9571mwv_pm #else @@ -357,7 +356,7 @@ static struct platform_driver bd9571mwv_regulator_driver = { .pm = DEV_PM_OPS, }, .probe = bd9571mwv_regulator_probe, - .remove = bd9571mwv_regulator_remove, + .remove_new = bd9571mwv_regulator_remove, .id_table = bd9571mwv_regulator_id_table, }; module_platform_driver(bd9571mwv_regulator_driver); From 0210a60aad02149d8503d259525bfbe0e99f8cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:18 +0100 Subject: [PATCH 0609/1562] regulator: db8500-prcmu: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/fcaa42d7dd707031ed8dd9e8c28483891b879965.1701778038.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/db8500-prcmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/db8500-prcmu.c b/drivers/regulator/db8500-prcmu.c index 34c5e485d0af..1e2d54da1b9a 100644 --- a/drivers/regulator/db8500-prcmu.c +++ b/drivers/regulator/db8500-prcmu.c @@ -469,11 +469,9 @@ static int db8500_regulator_probe(struct platform_device *pdev) return 0; } -static int db8500_regulator_remove(struct platform_device *pdev) +static void db8500_regulator_remove(struct platform_device *pdev) { ux500_regulator_debug_exit(); - - return 0; } static struct platform_driver db8500_regulator_driver = { @@ -482,7 +480,7 @@ static struct platform_driver db8500_regulator_driver = { .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, .probe = db8500_regulator_probe, - .remove = db8500_regulator_remove, + .remove_new = db8500_regulator_remove, }; static int __init db8500_regulator_init(void) From 6f382a0c7ec12f85f4e40d5343ba53f16f543ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:19 +0100 Subject: [PATCH 0610/1562] regulator: stm32-vrefbuf: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/2e96cf99c8d97b728d891a745e8f94ee39fbfee8.1701778038.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/stm32-vrefbuf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/stm32-vrefbuf.c b/drivers/regulator/stm32-vrefbuf.c index 717144cbe0f9..40855105dd33 100644 --- a/drivers/regulator/stm32-vrefbuf.c +++ b/drivers/regulator/stm32-vrefbuf.c @@ -233,7 +233,7 @@ err_pm_stop: return ret; } -static int stm32_vrefbuf_remove(struct platform_device *pdev) +static void stm32_vrefbuf_remove(struct platform_device *pdev) { struct regulator_dev *rdev = platform_get_drvdata(pdev); struct stm32_vrefbuf *priv = rdev_get_drvdata(rdev); @@ -244,8 +244,6 @@ static int stm32_vrefbuf_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); pm_runtime_set_suspended(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); - - return 0; }; static int __maybe_unused stm32_vrefbuf_runtime_suspend(struct device *dev) @@ -282,7 +280,7 @@ MODULE_DEVICE_TABLE(of, stm32_vrefbuf_of_match); static struct platform_driver stm32_vrefbuf_driver = { .probe = stm32_vrefbuf_probe, - .remove = stm32_vrefbuf_remove, + .remove_new = stm32_vrefbuf_remove, .driver = { .name = "stm32-vrefbuf", .probe_type = PROBE_PREFER_ASYNCHRONOUS, From 964575179663db70832e374edfacc91539e783d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:20 +0100 Subject: [PATCH 0611/1562] regulator: uniphier: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/ced2a73a1aeca3f33d4b194e4dbe2672ad84a50a.1701778038.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/uniphier-regulator.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/uniphier-regulator.c b/drivers/regulator/uniphier-regulator.c index 1d8304b88bd6..5f868042392f 100644 --- a/drivers/regulator/uniphier-regulator.c +++ b/drivers/regulator/uniphier-regulator.c @@ -115,7 +115,7 @@ out_rst_assert: return ret; } -static int uniphier_regulator_remove(struct platform_device *pdev) +static void uniphier_regulator_remove(struct platform_device *pdev) { struct uniphier_regulator_priv *priv = platform_get_drvdata(pdev); int i; @@ -124,8 +124,6 @@ static int uniphier_regulator_remove(struct platform_device *pdev) reset_control_assert(priv->rst[i]); clk_bulk_disable_unprepare(priv->data->nclks, priv->clk); - - return 0; } /* USB3 controller data */ @@ -209,7 +207,7 @@ MODULE_DEVICE_TABLE(of, uniphier_regulator_match); static struct platform_driver uniphier_regulator_driver = { .probe = uniphier_regulator_probe, - .remove = uniphier_regulator_remove, + .remove_new = uniphier_regulator_remove, .driver = { .name = "uniphier-regulator", .probe_type = PROBE_PREFER_ASYNCHRONOUS, From 3b2e8e98692b20436d0346fc6adffff1b596d50f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:21 +0100 Subject: [PATCH 0612/1562] regulator: userspace-consumer: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/89c5f261707bf178e1508cf5dd55121f0da2dc3f.1701778038.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/userspace-consumer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/userspace-consumer.c b/drivers/regulator/userspace-consumer.c index 97f075ed68c9..53d1b9d6f69c 100644 --- a/drivers/regulator/userspace-consumer.c +++ b/drivers/regulator/userspace-consumer.c @@ -194,7 +194,7 @@ err_enable: return ret; } -static int regulator_userspace_consumer_remove(struct platform_device *pdev) +static void regulator_userspace_consumer_remove(struct platform_device *pdev) { struct userspace_consumer_data *data = platform_get_drvdata(pdev); @@ -202,8 +202,6 @@ static int regulator_userspace_consumer_remove(struct platform_device *pdev) if (data->enabled && !data->no_autoswitch) regulator_bulk_disable(data->num_supplies, data->supplies); - - return 0; } static const struct of_device_id regulator_userspace_consumer_of_match[] = { @@ -213,7 +211,7 @@ static const struct of_device_id regulator_userspace_consumer_of_match[] = { static struct platform_driver regulator_userspace_consumer_driver = { .probe = regulator_userspace_consumer_probe, - .remove = regulator_userspace_consumer_remove, + .remove_new = regulator_userspace_consumer_remove, .driver = { .name = "reg-userspace-consumer", .probe_type = PROBE_PREFER_ASYNCHRONOUS, From d637a75ede3db84f7ae4bc2ab398fe2232f22c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:22 +0100 Subject: [PATCH 0613/1562] regulator: virtual: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/d9954f02ae51b1b0b0077c710d16bfaeafa216ec.1701778038.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/virtual.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/virtual.c b/drivers/regulator/virtual.c index d5a160efdae6..0a0ee186c6af 100644 --- a/drivers/regulator/virtual.c +++ b/drivers/regulator/virtual.c @@ -345,7 +345,7 @@ static int regulator_virtual_probe(struct platform_device *pdev) return 0; } -static int regulator_virtual_remove(struct platform_device *pdev) +static void regulator_virtual_remove(struct platform_device *pdev) { struct virtual_consumer_data *drvdata = platform_get_drvdata(pdev); @@ -353,13 +353,11 @@ static int regulator_virtual_remove(struct platform_device *pdev) if (drvdata->enabled) regulator_disable(drvdata->regulator); - - return 0; } static struct platform_driver regulator_virtual_consumer_driver = { .probe = regulator_virtual_probe, - .remove = regulator_virtual_remove, + .remove_new = regulator_virtual_remove, .driver = { .name = "reg-virt-consumer", .probe_type = PROBE_PREFER_ASYNCHRONOUS, From 8d6fab52f3fdaeb8aabfd046d95e5d3f9464399e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 5 Dec 2023 13:26:23 +0100 Subject: [PATCH 0614/1562] regulator: wm8350: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/1f7bbc545829a1cc3df40be0424fe46d7449fb72.1701778038.git.u.kleine-koenig@pengutronix.de Acked-by: Charles Keepax Signed-off-by: Mark Brown --- drivers/regulator/wm8350-regulator.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index 1445bafcab40..9939a5d2cbec 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1158,14 +1158,12 @@ static int wm8350_regulator_probe(struct platform_device *pdev) return 0; } -static int wm8350_regulator_remove(struct platform_device *pdev) +static void wm8350_regulator_remove(struct platform_device *pdev) { struct regulator_dev *rdev = platform_get_drvdata(pdev); struct wm8350 *wm8350 = rdev_get_drvdata(rdev); wm8350_free_irq(wm8350, wm8350_reg[pdev->id].irq, rdev); - - return 0; } int wm8350_register_regulator(struct wm8350 *wm8350, int reg, @@ -1306,7 +1304,7 @@ EXPORT_SYMBOL_GPL(wm8350_register_led); static struct platform_driver wm8350_regulator_driver = { .probe = wm8350_regulator_probe, - .remove = wm8350_regulator_remove, + .remove_new = wm8350_regulator_remove, .driver = { .name = "wm8350-regulator", .probe_type = PROBE_PREFER_ASYNCHRONOUS, From 4c6dd33de9d3148909bc403d394f527bec4aec27 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Dec 2023 13:49:14 +0100 Subject: [PATCH 0615/1562] spi: pl022: delete unused cur_gpiod in struct pl022 The member cur_gpiod of struct pl022 is not used anywhere. Delete it. Signed-off-by: Nam Cao Link: https://msgid.link/r/7618c9d714aa1c16c7cb06f9d1fb1c074d1d9c65.1702298527.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index 1e3bd6f3303a..fd21e83cc3a4 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -361,7 +361,6 @@ struct vendor_data { * @dummypage: a dummy page used for driving data on the bus with DMA * @dma_running: indicates whether DMA is in operation * @cur_cs: current chip select index - * @cur_gpiod: current chip select GPIO descriptor */ struct pl022 { struct amba_device *adev; @@ -393,7 +392,6 @@ struct pl022 { bool dma_running; #endif int cur_cs; - struct gpio_desc *cur_gpiod; }; /** @@ -1344,8 +1342,6 @@ static int pl022_transfer_one(struct spi_controller *host, struct spi_device *sp /* Setup the SPI using the per chip configuration */ pl022->cur_chip = spi_get_ctldata(spi); pl022->cur_cs = spi_get_chipselect(spi, 0); - /* This is always available but may be set to -ENOENT */ - pl022->cur_gpiod = spi_get_csgpiod(spi, 0); restore_state(pl022); flush(pl022); From 3c49d848d2d3c6fe46522e4d750fc3a18e699997 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Dec 2023 13:49:15 +0100 Subject: [PATCH 0616/1562] spi: pl022: delete unused next_msg_cs_active in struct pl022 The member next_msg_cs_active of struct pl022 is not used anywhere. Delete it. Signed-off-by: Nam Cao Link: https://msgid.link/r/424fec01a75f6a881edcce189ac68b3408b62f29.1702298527.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index fd21e83cc3a4..3baf45da01cd 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -341,10 +341,6 @@ struct vendor_data { * @cur_msg: Pointer to current spi_message being processed * @cur_transfer: Pointer to current spi_transfer * @cur_chip: pointer to current clients chip(assigned from controller_state) - * @next_msg_cs_active: the next message in the queue has been examined - * and it was found that it uses the same chip select as the previous - * message, so we left it active after the previous transfer, and it's - * active already. * @tx: current position in TX buffer to be read * @tx_end: end position in TX buffer to be read * @rx: current position in RX buffer to be written @@ -372,7 +368,6 @@ struct pl022 { struct pl022_ssp_controller *host_info; struct spi_transfer *cur_transfer; struct chip_data *cur_chip; - bool next_msg_cs_active; void *tx; void *tx_end; void *rx; From 0a3d087d09a8f52c02d0014bad63be99c53c4812 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:06 +0800 Subject: [PATCH 0617/1562] spi: sprd-adi: switch to use spi_alloc_host() Switch to use modern name function spi_alloc_host(). No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-2-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-sprd-adi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index 58c3badd9c79..262c11d977ea 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -528,7 +528,7 @@ static int sprd_adi_probe(struct platform_device *pdev) pdev->id = of_alias_get_id(np, "spi"); num_chipselect = of_get_child_count(np); - ctlr = spi_alloc_master(&pdev->dev, sizeof(struct sprd_adi)); + ctlr = spi_alloc_host(&pdev->dev, sizeof(struct sprd_adi)); if (!ctlr) return -ENOMEM; From 8c53784757b7fb2bf75e36ae5356628a8baeffd9 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:07 +0800 Subject: [PATCH 0618/1562] spi: sprd: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-3-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-sprd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-sprd.c b/drivers/spi/spi-sprd.c index 95377cf748c0..831ebae10fe0 100644 --- a/drivers/spi/spi-sprd.c +++ b/drivers/spi/spi-sprd.c @@ -578,7 +578,7 @@ static void sprd_spi_dma_release(struct sprd_spi *ss) static int sprd_spi_dma_txrx_bufs(struct spi_device *sdev, struct spi_transfer *t) { - struct sprd_spi *ss = spi_master_get_devdata(sdev->master); + struct sprd_spi *ss = spi_controller_get_devdata(sdev->controller); u32 trans_len = ss->trans_len; int ret, write_size = 0; @@ -923,7 +923,7 @@ static int sprd_spi_probe(struct platform_device *pdev) int ret; pdev->id = of_alias_get_id(pdev->dev.of_node, "spi"); - sctlr = spi_alloc_master(&pdev->dev, sizeof(*ss)); + sctlr = spi_alloc_host(&pdev->dev, sizeof(*ss)); if (!sctlr) return -ENOMEM; From e6b7e64cb11966b26646a362677ca5a08481157e Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:08 +0800 Subject: [PATCH 0619/1562] spi: st-ssc4: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-4-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-st-ssc4.c | 70 +++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/spi/spi-st-ssc4.c b/drivers/spi/spi-st-ssc4.c index 7fcff9c539e2..e064025e2fd6 100644 --- a/drivers/spi/spi-st-ssc4.c +++ b/drivers/spi/spi-st-ssc4.c @@ -6,7 +6,7 @@ * Patrice Chotard * Lee Jones * - * SPI master mode controller driver, used in STMicroelectronics devices. + * SPI host mode controller driver, used in STMicroelectronics devices. */ #include @@ -115,10 +115,10 @@ static void ssc_read_rx_fifo(struct spi_st *spi_st) spi_st->words_remaining -= count; } -static int spi_st_transfer_one(struct spi_master *master, +static int spi_st_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct spi_st *spi_st = spi_master_get_devdata(master); + struct spi_st *spi_st = spi_controller_get_devdata(host); uint32_t ctl = 0; /* Setup transfer */ @@ -165,7 +165,7 @@ static int spi_st_transfer_one(struct spi_master *master, if (ctl) writel_relaxed(ctl, spi_st->base + SSC_CTL); - spi_finalize_current_transfer(spi->master); + spi_finalize_current_transfer(spi->controller); return t->len; } @@ -174,7 +174,7 @@ static int spi_st_transfer_one(struct spi_master *master, #define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST | SPI_LOOP | SPI_CS_HIGH) static int spi_st_setup(struct spi_device *spi) { - struct spi_st *spi_st = spi_master_get_devdata(spi->master); + struct spi_st *spi_st = spi_controller_get_devdata(spi->controller); u32 spi_st_clk, sscbrg, var; u32 hz = spi->max_speed_hz; @@ -274,35 +274,35 @@ static irqreturn_t spi_st_irq(int irq, void *dev_id) static int spi_st_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - struct spi_master *master; + struct spi_controller *host; struct spi_st *spi_st; int irq, ret = 0; u32 var; - master = spi_alloc_master(&pdev->dev, sizeof(*spi_st)); - if (!master) + host = spi_alloc_host(&pdev->dev, sizeof(*spi_st)); + if (!host) return -ENOMEM; - master->dev.of_node = np; - master->mode_bits = MODEBITS; - master->setup = spi_st_setup; - master->transfer_one = spi_st_transfer_one; - master->bits_per_word_mask = SPI_BPW_MASK(8) | SPI_BPW_MASK(16); - master->auto_runtime_pm = true; - master->bus_num = pdev->id; - master->use_gpio_descriptors = true; - spi_st = spi_master_get_devdata(master); + host->dev.of_node = np; + host->mode_bits = MODEBITS; + host->setup = spi_st_setup; + host->transfer_one = spi_st_transfer_one; + host->bits_per_word_mask = SPI_BPW_MASK(8) | SPI_BPW_MASK(16); + host->auto_runtime_pm = true; + host->bus_num = pdev->id; + host->use_gpio_descriptors = true; + spi_st = spi_controller_get_devdata(host); spi_st->clk = devm_clk_get(&pdev->dev, "ssc"); if (IS_ERR(spi_st->clk)) { dev_err(&pdev->dev, "Unable to request clock\n"); ret = PTR_ERR(spi_st->clk); - goto put_master; + goto put_host; } ret = clk_prepare_enable(spi_st->clk); if (ret) - goto put_master; + goto put_host; init_completion(&spi_st->done); @@ -324,7 +324,7 @@ static int spi_st_probe(struct platform_device *pdev) var &= ~SSC_CTL_SR; writel_relaxed(var, spi_st->base + SSC_CTL); - /* Set SSC into slave mode before reconfiguring PIO pins */ + /* Set SSC into target mode before reconfiguring PIO pins */ var = readl_relaxed(spi_st->base + SSC_CTL); var &= ~SSC_CTL_MS; writel_relaxed(var, spi_st->base + SSC_CTL); @@ -347,11 +347,11 @@ static int spi_st_probe(struct platform_device *pdev) pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); - platform_set_drvdata(pdev, master); + platform_set_drvdata(pdev, host); - ret = devm_spi_register_master(&pdev->dev, master); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) { - dev_err(&pdev->dev, "Failed to register master\n"); + dev_err(&pdev->dev, "Failed to register host\n"); goto rpm_disable; } @@ -361,15 +361,15 @@ rpm_disable: pm_runtime_disable(&pdev->dev); clk_disable: clk_disable_unprepare(spi_st->clk); -put_master: - spi_master_put(master); +put_host: + spi_controller_put(host); return ret; } static void spi_st_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct spi_st *spi_st = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct spi_st *spi_st = spi_controller_get_devdata(host); pm_runtime_disable(&pdev->dev); @@ -381,8 +381,8 @@ static void spi_st_remove(struct platform_device *pdev) #ifdef CONFIG_PM static int spi_st_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct spi_st *spi_st = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct spi_st *spi_st = spi_controller_get_devdata(host); writel_relaxed(0, spi_st->base + SSC_IEN); pinctrl_pm_select_sleep_state(dev); @@ -394,8 +394,8 @@ static int spi_st_runtime_suspend(struct device *dev) static int spi_st_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct spi_st *spi_st = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct spi_st *spi_st = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(spi_st->clk); @@ -408,10 +408,10 @@ static int spi_st_runtime_resume(struct device *dev) #ifdef CONFIG_PM_SLEEP static int spi_st_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); int ret; - ret = spi_master_suspend(master); + ret = spi_controller_suspend(host); if (ret) return ret; @@ -420,10 +420,10 @@ static int spi_st_suspend(struct device *dev) static int spi_st_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); int ret; - ret = spi_master_resume(master); + ret = spi_controller_resume(host); if (ret) return ret; From d9ea4bcf244d936d74a5993ae1f778f8cb9a479b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:09 +0800 Subject: [PATCH 0620/1562] spi: stm32-qspi: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-5-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index def74ae9b5f6..385832030459 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -357,7 +357,7 @@ static int stm32_qspi_get_mode(u8 buswidth) static int stm32_qspi_send(struct spi_device *spi, const struct spi_mem_op *op) { - struct stm32_qspi *qspi = spi_controller_get_devdata(spi->master); + struct stm32_qspi *qspi = spi_controller_get_devdata(spi->controller); struct stm32_qspi_flash *flash = &qspi->flash[spi_get_chipselect(spi, 0)]; u32 ccr, cr; int timeout, err = 0, err_poll_status = 0; @@ -448,7 +448,7 @@ static int stm32_qspi_poll_status(struct spi_mem *mem, const struct spi_mem_op * unsigned long polling_rate_us, unsigned long timeout_ms) { - struct stm32_qspi *qspi = spi_controller_get_devdata(mem->spi->master); + struct stm32_qspi *qspi = spi_controller_get_devdata(mem->spi->controller); int ret; if (!spi_mem_supports_op(mem, op)) @@ -476,7 +476,7 @@ static int stm32_qspi_poll_status(struct spi_mem *mem, const struct spi_mem_op * static int stm32_qspi_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) { - struct stm32_qspi *qspi = spi_controller_get_devdata(mem->spi->master); + struct stm32_qspi *qspi = spi_controller_get_devdata(mem->spi->controller); int ret; ret = pm_runtime_resume_and_get(qspi->dev); @@ -500,7 +500,7 @@ static int stm32_qspi_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) static int stm32_qspi_dirmap_create(struct spi_mem_dirmap_desc *desc) { - struct stm32_qspi *qspi = spi_controller_get_devdata(desc->mem->spi->master); + struct stm32_qspi *qspi = spi_controller_get_devdata(desc->mem->spi->controller); if (desc->info.op_tmpl.data.dir == SPI_MEM_DATA_OUT) return -EOPNOTSUPP; @@ -518,7 +518,7 @@ static int stm32_qspi_dirmap_create(struct spi_mem_dirmap_desc *desc) static ssize_t stm32_qspi_dirmap_read(struct spi_mem_dirmap_desc *desc, u64 offs, size_t len, void *buf) { - struct stm32_qspi *qspi = spi_controller_get_devdata(desc->mem->spi->master); + struct stm32_qspi *qspi = spi_controller_get_devdata(desc->mem->spi->controller); struct spi_mem_op op; u32 addr_max; int ret; @@ -640,7 +640,7 @@ end_of_transfer: static int stm32_qspi_setup(struct spi_device *spi) { - struct spi_controller *ctrl = spi->master; + struct spi_controller *ctrl = spi->controller; struct stm32_qspi *qspi = spi_controller_get_devdata(ctrl); struct stm32_qspi_flash *flash; u32 presc, mode; @@ -775,7 +775,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) struct resource *res; int ret, irq; - ctrl = devm_spi_alloc_master(dev, sizeof(*qspi)); + ctrl = devm_spi_alloc_host(dev, sizeof(*qspi)); if (!ctrl) return -ENOMEM; @@ -861,7 +861,7 @@ static int stm32_qspi_probe(struct platform_device *pdev) pm_runtime_enable(dev); pm_runtime_get_noresume(dev); - ret = spi_register_master(ctrl); + ret = spi_register_controller(ctrl); if (ret) goto err_pm_runtime_free; @@ -892,7 +892,7 @@ static void stm32_qspi_remove(struct platform_device *pdev) struct stm32_qspi *qspi = platform_get_drvdata(pdev); pm_runtime_get_sync(qspi->dev); - spi_unregister_master(qspi->ctrl); + spi_unregister_controller(qspi->ctrl); /* disable qspi */ writel_relaxed(0, qspi->io_base + QSPI_CR); stm32_qspi_dma_free(qspi); From a5c1fa1318ee72b9809f105207570ef55c7992d9 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:10 +0800 Subject: [PATCH 0621/1562] spi: stm32: switch to use modern name Change legacy name master/slave to modern name host/target. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-6-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 94df3836834c..f48e4dcc29fc 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -180,7 +180,7 @@ #define SPI_DMA_MIN_BYTES 16 /* STM32 SPI driver helpers */ -#define STM32_SPI_MASTER_MODE(stm32_spi) (!(stm32_spi)->device_mode) +#define STM32_SPI_HOST_MODE(stm32_spi) (!(stm32_spi)->device_mode) #define STM32_SPI_DEVICE_MODE(stm32_spi) ((stm32_spi)->device_mode) /** @@ -229,7 +229,7 @@ struct stm32_spi; * @get_fifo_size: routine to get fifo size * @get_bpw_mask: routine to get bits per word mask * @disable: routine to disable controller - * @config: routine to configure controller as SPI Master + * @config: routine to configure controller as SPI Host * @set_bpw: routine to configure registers to for bits per word * @set_mode: routine to configure registers to desired mode * @set_data_idleness: optional routine to configure registers to desired idle @@ -287,7 +287,7 @@ struct stm32_spi_cfg { * @lock: prevent I/O concurrent access * @irq: SPI controller interrupt line * @fifo_size: size of the embedded fifo in bytes - * @cur_midi: master inter-data idleness in ns + * @cur_midi: host inter-data idleness in ns * @cur_speed: speed configured in Hz * @cur_half_period: time of a half bit in us * @cur_bpw: number of bits in a single SPI data frame @@ -1064,7 +1064,7 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, unsigned long flags; u32 clrb = 0, setb = 0; - /* SPI slave device may need time between data frames */ + /* SPI target device may need time between data frames */ spi->cur_midi = 0; if (np && !of_property_read_u32(np, "st,spi-midi-ns", &spi->cur_midi)) dev_dbg(spi->dev, "%dns inter-data idleness\n", spi->cur_midi); @@ -1279,7 +1279,7 @@ static int stm32h7_spi_transfer_one_irq(struct stm32_spi *spi) if (spi->tx_buf) stm32h7_spi_write_txfifo(spi); - if (STM32_SPI_MASTER_MODE(spi)) + if (STM32_SPI_HOST_MODE(spi)) stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); writel_relaxed(ier, spi->base + STM32H7_SPI_IER); @@ -1343,7 +1343,7 @@ static void stm32h7_spi_transfer_one_dma_start(struct stm32_spi *spi) stm32_spi_enable(spi); - if (STM32_SPI_MASTER_MODE(spi)) + if (STM32_SPI_HOST_MODE(spi)) stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); } @@ -1516,7 +1516,7 @@ static void stm32h7_spi_set_bpw(struct stm32_spi *spi) } /** - * stm32_spi_set_mbr - Configure baud rate divisor in master mode + * stm32_spi_set_mbr - Configure baud rate divisor in host mode * @spi: pointer to the spi controller data structure * @mbrdiv: baud rate divisor value */ @@ -1628,7 +1628,7 @@ static int stm32h7_spi_set_mode(struct stm32_spi *spi, unsigned int comm_type) /** * stm32h7_spi_data_idleness - configure minimum time delay inserted between two - * consecutive data frames in master mode + * consecutive data frames in host mode * @spi: pointer to the spi controller data structure * @len: transfer len */ @@ -1697,7 +1697,7 @@ static int stm32_spi_transfer_one_setup(struct stm32_spi *spi, spi->cfg->set_bpw(spi); /* Update spi->cur_speed with real clock speed */ - if (STM32_SPI_MASTER_MODE(spi)) { + if (STM32_SPI_HOST_MODE(spi)) { mbr = stm32_spi_prepare_mbr(spi, transfer->speed_hz, spi->cfg->baud_rate_div_min, spi->cfg->baud_rate_div_max); @@ -1717,7 +1717,7 @@ static int stm32_spi_transfer_one_setup(struct stm32_spi *spi, spi->cur_comm = comm_type; - if (STM32_SPI_MASTER_MODE(spi) && spi->cfg->set_data_idleness) + if (STM32_SPI_HOST_MODE(spi) && spi->cfg->set_data_idleness) spi->cfg->set_data_idleness(spi, transfer->len); if (spi->cur_bpw <= 8) @@ -1738,7 +1738,7 @@ static int stm32_spi_transfer_one_setup(struct stm32_spi *spi, dev_dbg(spi->dev, "data frame of %d-bit, data packet of %d data frames\n", spi->cur_bpw, spi->cur_fthlv); - if (STM32_SPI_MASTER_MODE(spi)) + if (STM32_SPI_HOST_MODE(spi)) dev_dbg(spi->dev, "speed set to %dHz\n", spi->cur_speed); dev_dbg(spi->dev, "transfer of %d bytes (%d data frames)\n", spi->cur_xferlen, nb_words); @@ -1803,7 +1803,7 @@ static int stm32_spi_unprepare_msg(struct spi_controller *ctrl, } /** - * stm32fx_spi_config - Configure SPI controller as SPI master + * stm32fx_spi_config - Configure SPI controller as SPI host * @spi: pointer to the spi controller data structure */ static int stm32fx_spi_config(struct stm32_spi *spi) @@ -1819,8 +1819,8 @@ static int stm32fx_spi_config(struct stm32_spi *spi) /* * - SS input value high * - transmitter half duplex direction - * - Set the master mode (default Motorola mode) - * - Consider 1 master/n slaves configuration and + * - Set the host mode (default Motorola mode) + * - Consider 1 host/n targets configuration and * SS input value is determined by the SSI bit */ stm32_spi_set_bits(spi, STM32FX_SPI_CR1, STM32FX_SPI_CR1_SSI | @@ -1860,8 +1860,8 @@ static int stm32h7_spi_config(struct stm32_spi *spi) cr1 |= STM32H7_SPI_CR1_HDDIR | STM32H7_SPI_CR1_MASRX | STM32H7_SPI_CR1_SSI; /* - * - Set the master mode (default Motorola mode) - * - Consider 1 master/n devices configuration and + * - Set the host mode (default Motorola mode) + * - Consider 1 host/n devices configuration and * SS input value is determined by the SSI bit * - keep control of all associated GPIOs */ @@ -1977,9 +1977,9 @@ static int stm32_spi_probe(struct platform_device *pdev) } if (device_mode) - ctrl = devm_spi_alloc_slave(&pdev->dev, sizeof(struct stm32_spi)); + ctrl = devm_spi_alloc_target(&pdev->dev, sizeof(struct stm32_spi)); else - ctrl = devm_spi_alloc_master(&pdev->dev, sizeof(struct stm32_spi)); + ctrl = devm_spi_alloc_host(&pdev->dev, sizeof(struct stm32_spi)); if (!ctrl) { dev_err(&pdev->dev, "spi controller allocation failed\n"); return -ENOMEM; @@ -2070,7 +2070,7 @@ static int stm32_spi_probe(struct platform_device *pdev) ctrl->unprepare_message = stm32_spi_unprepare_msg; ctrl->flags = spi->cfg->flags; if (STM32_SPI_DEVICE_MODE(spi)) - ctrl->slave_abort = stm32h7_spi_device_abort; + ctrl->target_abort = stm32h7_spi_device_abort; spi->dma_tx = dma_request_chan(spi->dev, "tx"); if (IS_ERR(spi->dma_tx)) { @@ -2117,7 +2117,7 @@ static int stm32_spi_probe(struct platform_device *pdev) pm_runtime_put_autosuspend(&pdev->dev); dev_info(&pdev->dev, "driver initialized (%s mode)\n", - STM32_SPI_MASTER_MODE(spi) ? "master" : "device"); + STM32_SPI_HOST_MODE(spi) ? "host" : "device"); return 0; From 6d232cc8a7e59af0c083319827541966a68817a0 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:11 +0800 Subject: [PATCH 0622/1562] spi: sun4i: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-7-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-sun4i.c | 72 ++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index b8947265d329..11d8bd27b3e9 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -75,7 +75,7 @@ #define SUN4I_FIFO_STA_TF_CNT_BITS 16 struct sun4i_spi { - struct spi_master *master; + struct spi_controller *host; void __iomem *base_addr; struct clk *hclk; struct clk *mclk; @@ -161,7 +161,7 @@ static inline void sun4i_spi_fill_fifo(struct sun4i_spi *sspi, int len) static void sun4i_spi_set_cs(struct spi_device *spi, bool enable) { - struct sun4i_spi *sspi = spi_master_get_devdata(spi->master); + struct sun4i_spi *sspi = spi_controller_get_devdata(spi->controller); u32 reg; reg = sun4i_spi_read(sspi, SUN4I_CTL_REG); @@ -201,11 +201,11 @@ static size_t sun4i_spi_max_transfer_size(struct spi_device *spi) return SUN4I_MAX_XFER_SIZE - 1; } -static int sun4i_spi_transfer_one(struct spi_master *master, +static int sun4i_spi_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *tfr) { - struct sun4i_spi *sspi = spi_master_get_devdata(master); + struct sun4i_spi *sspi = spi_controller_get_devdata(host); unsigned int mclk_rate, div, timeout; unsigned int start, end, tx_time; unsigned int tx_len = 0; @@ -331,7 +331,7 @@ static int sun4i_spi_transfer_one(struct spi_master *master, msecs_to_jiffies(tx_time)); end = jiffies; if (!timeout) { - dev_warn(&master->dev, + dev_warn(&host->dev, "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", dev_name(&spi->dev), tfr->len, tfr->speed_hz, jiffies_to_msecs(end - start), tx_time); @@ -386,8 +386,8 @@ static irqreturn_t sun4i_spi_handler(int irq, void *dev_id) static int sun4i_spi_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct sun4i_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct sun4i_spi *sspi = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(sspi->hclk); @@ -415,8 +415,8 @@ out: static int sun4i_spi_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct sun4i_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct sun4i_spi *sspi = spi_controller_get_devdata(host); clk_disable_unprepare(sspi->mclk); clk_disable_unprepare(sspi->hclk); @@ -426,62 +426,62 @@ static int sun4i_spi_runtime_suspend(struct device *dev) static int sun4i_spi_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct sun4i_spi *sspi; int ret = 0, irq; - master = spi_alloc_master(&pdev->dev, sizeof(struct sun4i_spi)); - if (!master) { - dev_err(&pdev->dev, "Unable to allocate SPI Master\n"); + host = spi_alloc_host(&pdev->dev, sizeof(struct sun4i_spi)); + if (!host) { + dev_err(&pdev->dev, "Unable to allocate SPI Host\n"); return -ENOMEM; } - platform_set_drvdata(pdev, master); - sspi = spi_master_get_devdata(master); + platform_set_drvdata(pdev, host); + sspi = spi_controller_get_devdata(host); sspi->base_addr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(sspi->base_addr)) { ret = PTR_ERR(sspi->base_addr); - goto err_free_master; + goto err_free_host; } irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = -ENXIO; - goto err_free_master; + goto err_free_host; } ret = devm_request_irq(&pdev->dev, irq, sun4i_spi_handler, 0, "sun4i-spi", sspi); if (ret) { dev_err(&pdev->dev, "Cannot request IRQ\n"); - goto err_free_master; + goto err_free_host; } - sspi->master = master; - master->max_speed_hz = 100 * 1000 * 1000; - master->min_speed_hz = 3 * 1000; - master->set_cs = sun4i_spi_set_cs; - master->transfer_one = sun4i_spi_transfer_one; - master->num_chipselect = 4; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; - master->bits_per_word_mask = SPI_BPW_MASK(8); - master->dev.of_node = pdev->dev.of_node; - master->auto_runtime_pm = true; - master->max_transfer_size = sun4i_spi_max_transfer_size; + sspi->host = host; + host->max_speed_hz = 100 * 1000 * 1000; + host->min_speed_hz = 3 * 1000; + host->set_cs = sun4i_spi_set_cs; + host->transfer_one = sun4i_spi_transfer_one; + host->num_chipselect = 4; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; + host->bits_per_word_mask = SPI_BPW_MASK(8); + host->dev.of_node = pdev->dev.of_node; + host->auto_runtime_pm = true; + host->max_transfer_size = sun4i_spi_max_transfer_size; sspi->hclk = devm_clk_get(&pdev->dev, "ahb"); if (IS_ERR(sspi->hclk)) { dev_err(&pdev->dev, "Unable to acquire AHB clock\n"); ret = PTR_ERR(sspi->hclk); - goto err_free_master; + goto err_free_host; } sspi->mclk = devm_clk_get(&pdev->dev, "mod"); if (IS_ERR(sspi->mclk)) { dev_err(&pdev->dev, "Unable to acquire module clock\n"); ret = PTR_ERR(sspi->mclk); - goto err_free_master; + goto err_free_host; } init_completion(&sspi->done); @@ -493,16 +493,16 @@ static int sun4i_spi_probe(struct platform_device *pdev) ret = sun4i_spi_runtime_resume(&pdev->dev); if (ret) { dev_err(&pdev->dev, "Couldn't resume the device\n"); - goto err_free_master; + goto err_free_host; } pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); pm_runtime_idle(&pdev->dev); - ret = devm_spi_register_master(&pdev->dev, master); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) { - dev_err(&pdev->dev, "cannot register SPI master\n"); + dev_err(&pdev->dev, "cannot register SPI host\n"); goto err_pm_disable; } @@ -511,8 +511,8 @@ static int sun4i_spi_probe(struct platform_device *pdev) err_pm_disable: pm_runtime_disable(&pdev->dev); sun4i_spi_runtime_suspend(&pdev->dev); -err_free_master: - spi_master_put(master); +err_free_host: + spi_controller_put(host); return ret; } From 9f55bb79893a9dc75982372bee1307bdce48976b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:12 +0800 Subject: [PATCH 0623/1562] spi: sun6i: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-8-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-sun6i.c | 148 ++++++++++++++++++++-------------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c index fddc63309773..cd018ea1abf1 100644 --- a/drivers/spi/spi-sun6i.c +++ b/drivers/spi/spi-sun6i.c @@ -97,7 +97,7 @@ struct sun6i_spi_cfg { }; struct sun6i_spi { - struct spi_master *master; + struct spi_controller *host; void __iomem *base_addr; dma_addr_t dma_addr_rx; dma_addr_t dma_addr_tx; @@ -181,7 +181,7 @@ static inline void sun6i_spi_fill_fifo(struct sun6i_spi *sspi) static void sun6i_spi_set_cs(struct spi_device *spi, bool enable) { - struct sun6i_spi *sspi = spi_master_get_devdata(spi->master); + struct sun6i_spi *sspi = spi_controller_get_devdata(spi->controller); u32 reg; reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); @@ -212,7 +212,7 @@ static int sun6i_spi_prepare_dma(struct sun6i_spi *sspi, struct spi_transfer *tfr) { struct dma_async_tx_descriptor *rxdesc, *txdesc; - struct spi_master *master = sspi->master; + struct spi_controller *host = sspi->host; rxdesc = NULL; if (tfr->rx_buf) { @@ -223,9 +223,9 @@ static int sun6i_spi_prepare_dma(struct sun6i_spi *sspi, .src_maxburst = 8, }; - dmaengine_slave_config(master->dma_rx, &rxconf); + dmaengine_slave_config(host->dma_rx, &rxconf); - rxdesc = dmaengine_prep_slave_sg(master->dma_rx, + rxdesc = dmaengine_prep_slave_sg(host->dma_rx, tfr->rx_sg.sgl, tfr->rx_sg.nents, DMA_DEV_TO_MEM, @@ -245,38 +245,38 @@ static int sun6i_spi_prepare_dma(struct sun6i_spi *sspi, .dst_maxburst = 8, }; - dmaengine_slave_config(master->dma_tx, &txconf); + dmaengine_slave_config(host->dma_tx, &txconf); - txdesc = dmaengine_prep_slave_sg(master->dma_tx, + txdesc = dmaengine_prep_slave_sg(host->dma_tx, tfr->tx_sg.sgl, tfr->tx_sg.nents, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT); if (!txdesc) { if (rxdesc) - dmaengine_terminate_sync(master->dma_rx); + dmaengine_terminate_sync(host->dma_rx); return -EINVAL; } } if (tfr->rx_buf) { dmaengine_submit(rxdesc); - dma_async_issue_pending(master->dma_rx); + dma_async_issue_pending(host->dma_rx); } if (tfr->tx_buf) { dmaengine_submit(txdesc); - dma_async_issue_pending(master->dma_tx); + dma_async_issue_pending(host->dma_tx); } return 0; } -static int sun6i_spi_transfer_one(struct spi_master *master, +static int sun6i_spi_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *tfr) { - struct sun6i_spi *sspi = spi_master_get_devdata(master); + struct sun6i_spi *sspi = spi_controller_get_devdata(host); unsigned int div, div_cdr1, div_cdr2, timeout; unsigned int start, end, tx_time; unsigned int trig_level; @@ -293,7 +293,7 @@ static int sun6i_spi_transfer_one(struct spi_master *master, sspi->tx_buf = tfr->tx_buf; sspi->rx_buf = tfr->rx_buf; sspi->len = tfr->len; - use_dma = master->can_dma ? master->can_dma(master, spi, tfr) : false; + use_dma = host->can_dma ? host->can_dma(host, spi, tfr) : false; /* Clear pending interrupts */ sun6i_spi_write(sspi, SUN6I_INT_STA_REG, ~0); @@ -463,7 +463,7 @@ static int sun6i_spi_transfer_one(struct spi_master *master, } else { ret = sun6i_spi_prepare_dma(sspi, tfr); if (ret) { - dev_warn(&master->dev, + dev_warn(&host->dev, "%s: prepare DMA failed, ret=%d", dev_name(&spi->dev), ret); return ret; @@ -486,7 +486,7 @@ static int sun6i_spi_transfer_one(struct spi_master *master, reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); sun6i_spi_write(sspi, SUN6I_TFR_CTL_REG, reg | SUN6I_TFR_CTL_XCH); - tx_time = spi_controller_xfer_timeout(master, tfr); + tx_time = spi_controller_xfer_timeout(host, tfr); start = jiffies; timeout = wait_for_completion_timeout(&sspi->done, msecs_to_jiffies(tx_time)); @@ -502,13 +502,13 @@ static int sun6i_spi_transfer_one(struct spi_master *master, timeout = wait_for_completion_timeout(&sspi->dma_rx_done, timeout); if (!timeout) - dev_warn(&master->dev, "RX DMA timeout\n"); + dev_warn(&host->dev, "RX DMA timeout\n"); } } end = jiffies; if (!timeout) { - dev_warn(&master->dev, + dev_warn(&host->dev, "%s: timeout transferring %u bytes@%iHz for %i(%i)ms", dev_name(&spi->dev), tfr->len, tfr->speed_hz, jiffies_to_msecs(end - start), tx_time); @@ -518,8 +518,8 @@ static int sun6i_spi_transfer_one(struct spi_master *master, sun6i_spi_write(sspi, SUN6I_INT_CTL_REG, 0); if (ret && use_dma) { - dmaengine_terminate_sync(master->dma_rx); - dmaengine_terminate_sync(master->dma_tx); + dmaengine_terminate_sync(host->dma_rx); + dmaengine_terminate_sync(host->dma_tx); } return ret; @@ -564,8 +564,8 @@ static irqreturn_t sun6i_spi_handler(int irq, void *dev_id) static int sun6i_spi_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct sun6i_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct sun6i_spi *sspi = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(sspi->hclk); @@ -601,8 +601,8 @@ out: static int sun6i_spi_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct sun6i_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct sun6i_spi *sspi = spi_controller_get_devdata(host); reset_control_assert(sspi->rstc); clk_disable_unprepare(sspi->mclk); @@ -611,11 +611,11 @@ static int sun6i_spi_runtime_suspend(struct device *dev) return 0; } -static bool sun6i_spi_can_dma(struct spi_master *master, +static bool sun6i_spi_can_dma(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *xfer) { - struct sun6i_spi *sspi = spi_master_get_devdata(master); + struct sun6i_spi *sspi = spi_controller_get_devdata(host); /* * If the number of spi words to transfer is less or equal than @@ -627,67 +627,67 @@ static bool sun6i_spi_can_dma(struct spi_master *master, static int sun6i_spi_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct sun6i_spi *sspi; struct resource *mem; int ret = 0, irq; - master = spi_alloc_master(&pdev->dev, sizeof(struct sun6i_spi)); - if (!master) { - dev_err(&pdev->dev, "Unable to allocate SPI Master\n"); + host = spi_alloc_host(&pdev->dev, sizeof(struct sun6i_spi)); + if (!host) { + dev_err(&pdev->dev, "Unable to allocate SPI Host\n"); return -ENOMEM; } - platform_set_drvdata(pdev, master); - sspi = spi_master_get_devdata(master); + platform_set_drvdata(pdev, host); + sspi = spi_controller_get_devdata(host); sspi->base_addr = devm_platform_get_and_ioremap_resource(pdev, 0, &mem); if (IS_ERR(sspi->base_addr)) { ret = PTR_ERR(sspi->base_addr); - goto err_free_master; + goto err_free_host; } irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = -ENXIO; - goto err_free_master; + goto err_free_host; } ret = devm_request_irq(&pdev->dev, irq, sun6i_spi_handler, 0, "sun6i-spi", sspi); if (ret) { dev_err(&pdev->dev, "Cannot request IRQ\n"); - goto err_free_master; + goto err_free_host; } - sspi->master = master; + sspi->host = host; sspi->cfg = of_device_get_match_data(&pdev->dev); - master->max_speed_hz = 100 * 1000 * 1000; - master->min_speed_hz = 3 * 1000; - master->use_gpio_descriptors = true; - master->set_cs = sun6i_spi_set_cs; - master->transfer_one = sun6i_spi_transfer_one; - master->num_chipselect = 4; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST | - sspi->cfg->mode_bits; - master->bits_per_word_mask = SPI_BPW_MASK(8); - master->dev.of_node = pdev->dev.of_node; - master->auto_runtime_pm = true; - master->max_transfer_size = sun6i_spi_max_transfer_size; + host->max_speed_hz = 100 * 1000 * 1000; + host->min_speed_hz = 3 * 1000; + host->use_gpio_descriptors = true; + host->set_cs = sun6i_spi_set_cs; + host->transfer_one = sun6i_spi_transfer_one; + host->num_chipselect = 4; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST | + sspi->cfg->mode_bits; + host->bits_per_word_mask = SPI_BPW_MASK(8); + host->dev.of_node = pdev->dev.of_node; + host->auto_runtime_pm = true; + host->max_transfer_size = sun6i_spi_max_transfer_size; sspi->hclk = devm_clk_get(&pdev->dev, "ahb"); if (IS_ERR(sspi->hclk)) { dev_err(&pdev->dev, "Unable to acquire AHB clock\n"); ret = PTR_ERR(sspi->hclk); - goto err_free_master; + goto err_free_host; } sspi->mclk = devm_clk_get(&pdev->dev, "mod"); if (IS_ERR(sspi->mclk)) { dev_err(&pdev->dev, "Unable to acquire module clock\n"); ret = PTR_ERR(sspi->mclk); - goto err_free_master; + goto err_free_host; } init_completion(&sspi->done); @@ -697,34 +697,34 @@ static int sun6i_spi_probe(struct platform_device *pdev) if (IS_ERR(sspi->rstc)) { dev_err(&pdev->dev, "Couldn't get reset controller\n"); ret = PTR_ERR(sspi->rstc); - goto err_free_master; + goto err_free_host; } - master->dma_tx = dma_request_chan(&pdev->dev, "tx"); - if (IS_ERR(master->dma_tx)) { + host->dma_tx = dma_request_chan(&pdev->dev, "tx"); + if (IS_ERR(host->dma_tx)) { /* Check tx to see if we need defer probing driver */ - if (PTR_ERR(master->dma_tx) == -EPROBE_DEFER) { + if (PTR_ERR(host->dma_tx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; - goto err_free_master; + goto err_free_host; } dev_warn(&pdev->dev, "Failed to request TX DMA channel\n"); - master->dma_tx = NULL; + host->dma_tx = NULL; } - master->dma_rx = dma_request_chan(&pdev->dev, "rx"); - if (IS_ERR(master->dma_rx)) { - if (PTR_ERR(master->dma_rx) == -EPROBE_DEFER) { + host->dma_rx = dma_request_chan(&pdev->dev, "rx"); + if (IS_ERR(host->dma_rx)) { + if (PTR_ERR(host->dma_rx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto err_free_dma_tx; } dev_warn(&pdev->dev, "Failed to request RX DMA channel\n"); - master->dma_rx = NULL; + host->dma_rx = NULL; } - if (master->dma_tx && master->dma_rx) { + if (host->dma_tx && host->dma_rx) { sspi->dma_addr_tx = mem->start + SUN6I_TXDATA_REG; sspi->dma_addr_rx = mem->start + SUN6I_RXDATA_REG; - master->can_dma = sun6i_spi_can_dma; + host->can_dma = sun6i_spi_can_dma; } /* @@ -742,9 +742,9 @@ static int sun6i_spi_probe(struct platform_device *pdev) pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); - ret = devm_spi_register_master(&pdev->dev, master); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) { - dev_err(&pdev->dev, "cannot register SPI master\n"); + dev_err(&pdev->dev, "cannot register SPI host\n"); goto err_pm_disable; } @@ -754,26 +754,26 @@ err_pm_disable: pm_runtime_disable(&pdev->dev); sun6i_spi_runtime_suspend(&pdev->dev); err_free_dma_rx: - if (master->dma_rx) - dma_release_channel(master->dma_rx); + if (host->dma_rx) + dma_release_channel(host->dma_rx); err_free_dma_tx: - if (master->dma_tx) - dma_release_channel(master->dma_tx); -err_free_master: - spi_master_put(master); + if (host->dma_tx) + dma_release_channel(host->dma_tx); +err_free_host: + spi_controller_put(host); return ret; } static void sun6i_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); + struct spi_controller *host = platform_get_drvdata(pdev); pm_runtime_force_suspend(&pdev->dev); - if (master->dma_tx) - dma_release_channel(master->dma_tx); - if (master->dma_rx) - dma_release_channel(master->dma_rx); + if (host->dma_tx) + dma_release_channel(host->dma_tx); + if (host->dma_rx) + dma_release_channel(host->dma_rx); } static const struct sun6i_spi_cfg sun6i_a31_spi_cfg = { From 90bbb007a06aa7b0f428a89531dec064ec584d8a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:13 +0800 Subject: [PATCH 0624/1562] spi: sunplus-sp7021: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-9-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-sunplus-sp7021.c | 88 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/drivers/spi/spi-sunplus-sp7021.c b/drivers/spi/spi-sunplus-sp7021.c index eb8f835a4771..4e481380c259 100644 --- a/drivers/spi/spi-sunplus-sp7021.c +++ b/drivers/spi/spi-sunplus-sp7021.c @@ -70,8 +70,8 @@ #define SP7021_FIFO_DATA_LEN (16) enum { - SP7021_MASTER_MODE = 0, - SP7021_SLAVE_MODE = 1, + SP7021_HOST_MODE = 0, + SP7021_TARGET_MODE = 1, }; struct sp7021_spi_ctlr { @@ -88,7 +88,7 @@ struct sp7021_spi_ctlr { // data xfer lock struct mutex buf_lock; struct completion isr_done; - struct completion slave_isr; + struct completion target_isr; unsigned int rx_cur_len; unsigned int tx_cur_len; unsigned int data_unit; @@ -96,7 +96,7 @@ struct sp7021_spi_ctlr { u8 *rx_buf; }; -static irqreturn_t sp7021_spi_slave_irq(int irq, void *dev) +static irqreturn_t sp7021_spi_target_irq(int irq, void *dev) { struct sp7021_spi_ctlr *pspim = dev; unsigned int data_status; @@ -104,25 +104,25 @@ static irqreturn_t sp7021_spi_slave_irq(int irq, void *dev) data_status = readl(pspim->s_base + SP7021_DATA_RDY_REG); data_status |= SP7021_SLAVE_CLR_INT; writel(data_status , pspim->s_base + SP7021_DATA_RDY_REG); - complete(&pspim->slave_isr); + complete(&pspim->target_isr); return IRQ_HANDLED; } -static int sp7021_spi_slave_abort(struct spi_controller *ctlr) +static int sp7021_spi_target_abort(struct spi_controller *ctlr) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); - complete(&pspim->slave_isr); + complete(&pspim->target_isr); complete(&pspim->isr_done); return 0; } -static int sp7021_spi_slave_tx(struct spi_device *spi, struct spi_transfer *xfer) +static int sp7021_spi_target_tx(struct spi_device *spi, struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(spi->controller); u32 value; - reinit_completion(&pspim->slave_isr); + reinit_completion(&pspim->target_isr); value = SP7021_SLAVE_DMA_EN | SP7021_SLAVE_DMA_RW | FIELD_PREP(SP7021_SLAVE_DMA_CMD, 3); writel(value, pspim->s_base + SP7021_SLAVE_DMA_CTRL_REG); writel(xfer->len, pspim->s_base + SP7021_SLAVE_DMA_LENGTH_REG); @@ -137,7 +137,7 @@ static int sp7021_spi_slave_tx(struct spi_device *spi, struct spi_transfer *xfer return 0; } -static int sp7021_spi_slave_rx(struct spi_device *spi, struct spi_transfer *xfer) +static int sp7021_spi_target_rx(struct spi_device *spi, struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(spi->controller); u32 value; @@ -155,7 +155,7 @@ static int sp7021_spi_slave_rx(struct spi_device *spi, struct spi_transfer *xfer return 0; } -static void sp7021_spi_master_rb(struct sp7021_spi_ctlr *pspim, unsigned int len) +static void sp7021_spi_host_rb(struct sp7021_spi_ctlr *pspim, unsigned int len) { int i; @@ -166,7 +166,7 @@ static void sp7021_spi_master_rb(struct sp7021_spi_ctlr *pspim, unsigned int len } } -static void sp7021_spi_master_wb(struct sp7021_spi_ctlr *pspim, unsigned int len) +static void sp7021_spi_host_wb(struct sp7021_spi_ctlr *pspim, unsigned int len) { int i; @@ -177,7 +177,7 @@ static void sp7021_spi_master_wb(struct sp7021_spi_ctlr *pspim, unsigned int len } } -static irqreturn_t sp7021_spi_master_irq(int irq, void *dev) +static irqreturn_t sp7021_spi_host_irq(int irq, void *dev) { struct sp7021_spi_ctlr *pspim = dev; unsigned int tx_cnt, total_len; @@ -206,9 +206,9 @@ static irqreturn_t sp7021_spi_master_irq(int irq, void *dev) fd_status, rx_cnt, tx_cnt, tx_len); if (rx_cnt > 0) - sp7021_spi_master_rb(pspim, rx_cnt); + sp7021_spi_host_rb(pspim, rx_cnt); if (tx_cnt > 0) - sp7021_spi_master_wb(pspim, tx_cnt); + sp7021_spi_host_wb(pspim, tx_cnt); fd_status = readl(pspim->m_base + SP7021_SPI_STATUS_REG); tx_len = FIELD_GET(SP7021_TX_LEN_MASK, fd_status); @@ -224,7 +224,7 @@ static irqreturn_t sp7021_spi_master_irq(int irq, void *dev) rx_cnt = FIELD_GET(SP7021_RX_CNT_MASK, fd_status); if (rx_cnt > 0) - sp7021_spi_master_rb(pspim, rx_cnt); + sp7021_spi_host_rb(pspim, rx_cnt); } value = readl(pspim->m_base + SP7021_INT_BUSY_REG); value |= SP7021_CLR_MASTER_INT; @@ -240,7 +240,7 @@ static irqreturn_t sp7021_spi_master_irq(int irq, void *dev) static void sp7021_prep_transfer(struct spi_controller *ctlr, struct spi_device *spi) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); pspim->tx_cur_len = 0; pspim->rx_cur_len = 0; @@ -251,7 +251,7 @@ static void sp7021_prep_transfer(struct spi_controller *ctlr, struct spi_device static int sp7021_spi_controller_prepare_message(struct spi_controller *ctlr, struct spi_message *msg) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); struct spi_device *s = msg->spi; u32 valus, rs = 0; @@ -283,7 +283,7 @@ static int sp7021_spi_controller_prepare_message(struct spi_controller *ctlr, static void sp7021_spi_setup_clk(struct spi_controller *ctlr, struct spi_transfer *xfer) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); u32 clk_rate, clk_sel, div; clk_rate = clk_get_rate(pspim->spi_clk); @@ -295,10 +295,10 @@ static void sp7021_spi_setup_clk(struct spi_controller *ctlr, struct spi_transfe writel(pspim->xfer_conf, pspim->m_base + SP7021_SPI_CONFIG_REG); } -static int sp7021_spi_master_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, +static int sp7021_spi_host_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *xfer) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); unsigned long timeout = msecs_to_jiffies(1000); unsigned int xfer_cnt, xfer_len, last_len; unsigned int i, len_temp; @@ -323,7 +323,7 @@ static int sp7021_spi_master_transfer_one(struct spi_controller *ctlr, struct sp if (pspim->tx_cur_len < xfer_len) { len_temp = min(pspim->data_unit, xfer_len); - sp7021_spi_master_wb(pspim, len_temp); + sp7021_spi_host_wb(pspim, len_temp); } reg_temp = readl(pspim->m_base + SP7021_SPI_CONFIG_REG); reg_temp &= ~SP7021_CLEAN_RW_BYTE; @@ -359,10 +359,10 @@ static int sp7021_spi_master_transfer_one(struct spi_controller *ctlr, struct sp return 0; } -static int sp7021_spi_slave_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, +static int sp7021_spi_target_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *xfer) { - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); struct device *dev = pspim->dev; int ret; @@ -371,14 +371,14 @@ static int sp7021_spi_slave_transfer_one(struct spi_controller *ctlr, struct spi xfer->len, DMA_TO_DEVICE); if (dma_mapping_error(dev, xfer->tx_dma)) return -ENOMEM; - ret = sp7021_spi_slave_tx(spi, xfer); + ret = sp7021_spi_target_tx(spi, xfer); dma_unmap_single(dev, xfer->tx_dma, xfer->len, DMA_TO_DEVICE); } else if (xfer->rx_buf && !xfer->tx_buf) { xfer->rx_dma = dma_map_single(dev, xfer->rx_buf, xfer->len, DMA_FROM_DEVICE); if (dma_mapping_error(dev, xfer->rx_dma)) return -ENOMEM; - ret = sp7021_spi_slave_rx(spi, xfer); + ret = sp7021_spi_target_rx(spi, xfer); dma_unmap_single(dev, xfer->rx_dma, xfer->len, DMA_FROM_DEVICE); } else { dev_dbg(&ctlr->dev, "%s() wrong command\n", __func__); @@ -409,14 +409,14 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) pdev->id = of_alias_get_id(pdev->dev.of_node, "sp_spi"); if (device_property_read_bool(dev, "spi-slave")) - mode = SP7021_SLAVE_MODE; + mode = SP7021_TARGET_MODE; else - mode = SP7021_MASTER_MODE; + mode = SP7021_HOST_MODE; - if (mode == SP7021_SLAVE_MODE) - ctlr = devm_spi_alloc_slave(dev, sizeof(*pspim)); + if (mode == SP7021_TARGET_MODE) + ctlr = devm_spi_alloc_target(dev, sizeof(*pspim)); else - ctlr = devm_spi_alloc_master(dev, sizeof(*pspim)); + ctlr = devm_spi_alloc_host(dev, sizeof(*pspim)); if (!ctlr) return -ENOMEM; device_set_node(&ctlr->dev, dev_fwnode(dev)); @@ -424,9 +424,9 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) ctlr->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; ctlr->auto_runtime_pm = true; ctlr->prepare_message = sp7021_spi_controller_prepare_message; - if (mode == SP7021_SLAVE_MODE) { - ctlr->transfer_one = sp7021_spi_slave_transfer_one; - ctlr->slave_abort = sp7021_spi_slave_abort; + if (mode == SP7021_TARGET_MODE) { + ctlr->transfer_one = sp7021_spi_target_transfer_one; + ctlr->target_abort = sp7021_spi_target_abort; ctlr->flags = SPI_CONTROLLER_HALF_DUPLEX; } else { ctlr->bits_per_word_mask = SPI_BPW_MASK(8); @@ -434,7 +434,7 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) ctlr->max_speed_hz = 25000000; ctlr->use_gpio_descriptors = true; ctlr->flags = SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX; - ctlr->transfer_one = sp7021_spi_master_transfer_one; + ctlr->transfer_one = sp7021_spi_host_transfer_one; } platform_set_drvdata(pdev, ctlr); pspim = spi_controller_get_devdata(ctlr); @@ -443,7 +443,7 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) pspim->dev = dev; mutex_init(&pspim->buf_lock); init_completion(&pspim->isr_done); - init_completion(&pspim->slave_isr); + init_completion(&pspim->target_isr); pspim->m_base = devm_platform_ioremap_resource_byname(pdev, "master"); if (IS_ERR(pspim->m_base)) @@ -485,12 +485,12 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) if (ret) return ret; - ret = devm_request_irq(dev, pspim->m_irq, sp7021_spi_master_irq, + ret = devm_request_irq(dev, pspim->m_irq, sp7021_spi_host_irq, IRQF_TRIGGER_RISING, pdev->name, pspim); if (ret) return ret; - ret = devm_request_irq(dev, pspim->s_irq, sp7021_spi_slave_irq, + ret = devm_request_irq(dev, pspim->s_irq, sp7021_spi_target_irq, IRQF_TRIGGER_RISING, pdev->name, pspim); if (ret) return ret; @@ -499,7 +499,7 @@ static int sp7021_spi_controller_probe(struct platform_device *pdev) ret = spi_register_controller(ctlr); if (ret) { pm_runtime_disable(dev); - return dev_err_probe(dev, ret, "spi_register_master fail\n"); + return dev_err_probe(dev, ret, "spi_register_controller fail\n"); } return 0; } @@ -516,7 +516,7 @@ static void sp7021_spi_controller_remove(struct platform_device *pdev) static int __maybe_unused sp7021_spi_controller_suspend(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); return reset_control_assert(pspim->rstc); } @@ -524,7 +524,7 @@ static int __maybe_unused sp7021_spi_controller_suspend(struct device *dev) static int __maybe_unused sp7021_spi_controller_resume(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); reset_control_deassert(pspim->rstc); return clk_prepare_enable(pspim->spi_clk); @@ -534,7 +534,7 @@ static int __maybe_unused sp7021_spi_controller_resume(struct device *dev) static int sp7021_spi_runtime_suspend(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); return reset_control_assert(pspim->rstc); } @@ -542,7 +542,7 @@ static int sp7021_spi_runtime_suspend(struct device *dev) static int sp7021_spi_runtime_resume(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); - struct sp7021_spi_ctlr *pspim = spi_master_get_devdata(ctlr); + struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); return reset_control_deassert(pspim->rstc); } From 3524d1b727a66712f02f92807219a3650e5cf910 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:14 +0800 Subject: [PATCH 0625/1562] spi: synquacer: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-10-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-synquacer.c | 82 ++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/spi/spi-synquacer.c b/drivers/spi/spi-synquacer.c index aeaf7db022f0..7cb4301a6fb2 100644 --- a/drivers/spi/spi-synquacer.c +++ b/drivers/spi/spi-synquacer.c @@ -225,11 +225,11 @@ static int write_fifo(struct synquacer_spi *sspi) return 0; } -static int synquacer_spi_config(struct spi_master *master, +static int synquacer_spi_config(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *xfer) { - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); unsigned int speed, mode, bpw, cs, bus_width, transfer_mode; u32 rate, val, div; @@ -263,7 +263,7 @@ static int synquacer_spi_config(struct spi_master *master, } sspi->transfer_mode = transfer_mode; - rate = master->max_speed_hz; + rate = host->max_speed_hz; div = DIV_ROUND_UP(rate, speed); if (div > 254) { @@ -350,11 +350,11 @@ static int synquacer_spi_config(struct spi_master *master, return 0; } -static int synquacer_spi_transfer_one(struct spi_master *master, +static int synquacer_spi_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *xfer) { - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); int ret; int status = 0; u32 words; @@ -378,7 +378,7 @@ static int synquacer_spi_transfer_one(struct spi_master *master, if (bpw == 8 && !(xfer->len % 4) && !(spi->mode & SPI_LSB_FIRST)) xfer->bits_per_word = 32; - ret = synquacer_spi_config(master, spi, xfer); + ret = synquacer_spi_config(host, spi, xfer); /* restore */ xfer->bits_per_word = bpw; @@ -482,7 +482,7 @@ static int synquacer_spi_transfer_one(struct spi_master *master, static void synquacer_spi_set_cs(struct spi_device *spi, bool enable) { - struct synquacer_spi *sspi = spi_master_get_devdata(spi->master); + struct synquacer_spi *sspi = spi_controller_get_devdata(spi->controller); u32 val; val = readl(sspi->regs + SYNQUACER_HSSPI_REG_DMSTART); @@ -517,11 +517,11 @@ static int synquacer_spi_wait_status_update(struct synquacer_spi *sspi, return -EBUSY; } -static int synquacer_spi_enable(struct spi_master *master) +static int synquacer_spi_enable(struct spi_controller *host) { u32 val; int status; - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); /* Disable module */ writel(0, sspi->regs + SYNQUACER_HSSPI_REG_MCTRL); @@ -601,18 +601,18 @@ static irqreturn_t sq_spi_tx_handler(int irq, void *priv) static int synquacer_spi_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - struct spi_master *master; + struct spi_controller *host; struct synquacer_spi *sspi; int ret; int rx_irq, tx_irq; - master = spi_alloc_master(&pdev->dev, sizeof(*sspi)); - if (!master) + host = spi_alloc_host(&pdev->dev, sizeof(*sspi)); + if (!host) return -ENOMEM; - platform_set_drvdata(pdev, master); + platform_set_drvdata(pdev, host); - sspi = spi_master_get_devdata(master); + sspi = spi_controller_get_devdata(host); sspi->dev = &pdev->dev; init_completion(&sspi->transfer_done); @@ -625,7 +625,7 @@ static int synquacer_spi_probe(struct platform_device *pdev) sspi->clk_src_type = SYNQUACER_HSSPI_CLOCK_SRC_IHCLK; /* Default */ device_property_read_u32(&pdev->dev, "socionext,ihclk-rate", - &master->max_speed_hz); /* for ACPI */ + &host->max_speed_hz); /* for ACPI */ if (dev_of_node(&pdev->dev)) { if (device_property_match_string(&pdev->dev, @@ -655,21 +655,21 @@ static int synquacer_spi_probe(struct platform_device *pdev) goto put_spi; } - master->max_speed_hz = clk_get_rate(sspi->clk); + host->max_speed_hz = clk_get_rate(sspi->clk); } - if (!master->max_speed_hz) { + if (!host->max_speed_hz) { dev_err(&pdev->dev, "missing clock source\n"); ret = -EINVAL; goto disable_clk; } - master->min_speed_hz = master->max_speed_hz / 254; + host->min_speed_hz = host->max_speed_hz / 254; sspi->aces = device_property_read_bool(&pdev->dev, "socionext,set-aces"); sspi->rtm = device_property_read_bool(&pdev->dev, "socionext,use-rtm"); - master->num_chipselect = SYNQUACER_HSSPI_NUM_CHIP_SELECT; + host->num_chipselect = SYNQUACER_HSSPI_NUM_CHIP_SELECT; rx_irq = platform_get_irq(pdev, 0); if (rx_irq <= 0) { @@ -699,27 +699,27 @@ static int synquacer_spi_probe(struct platform_device *pdev) goto disable_clk; } - master->dev.of_node = np; - master->dev.fwnode = pdev->dev.fwnode; - master->auto_runtime_pm = true; - master->bus_num = pdev->id; + host->dev.of_node = np; + host->dev.fwnode = pdev->dev.fwnode; + host->auto_runtime_pm = true; + host->bus_num = pdev->id; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_TX_DUAL | SPI_RX_DUAL | - SPI_TX_QUAD | SPI_RX_QUAD; - master->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(24) | - SPI_BPW_MASK(16) | SPI_BPW_MASK(8); + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_TX_DUAL | SPI_RX_DUAL | + SPI_TX_QUAD | SPI_RX_QUAD; + host->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(24) | + SPI_BPW_MASK(16) | SPI_BPW_MASK(8); - master->set_cs = synquacer_spi_set_cs; - master->transfer_one = synquacer_spi_transfer_one; + host->set_cs = synquacer_spi_set_cs; + host->transfer_one = synquacer_spi_transfer_one; - ret = synquacer_spi_enable(master); + ret = synquacer_spi_enable(host); if (ret) goto disable_clk; pm_runtime_set_active(sspi->dev); pm_runtime_enable(sspi->dev); - ret = devm_spi_register_master(sspi->dev, master); + ret = devm_spi_register_controller(sspi->dev, host); if (ret) goto disable_pm; @@ -730,15 +730,15 @@ disable_pm: disable_clk: clk_disable_unprepare(sspi->clk); put_spi: - spi_master_put(master); + spi_controller_put(host); return ret; } static void synquacer_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); pm_runtime_disable(sspi->dev); @@ -747,11 +747,11 @@ static void synquacer_spi_remove(struct platform_device *pdev) static int __maybe_unused synquacer_spi_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); int ret; - ret = spi_master_suspend(master); + ret = spi_controller_suspend(host); if (ret) return ret; @@ -763,8 +763,8 @@ static int __maybe_unused synquacer_spi_suspend(struct device *dev) static int __maybe_unused synquacer_spi_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct synquacer_spi *sspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct synquacer_spi *sspi = spi_controller_get_devdata(host); int ret; if (!pm_runtime_suspended(dev)) { @@ -778,7 +778,7 @@ static int __maybe_unused synquacer_spi_resume(struct device *dev) return ret; } - ret = synquacer_spi_enable(master); + ret = synquacer_spi_enable(host); if (ret) { clk_disable_unprepare(sspi->clk); dev_err(dev, "failed to enable spi (%d)\n", ret); @@ -786,7 +786,7 @@ static int __maybe_unused synquacer_spi_resume(struct device *dev) } } - ret = spi_master_resume(master); + ret = spi_controller_resume(host); if (ret < 0) clk_disable_unprepare(sspi->clk); From 8726bdcef62eac46c80830e6154c442fbca6d928 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:15 +0800 Subject: [PATCH 0626/1562] spi: geni-qcom: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-11-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 96 ++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index b956a32a4162..15f84e68d4d2 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -145,10 +145,10 @@ static int get_spi_clk_cfg(unsigned int speed_hz, return ret; } -static void handle_se_timeout(struct spi_master *spi, - struct spi_message *msg) +static void handle_se_timeout(struct spi_controller *spi, + struct spi_message *msg) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); unsigned long time_left; struct geni_se *se = &mas->se; const struct spi_transfer *xfer; @@ -160,9 +160,9 @@ static void handle_se_timeout(struct spi_master *spi, xfer = mas->cur_xfer; mas->cur_xfer = NULL; - if (spi->slave) { + if (spi->target) { /* - * skip CMD Cancel sequnece since spi slave + * skip CMD Cancel sequnece since spi target * doesn`t support CMD Cancel sequnece */ spin_unlock_irq(&mas->lock); @@ -225,17 +225,17 @@ reset_if_dma: } } -static void handle_gpi_timeout(struct spi_master *spi, struct spi_message *msg) +static void handle_gpi_timeout(struct spi_controller *spi, struct spi_message *msg) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); dmaengine_terminate_sync(mas->tx); dmaengine_terminate_sync(mas->rx); } -static void spi_geni_handle_err(struct spi_master *spi, struct spi_message *msg) +static void spi_geni_handle_err(struct spi_controller *spi, struct spi_message *msg) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); switch (mas->cur_xfer_mode) { case GENI_SE_FIFO: @@ -286,8 +286,8 @@ static bool spi_geni_is_abort_still_pending(struct spi_geni_master *mas) static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) { - struct spi_geni_master *mas = spi_master_get_devdata(slv->master); - struct spi_master *spi = dev_get_drvdata(mas->dev); + struct spi_geni_master *mas = spi_controller_get_devdata(slv->controller); + struct spi_controller *spi = dev_get_drvdata(mas->dev); struct geni_se *se = &mas->se; unsigned long time_left; @@ -395,9 +395,9 @@ static int geni_spi_set_clock_and_bw(struct spi_geni_master *mas, } static int setup_fifo_params(struct spi_device *spi_slv, - struct spi_master *spi) + struct spi_controller *spi) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); struct geni_se *se = &mas->se; u32 loopback_cfg = 0, cpol = 0, cpha = 0, demux_output_inv = 0; u32 demux_sel; @@ -434,7 +434,7 @@ static int setup_fifo_params(struct spi_device *spi_slv, static void spi_gsi_callback_result(void *cb, const struct dmaengine_result *result) { - struct spi_master *spi = cb; + struct spi_controller *spi = cb; spi->cur_msg->status = -EIO; if (result->result != DMA_TRANS_NOERROR) { @@ -454,7 +454,7 @@ spi_gsi_callback_result(void *cb, const struct dmaengine_result *result) } static int setup_gsi_xfer(struct spi_transfer *xfer, struct spi_geni_master *mas, - struct spi_device *spi_slv, struct spi_master *spi) + struct spi_device *spi_slv, struct spi_controller *spi) { unsigned long flags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK; struct dma_slave_config config = {}; @@ -560,14 +560,14 @@ static u32 get_xfer_len_in_words(struct spi_transfer *xfer, static bool geni_can_dma(struct spi_controller *ctlr, struct spi_device *slv, struct spi_transfer *xfer) { - struct spi_geni_master *mas = spi_master_get_devdata(slv->master); + struct spi_geni_master *mas = spi_controller_get_devdata(slv->controller); u32 len, fifo_size; if (mas->cur_xfer_mode == GENI_GPI_DMA) return true; - /* Set SE DMA mode for SPI slave. */ - if (ctlr->slave) + /* Set SE DMA mode for SPI target. */ + if (ctlr->target) return true; len = get_xfer_len_in_words(xfer, mas); @@ -579,10 +579,10 @@ static bool geni_can_dma(struct spi_controller *ctlr, return false; } -static int spi_geni_prepare_message(struct spi_master *spi, - struct spi_message *spi_msg) +static int spi_geni_prepare_message(struct spi_controller *spi, + struct spi_message *spi_msg) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); int ret; switch (mas->cur_xfer_mode) { @@ -657,7 +657,7 @@ static int spi_geni_init(struct spi_geni_master *mas) proto = geni_se_read_proto(se); - if (spi->slave) { + if (spi->target) { if (proto != GENI_SE_SPI_SLAVE) { dev_err(mas->dev, "Invalid proto %d\n", proto); goto out_pm; @@ -715,7 +715,7 @@ static int spi_geni_init(struct spi_geni_master *mas) } /* We always control CS manually */ - if (!spi->slave) { + if (!spi->target) { spi_tx_cfg = readl(se->base + SE_SPI_TRANS_CFG); spi_tx_cfg &= ~CS_TOGGLE; writel(spi_tx_cfg, se->base + SE_SPI_TRANS_CFG); @@ -824,7 +824,7 @@ static void geni_spi_handle_rx(struct spi_geni_master *mas) static int setup_se_xfer(struct spi_transfer *xfer, struct spi_geni_master *mas, - u16 mode, struct spi_master *spi) + u16 mode, struct spi_controller *spi) { u32 m_cmd = 0; u32 len; @@ -913,11 +913,11 @@ static int setup_se_xfer(struct spi_transfer *xfer, return ret; } -static int spi_geni_transfer_one(struct spi_master *spi, - struct spi_device *slv, - struct spi_transfer *xfer) +static int spi_geni_transfer_one(struct spi_controller *spi, + struct spi_device *slv, + struct spi_transfer *xfer) { - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); int ret; if (spi_geni_is_abort_still_pending(mas)) @@ -939,8 +939,8 @@ static int spi_geni_transfer_one(struct spi_master *spi, static irqreturn_t geni_spi_isr(int irq, void *data) { - struct spi_master *spi = data; - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_controller *spi = data; + struct spi_geni_master *mas = spi_controller_get_devdata(spi); struct geni_se *se = &mas->se; u32 m_irq; @@ -1042,7 +1042,7 @@ static irqreturn_t geni_spi_isr(int irq, void *data) static int spi_geni_probe(struct platform_device *pdev) { int ret, irq; - struct spi_master *spi; + struct spi_controller *spi; struct spi_geni_master *mas; void __iomem *base; struct clk *clk; @@ -1064,12 +1064,12 @@ static int spi_geni_probe(struct platform_device *pdev) if (IS_ERR(clk)) return PTR_ERR(clk); - spi = devm_spi_alloc_master(dev, sizeof(*mas)); + spi = devm_spi_alloc_host(dev, sizeof(*mas)); if (!spi) return -ENOMEM; platform_set_drvdata(pdev, spi); - mas = spi_master_get_devdata(spi); + mas = spi_controller_get_devdata(spi); mas->irq = irq; mas->dev = dev; mas->se.dev = dev; @@ -1113,7 +1113,7 @@ static int spi_geni_probe(struct platform_device *pdev) pm_runtime_enable(dev); if (device_property_read_bool(&pdev->dev, "spi-slave")) - spi->slave = true; + spi->target = true; ret = geni_icc_get(&mas->se, NULL); if (ret) @@ -1135,7 +1135,7 @@ static int spi_geni_probe(struct platform_device *pdev) * for dma (gsi) mode, the gsi will set cs based on params passed in * TRE */ - if (!spi->slave && mas->cur_xfer_mode == GENI_SE_FIFO) + if (!spi->target && mas->cur_xfer_mode == GENI_SE_FIFO) spi->set_cs = spi_geni_set_cs; /* @@ -1148,7 +1148,7 @@ static int spi_geni_probe(struct platform_device *pdev) if (ret) goto spi_geni_release_dma; - ret = spi_register_master(spi); + ret = spi_register_controller(spi); if (ret) goto spi_geni_probe_free_irq; @@ -1164,11 +1164,11 @@ spi_geni_probe_runtime_disable: static void spi_geni_remove(struct platform_device *pdev) { - struct spi_master *spi = platform_get_drvdata(pdev); - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_controller *spi = platform_get_drvdata(pdev); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); /* Unregister _before_ disabling pm_runtime() so we stop transfers */ - spi_unregister_master(spi); + spi_unregister_controller(spi); spi_geni_release_dma_chan(mas); @@ -1178,8 +1178,8 @@ static void spi_geni_remove(struct platform_device *pdev) static int __maybe_unused spi_geni_runtime_suspend(struct device *dev) { - struct spi_master *spi = dev_get_drvdata(dev); - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_controller *spi = dev_get_drvdata(dev); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); int ret; /* Drop the performance state vote */ @@ -1194,8 +1194,8 @@ static int __maybe_unused spi_geni_runtime_suspend(struct device *dev) static int __maybe_unused spi_geni_runtime_resume(struct device *dev) { - struct spi_master *spi = dev_get_drvdata(dev); - struct spi_geni_master *mas = spi_master_get_devdata(spi); + struct spi_controller *spi = dev_get_drvdata(dev); + struct spi_geni_master *mas = spi_controller_get_devdata(spi); int ret; ret = geni_icc_enable(&mas->se); @@ -1211,30 +1211,30 @@ static int __maybe_unused spi_geni_runtime_resume(struct device *dev) static int __maybe_unused spi_geni_suspend(struct device *dev) { - struct spi_master *spi = dev_get_drvdata(dev); + struct spi_controller *spi = dev_get_drvdata(dev); int ret; - ret = spi_master_suspend(spi); + ret = spi_controller_suspend(spi); if (ret) return ret; ret = pm_runtime_force_suspend(dev); if (ret) - spi_master_resume(spi); + spi_controller_resume(spi); return ret; } static int __maybe_unused spi_geni_resume(struct device *dev) { - struct spi_master *spi = dev_get_drvdata(dev); + struct spi_controller *spi = dev_get_drvdata(dev); int ret; ret = pm_runtime_force_resume(dev); if (ret) return ret; - ret = spi_master_resume(spi); + ret = spi_controller_resume(spi); if (ret) pm_runtime_force_suspend(dev); From fe2e1c2225986b49988189ecd42dc233c10f237f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:16 +0800 Subject: [PATCH 0627/1562] spi: tegra114: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-12-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra114.c | 118 ++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c index 460f232dad50..bc7cc4088eea 100644 --- a/drivers/spi/spi-tegra114.c +++ b/drivers/spi/spi-tegra114.c @@ -164,7 +164,7 @@ struct tegra_spi_client_data { struct tegra_spi_data { struct device *dev; - struct spi_master *master; + struct spi_controller *host; spinlock_t lock; struct clk *clk; @@ -718,7 +718,7 @@ static void tegra_spi_deinit_dma_param(struct tegra_spi_data *tspi, static int tegra_spi_set_hw_cs_timing(struct spi_device *spi) { - struct tegra_spi_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(spi->controller); struct spi_delay *setup = &spi->cs_setup; struct spi_delay *hold = &spi->cs_hold; struct spi_delay *inactive = &spi->cs_inactive; @@ -772,7 +772,7 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi, bool is_first_of_msg, bool is_single_xfer) { - struct tegra_spi_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(spi->controller); struct tegra_spi_client_data *cdata = spi->controller_data; u32 speed = t->speed_hz; u8 bits_per_word = t->bits_per_word; @@ -865,7 +865,7 @@ static u32 tegra_spi_setup_transfer_one(struct spi_device *spi, static int tegra_spi_start_transfer_one(struct spi_device *spi, struct spi_transfer *t, u32 command1) { - struct tegra_spi_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(spi->controller); unsigned total_fifo_words; int ret; @@ -912,10 +912,10 @@ static struct tegra_spi_client_data *tegra_spi_parse_cdata_dt(struct spi_device *spi) { struct tegra_spi_client_data *cdata; - struct device_node *slave_np; + struct device_node *target_np; - slave_np = spi->dev.of_node; - if (!slave_np) { + target_np = spi->dev.of_node; + if (!target_np) { dev_dbg(&spi->dev, "device node not found\n"); return NULL; } @@ -924,9 +924,9 @@ static struct tegra_spi_client_data if (!cdata) return NULL; - of_property_read_u32(slave_np, "nvidia,tx-clk-tap-delay", + of_property_read_u32(target_np, "nvidia,tx-clk-tap-delay", &cdata->tx_clk_tap_delay); - of_property_read_u32(slave_np, "nvidia,rx-clk-tap-delay", + of_property_read_u32(target_np, "nvidia,rx-clk-tap-delay", &cdata->rx_clk_tap_delay); return cdata; } @@ -942,7 +942,7 @@ static void tegra_spi_cleanup(struct spi_device *spi) static int tegra_spi_setup(struct spi_device *spi) { - struct tegra_spi_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(spi->controller); struct tegra_spi_client_data *cdata = spi->controller_data; u32 val; unsigned long flags; @@ -993,7 +993,7 @@ static int tegra_spi_setup(struct spi_device *spi) static void tegra_spi_transfer_end(struct spi_device *spi) { - struct tegra_spi_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(spi->controller); int cs_val = (spi->mode & SPI_CS_HIGH) ? 0 : 1; /* GPIO based chip select control */ @@ -1025,11 +1025,11 @@ static void tegra_spi_dump_regs(struct tegra_spi_data *tspi) tegra_spi_readl(tspi, SPI_FIFO_STATUS)); } -static int tegra_spi_transfer_one_message(struct spi_master *master, +static int tegra_spi_transfer_one_message(struct spi_controller *host, struct spi_message *msg) { bool is_first_msg = true; - struct tegra_spi_data *tspi = spi_master_get_devdata(master); + struct tegra_spi_data *tspi = spi_controller_get_devdata(host); struct spi_transfer *xfer; struct spi_device *spi = msg->spi; int ret; @@ -1078,7 +1078,7 @@ static int tegra_spi_transfer_one_message(struct spi_master *master, reset_control_assert(tspi->rst); udelay(2); reset_control_deassert(tspi->rst); - tspi->last_used_cs = master->num_chipselect + 1; + tspi->last_used_cs = host->num_chipselect + 1; goto complete_xfer; } @@ -1112,7 +1112,7 @@ complete_xfer: ret = 0; exit: msg->status = ret; - spi_finalize_current_message(master); + spi_finalize_current_message(host); return ret; } @@ -1293,40 +1293,40 @@ MODULE_DEVICE_TABLE(of, tegra_spi_of_match); static int tegra_spi_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct tegra_spi_data *tspi; struct resource *r; int ret, spi_irq; int bus_num; - master = spi_alloc_master(&pdev->dev, sizeof(*tspi)); - if (!master) { - dev_err(&pdev->dev, "master allocation failed\n"); + host = spi_alloc_host(&pdev->dev, sizeof(*tspi)); + if (!host) { + dev_err(&pdev->dev, "host allocation failed\n"); return -ENOMEM; } - platform_set_drvdata(pdev, master); - tspi = spi_master_get_devdata(master); + platform_set_drvdata(pdev, host); + tspi = spi_controller_get_devdata(host); if (of_property_read_u32(pdev->dev.of_node, "spi-max-frequency", - &master->max_speed_hz)) - master->max_speed_hz = 25000000; /* 25MHz */ + &host->max_speed_hz)) + host->max_speed_hz = 25000000; /* 25MHz */ /* the spi->mode bits understood by this driver: */ - master->use_gpio_descriptors = true; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST | - SPI_TX_DUAL | SPI_RX_DUAL | SPI_3WIRE; - master->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 32); - master->setup = tegra_spi_setup; - master->cleanup = tegra_spi_cleanup; - master->transfer_one_message = tegra_spi_transfer_one_message; - master->set_cs_timing = tegra_spi_set_hw_cs_timing; - master->num_chipselect = MAX_CHIP_SELECT; - master->auto_runtime_pm = true; + host->use_gpio_descriptors = true; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST | + SPI_TX_DUAL | SPI_RX_DUAL | SPI_3WIRE; + host->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 32); + host->setup = tegra_spi_setup; + host->cleanup = tegra_spi_cleanup; + host->transfer_one_message = tegra_spi_transfer_one_message; + host->set_cs_timing = tegra_spi_set_hw_cs_timing; + host->num_chipselect = MAX_CHIP_SELECT; + host->auto_runtime_pm = true; bus_num = of_alias_get_id(pdev->dev.of_node, "spi"); if (bus_num >= 0) - master->bus_num = bus_num; + host->bus_num = bus_num; - tspi->master = master; + tspi->host = host; tspi->dev = &pdev->dev; spin_lock_init(&tspi->lock); @@ -1334,20 +1334,20 @@ static int tegra_spi_probe(struct platform_device *pdev) if (!tspi->soc_data) { dev_err(&pdev->dev, "unsupported tegra\n"); ret = -ENODEV; - goto exit_free_master; + goto exit_free_host; } tspi->base = devm_platform_get_and_ioremap_resource(pdev, 0, &r); if (IS_ERR(tspi->base)) { ret = PTR_ERR(tspi->base); - goto exit_free_master; + goto exit_free_host; } tspi->phys = r->start; spi_irq = platform_get_irq(pdev, 0); if (spi_irq < 0) { ret = spi_irq; - goto exit_free_master; + goto exit_free_host; } tspi->irq = spi_irq; @@ -1355,14 +1355,14 @@ static int tegra_spi_probe(struct platform_device *pdev) if (IS_ERR(tspi->clk)) { dev_err(&pdev->dev, "can not get clock\n"); ret = PTR_ERR(tspi->clk); - goto exit_free_master; + goto exit_free_host; } tspi->rst = devm_reset_control_get_exclusive(&pdev->dev, "spi"); if (IS_ERR(tspi->rst)) { dev_err(&pdev->dev, "can not get reset\n"); ret = PTR_ERR(tspi->rst); - goto exit_free_master; + goto exit_free_host; } tspi->max_buf_size = SPI_FIFO_DEPTH << 2; @@ -1370,7 +1370,7 @@ static int tegra_spi_probe(struct platform_device *pdev) ret = tegra_spi_init_dma_param(tspi, true); if (ret < 0) - goto exit_free_master; + goto exit_free_host; ret = tegra_spi_init_dma_param(tspi, false); if (ret < 0) goto exit_rx_dma_free; @@ -1401,7 +1401,7 @@ static int tegra_spi_probe(struct platform_device *pdev) tspi->spi_cs_timing1 = tegra_spi_readl(tspi, SPI_CS_TIMING1); tspi->spi_cs_timing2 = tegra_spi_readl(tspi, SPI_CS_TIMING2); tspi->def_command2_reg = tegra_spi_readl(tspi, SPI_COMMAND2); - tspi->last_used_cs = master->num_chipselect + 1; + tspi->last_used_cs = host->num_chipselect + 1; pm_runtime_put(&pdev->dev); ret = request_threaded_irq(tspi->irq, tegra_spi_isr, tegra_spi_isr_thread, IRQF_ONESHOT, @@ -1412,10 +1412,10 @@ static int tegra_spi_probe(struct platform_device *pdev) goto exit_pm_disable; } - master->dev.of_node = pdev->dev.of_node; - ret = devm_spi_register_master(&pdev->dev, master); + host->dev.of_node = pdev->dev.of_node; + ret = devm_spi_register_controller(&pdev->dev, host); if (ret < 0) { - dev_err(&pdev->dev, "can not register to master err %d\n", ret); + dev_err(&pdev->dev, "can not register to host err %d\n", ret); goto exit_free_irq; } return ret; @@ -1429,15 +1429,15 @@ exit_pm_disable: tegra_spi_deinit_dma_param(tspi, false); exit_rx_dma_free: tegra_spi_deinit_dma_param(tspi, true); -exit_free_master: - spi_master_put(master); +exit_free_host: + spi_controller_put(host); return ret; } static void tegra_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct tegra_spi_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct tegra_spi_data *tspi = spi_controller_get_devdata(host); free_irq(tspi->irq, tspi); @@ -1455,15 +1455,15 @@ static void tegra_spi_remove(struct platform_device *pdev) #ifdef CONFIG_PM_SLEEP static int tegra_spi_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); - return spi_master_suspend(master); + return spi_controller_suspend(host); } static int tegra_spi_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_spi_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_spi_data *tspi = spi_controller_get_devdata(host); int ret; ret = pm_runtime_resume_and_get(dev); @@ -1473,17 +1473,17 @@ static int tegra_spi_resume(struct device *dev) } tegra_spi_writel(tspi, tspi->command1_reg, SPI_COMMAND1); tegra_spi_writel(tspi, tspi->def_command2_reg, SPI_COMMAND2); - tspi->last_used_cs = master->num_chipselect + 1; + tspi->last_used_cs = host->num_chipselect + 1; pm_runtime_put(dev); - return spi_master_resume(master); + return spi_controller_resume(host); } #endif static int tegra_spi_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_spi_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_spi_data *tspi = spi_controller_get_devdata(host); /* Flush all write which are in PPSB queue by reading back */ tegra_spi_readl(tspi, SPI_COMMAND1); @@ -1494,8 +1494,8 @@ static int tegra_spi_runtime_suspend(struct device *dev) static int tegra_spi_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_spi_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_spi_data *tspi = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(tspi->clk); From 5ee8cd26d8ebd889c270a6851824b7aeec38f3a8 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:17 +0800 Subject: [PATCH 0628/1562] spi: tegra20-sflash: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-13-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra20-sflash.c | 74 ++++++++++++++++---------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/drivers/spi/spi-tegra20-sflash.c b/drivers/spi/spi-tegra20-sflash.c index 0c5507473f97..9f6b9f89be5b 100644 --- a/drivers/spi/spi-tegra20-sflash.c +++ b/drivers/spi/spi-tegra20-sflash.c @@ -102,7 +102,7 @@ struct tegra_sflash_data { struct device *dev; - struct spi_master *master; + struct spi_controller *host; spinlock_t lock; struct clk *clk; @@ -251,7 +251,7 @@ static int tegra_sflash_start_transfer_one(struct spi_device *spi, struct spi_transfer *t, bool is_first_of_msg, bool is_single_xfer) { - struct tegra_sflash_data *tsd = spi_master_get_devdata(spi->master); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(spi->controller); u32 speed; u32 command; @@ -303,12 +303,12 @@ static int tegra_sflash_start_transfer_one(struct spi_device *spi, return tegra_sflash_start_cpu_based_transfer(tsd, t); } -static int tegra_sflash_transfer_one_message(struct spi_master *master, +static int tegra_sflash_transfer_one_message(struct spi_controller *host, struct spi_message *msg) { bool is_first_msg = true; int single_xfer; - struct tegra_sflash_data *tsd = spi_master_get_devdata(master); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(host); struct spi_transfer *xfer; struct spi_device *spi = msg->spi; int ret; @@ -351,7 +351,7 @@ static int tegra_sflash_transfer_one_message(struct spi_master *master, exit: tegra_sflash_writel(tsd, tsd->def_command_reg, SPI_COMMAND); msg->status = ret; - spi_finalize_current_message(master); + spi_finalize_current_message(host); return ret; } @@ -416,7 +416,7 @@ MODULE_DEVICE_TABLE(of, tegra_sflash_of_match); static int tegra_sflash_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct tegra_sflash_data *tsd; int ret; const struct of_device_id *match; @@ -427,37 +427,37 @@ static int tegra_sflash_probe(struct platform_device *pdev) return -ENODEV; } - master = spi_alloc_master(&pdev->dev, sizeof(*tsd)); - if (!master) { - dev_err(&pdev->dev, "master allocation failed\n"); + host = spi_alloc_host(&pdev->dev, sizeof(*tsd)); + if (!host) { + dev_err(&pdev->dev, "host allocation failed\n"); return -ENOMEM; } /* the spi->mode bits understood by this driver: */ - master->mode_bits = SPI_CPOL | SPI_CPHA; - master->transfer_one_message = tegra_sflash_transfer_one_message; - master->auto_runtime_pm = true; - master->num_chipselect = MAX_CHIP_SELECT; + host->mode_bits = SPI_CPOL | SPI_CPHA; + host->transfer_one_message = tegra_sflash_transfer_one_message; + host->auto_runtime_pm = true; + host->num_chipselect = MAX_CHIP_SELECT; - platform_set_drvdata(pdev, master); - tsd = spi_master_get_devdata(master); - tsd->master = master; + platform_set_drvdata(pdev, host); + tsd = spi_controller_get_devdata(host); + tsd->host = host; tsd->dev = &pdev->dev; spin_lock_init(&tsd->lock); if (of_property_read_u32(tsd->dev->of_node, "spi-max-frequency", - &master->max_speed_hz)) - master->max_speed_hz = 25000000; /* 25MHz */ + &host->max_speed_hz)) + host->max_speed_hz = 25000000; /* 25MHz */ tsd->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(tsd->base)) { ret = PTR_ERR(tsd->base); - goto exit_free_master; + goto exit_free_host; } ret = platform_get_irq(pdev, 0); if (ret < 0) - goto exit_free_master; + goto exit_free_host; tsd->irq = ret; ret = request_irq(tsd->irq, tegra_sflash_isr, 0, @@ -465,7 +465,7 @@ static int tegra_sflash_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "Failed to register ISR for IRQ %d\n", tsd->irq); - goto exit_free_master; + goto exit_free_host; } tsd->clk = devm_clk_get(&pdev->dev, NULL); @@ -505,10 +505,10 @@ static int tegra_sflash_probe(struct platform_device *pdev) tegra_sflash_writel(tsd, tsd->def_command_reg, SPI_COMMAND); pm_runtime_put(&pdev->dev); - master->dev.of_node = pdev->dev.of_node; - ret = devm_spi_register_master(&pdev->dev, master); + host->dev.of_node = pdev->dev.of_node; + ret = devm_spi_register_controller(&pdev->dev, host); if (ret < 0) { - dev_err(&pdev->dev, "can not register to master err %d\n", ret); + dev_err(&pdev->dev, "can not register to host err %d\n", ret); goto exit_pm_disable; } return ret; @@ -519,15 +519,15 @@ exit_pm_disable: tegra_sflash_runtime_suspend(&pdev->dev); exit_free_irq: free_irq(tsd->irq, tsd); -exit_free_master: - spi_master_put(master); +exit_free_host: + spi_controller_put(host); return ret; } static void tegra_sflash_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct tegra_sflash_data *tsd = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(host); free_irq(tsd->irq, tsd); @@ -539,15 +539,15 @@ static void tegra_sflash_remove(struct platform_device *pdev) #ifdef CONFIG_PM_SLEEP static int tegra_sflash_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); - return spi_master_suspend(master); + return spi_controller_suspend(host); } static int tegra_sflash_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_sflash_data *tsd = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(host); int ret; ret = pm_runtime_resume_and_get(dev); @@ -558,14 +558,14 @@ static int tegra_sflash_resume(struct device *dev) tegra_sflash_writel(tsd, tsd->command_reg, SPI_COMMAND); pm_runtime_put(dev); - return spi_master_resume(master); + return spi_controller_resume(host); } #endif static int tegra_sflash_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_sflash_data *tsd = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(host); /* Flush all write which are in PPSB queue by reading back */ tegra_sflash_readl(tsd, SPI_COMMAND); @@ -576,8 +576,8 @@ static int tegra_sflash_runtime_suspend(struct device *dev) static int tegra_sflash_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_sflash_data *tsd = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_sflash_data *tsd = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(tsd->clk); From db34aad4d61b0034c896a7abb481b32fcdcd8332 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:18 +0800 Subject: [PATCH 0629/1562] spi: tegra20-slink: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-14-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra20-slink.c | 96 ++++++++++++++++----------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index f5cd365c913a..ed1393d159ae 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -152,7 +152,7 @@ struct tegra_slink_chip_data { struct tegra_slink_data { struct device *dev; - struct spi_master *master; + struct spi_controller *host; const struct tegra_slink_chip_data *chip_data; spinlock_t lock; @@ -671,7 +671,7 @@ static void tegra_slink_deinit_dma_param(struct tegra_slink_data *tspi, static int tegra_slink_start_transfer_one(struct spi_device *spi, struct spi_transfer *t) { - struct tegra_slink_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_slink_data *tspi = spi_controller_get_devdata(spi->controller); u32 speed; u8 bits_per_word; unsigned total_fifo_words; @@ -737,7 +737,7 @@ static int tegra_slink_setup(struct spi_device *spi) SLINK_CS_POLARITY3, }; - struct tegra_slink_data *tspi = spi_master_get_devdata(spi->master); + struct tegra_slink_data *tspi = spi_controller_get_devdata(spi->controller); u32 val; unsigned long flags; int ret; @@ -768,10 +768,10 @@ static int tegra_slink_setup(struct spi_device *spi) return 0; } -static int tegra_slink_prepare_message(struct spi_master *master, +static int tegra_slink_prepare_message(struct spi_controller *host, struct spi_message *msg) { - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); struct spi_device *spi = msg->spi; tegra_slink_clear_status(tspi); @@ -794,11 +794,11 @@ static int tegra_slink_prepare_message(struct spi_master *master, return 0; } -static int tegra_slink_transfer_one(struct spi_master *master, +static int tegra_slink_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *xfer) { - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); int ret; reinit_completion(&tspi->xfer_completion); @@ -825,10 +825,10 @@ static int tegra_slink_transfer_one(struct spi_master *master, return 0; } -static int tegra_slink_unprepare_message(struct spi_master *master, +static int tegra_slink_unprepare_message(struct spi_controller *host, struct spi_message *msg) { - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); tegra_slink_writel(tspi, tspi->def_command_reg, SLINK_COMMAND); tegra_slink_writel(tspi, tspi->def_command2_reg, SLINK_COMMAND2); @@ -999,7 +999,7 @@ MODULE_DEVICE_TABLE(of, tegra_slink_of_match); static int tegra_slink_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct tegra_slink_data *tspi; struct resource *r; int ret, spi_irq; @@ -1007,36 +1007,36 @@ static int tegra_slink_probe(struct platform_device *pdev) cdata = of_device_get_match_data(&pdev->dev); - master = spi_alloc_master(&pdev->dev, sizeof(*tspi)); - if (!master) { - dev_err(&pdev->dev, "master allocation failed\n"); + host = spi_alloc_host(&pdev->dev, sizeof(*tspi)); + if (!host) { + dev_err(&pdev->dev, "host allocation failed\n"); return -ENOMEM; } /* the spi->mode bits understood by this driver: */ - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; - master->setup = tegra_slink_setup; - master->prepare_message = tegra_slink_prepare_message; - master->transfer_one = tegra_slink_transfer_one; - master->unprepare_message = tegra_slink_unprepare_message; - master->auto_runtime_pm = true; - master->num_chipselect = MAX_CHIP_SELECT; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + host->setup = tegra_slink_setup; + host->prepare_message = tegra_slink_prepare_message; + host->transfer_one = tegra_slink_transfer_one; + host->unprepare_message = tegra_slink_unprepare_message; + host->auto_runtime_pm = true; + host->num_chipselect = MAX_CHIP_SELECT; - platform_set_drvdata(pdev, master); - tspi = spi_master_get_devdata(master); - tspi->master = master; + platform_set_drvdata(pdev, host); + tspi = spi_controller_get_devdata(host); + tspi->host = host; tspi->dev = &pdev->dev; tspi->chip_data = cdata; spin_lock_init(&tspi->lock); if (of_property_read_u32(tspi->dev->of_node, "spi-max-frequency", - &master->max_speed_hz)) - master->max_speed_hz = 25000000; /* 25MHz */ + &host->max_speed_hz)) + host->max_speed_hz = 25000000; /* 25MHz */ tspi->base = devm_platform_get_and_ioremap_resource(pdev, 0, &r); if (IS_ERR(tspi->base)) { ret = PTR_ERR(tspi->base); - goto exit_free_master; + goto exit_free_host; } tspi->phys = r->start; @@ -1045,26 +1045,26 @@ static int tegra_slink_probe(struct platform_device *pdev) if (IS_ERR(tspi->clk)) { ret = PTR_ERR(tspi->clk); dev_err(&pdev->dev, "Can not get clock %d\n", ret); - goto exit_free_master; + goto exit_free_host; } tspi->rst = devm_reset_control_get_exclusive(&pdev->dev, "spi"); if (IS_ERR(tspi->rst)) { dev_err(&pdev->dev, "can not get reset\n"); ret = PTR_ERR(tspi->rst); - goto exit_free_master; + goto exit_free_host; } ret = devm_tegra_core_dev_init_opp_table_common(&pdev->dev); if (ret) - goto exit_free_master; + goto exit_free_host; tspi->max_buf_size = SLINK_FIFO_DEPTH << 2; tspi->dma_buf_size = DEFAULT_SPI_DMA_BUF_LEN; ret = tegra_slink_init_dma_param(tspi, true); if (ret < 0) - goto exit_free_master; + goto exit_free_host; ret = tegra_slink_init_dma_param(tspi, false); if (ret < 0) goto exit_rx_dma_free; @@ -1103,10 +1103,10 @@ static int tegra_slink_probe(struct platform_device *pdev) tegra_slink_writel(tspi, tspi->def_command_reg, SLINK_COMMAND); tegra_slink_writel(tspi, tspi->def_command2_reg, SLINK_COMMAND2); - master->dev.of_node = pdev->dev.of_node; - ret = spi_register_master(master); + host->dev.of_node = pdev->dev.of_node; + ret = spi_register_controller(host); if (ret < 0) { - dev_err(&pdev->dev, "can not register to master err %d\n", ret); + dev_err(&pdev->dev, "can not register to host err %d\n", ret); goto exit_free_irq; } @@ -1124,17 +1124,17 @@ exit_pm_disable: tegra_slink_deinit_dma_param(tspi, false); exit_rx_dma_free: tegra_slink_deinit_dma_param(tspi, true); -exit_free_master: - spi_master_put(master); +exit_free_host: + spi_controller_put(host); return ret; } static void tegra_slink_remove(struct platform_device *pdev) { - struct spi_master *master = spi_master_get(platform_get_drvdata(pdev)); - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = spi_controller_get(platform_get_drvdata(pdev)); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); - spi_unregister_master(master); + spi_unregister_controller(host); free_irq(tspi->irq, tspi); @@ -1146,21 +1146,21 @@ static void tegra_slink_remove(struct platform_device *pdev) if (tspi->rx_dma_chan) tegra_slink_deinit_dma_param(tspi, true); - spi_master_put(master); + spi_controller_put(host); } #ifdef CONFIG_PM_SLEEP static int tegra_slink_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); - return spi_master_suspend(master); + return spi_controller_suspend(host); } static int tegra_slink_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); int ret; ret = pm_runtime_resume_and_get(dev); @@ -1172,14 +1172,14 @@ static int tegra_slink_resume(struct device *dev) tegra_slink_writel(tspi, tspi->command2_reg, SLINK_COMMAND2); pm_runtime_put(dev); - return spi_master_resume(master); + return spi_controller_resume(host); } #endif static int __maybe_unused tegra_slink_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); /* Flush all write which are in PPSB queue by reading back */ tegra_slink_readl(tspi, SLINK_MAS_DATA); @@ -1190,8 +1190,8 @@ static int __maybe_unused tegra_slink_runtime_suspend(struct device *dev) static int __maybe_unused tegra_slink_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_slink_data *tspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_slink_data *tspi = spi_controller_get_devdata(host); int ret; ret = clk_prepare_enable(tspi->clk); From 767e45324bf8fbbaa5463a692ad697226425d28b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:19 +0800 Subject: [PATCH 0630/1562] spi: tegra210-quad: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-15-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra210-quad.c | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/spi/spi-tegra210-quad.c b/drivers/spi/spi-tegra210-quad.c index e9ad9b0b598b..afbd64a217eb 100644 --- a/drivers/spi/spi-tegra210-quad.c +++ b/drivers/spi/spi-tegra210-quad.c @@ -175,7 +175,7 @@ struct tegra_qspi_client_data { struct tegra_qspi { struct device *dev; - struct spi_master *master; + struct spi_controller *host; /* lock to protect data accessed by irq */ spinlock_t lock; @@ -809,7 +809,7 @@ err_out: static u32 tegra_qspi_setup_transfer_one(struct spi_device *spi, struct spi_transfer *t, bool is_first_of_msg) { - struct tegra_qspi *tqspi = spi_master_get_devdata(spi->master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(spi->controller); struct tegra_qspi_client_data *cdata = spi->controller_data; u32 command1, command2, speed = t->speed_hz; u8 bits_per_word = t->bits_per_word; @@ -870,7 +870,7 @@ static u32 tegra_qspi_setup_transfer_one(struct spi_device *spi, struct spi_tran static int tegra_qspi_start_transfer_one(struct spi_device *spi, struct spi_transfer *t, u32 command1) { - struct tegra_qspi *tqspi = spi_master_get_devdata(spi->master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(spi->controller); unsigned int total_fifo_words; u8 bus_width = 0; int ret; @@ -925,7 +925,7 @@ static int tegra_qspi_start_transfer_one(struct spi_device *spi, static struct tegra_qspi_client_data *tegra_qspi_parse_cdata_dt(struct spi_device *spi) { struct tegra_qspi_client_data *cdata; - struct tegra_qspi *tqspi = spi_master_get_devdata(spi->master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(spi->controller); cdata = devm_kzalloc(tqspi->dev, sizeof(*cdata), GFP_KERNEL); if (!cdata) @@ -941,7 +941,7 @@ static struct tegra_qspi_client_data *tegra_qspi_parse_cdata_dt(struct spi_devic static int tegra_qspi_setup(struct spi_device *spi) { - struct tegra_qspi *tqspi = spi_master_get_devdata(spi->master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(spi->controller); struct tegra_qspi_client_data *cdata = spi->controller_data; unsigned long flags; u32 val; @@ -1005,7 +1005,7 @@ static void tegra_qspi_handle_error(struct tegra_qspi *tqspi) static void tegra_qspi_transfer_end(struct spi_device *spi) { - struct tegra_qspi *tqspi = spi_master_get_devdata(spi->master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(spi->controller); int cs_val = (spi->mode & SPI_CS_HIGH) ? 0 : 1; if (cs_val) @@ -1316,10 +1316,10 @@ static bool tegra_qspi_validate_cmb_seq(struct tegra_qspi *tqspi, return true; } -static int tegra_qspi_transfer_one_message(struct spi_master *master, +static int tegra_qspi_transfer_one_message(struct spi_controller *host, struct spi_message *msg) { - struct tegra_qspi *tqspi = spi_master_get_devdata(master); + struct tegra_qspi *tqspi = spi_controller_get_devdata(host); int ret; if (tegra_qspi_validate_cmb_seq(tqspi, msg)) @@ -1327,7 +1327,7 @@ static int tegra_qspi_transfer_one_message(struct spi_master *master, else ret = tegra_qspi_non_combined_seq_xfer(tqspi, msg); - spi_finalize_current_message(master); + spi_finalize_current_message(host); return ret; } @@ -1533,38 +1533,38 @@ MODULE_DEVICE_TABLE(acpi, tegra_qspi_acpi_match); static int tegra_qspi_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct tegra_qspi *tqspi; struct resource *r; int ret, qspi_irq; int bus_num; - master = devm_spi_alloc_master(&pdev->dev, sizeof(*tqspi)); - if (!master) + host = devm_spi_alloc_host(&pdev->dev, sizeof(*tqspi)); + if (!host) return -ENOMEM; - platform_set_drvdata(pdev, master); - tqspi = spi_master_get_devdata(master); + platform_set_drvdata(pdev, host); + tqspi = spi_controller_get_devdata(host); - master->mode_bits = SPI_MODE_0 | SPI_MODE_3 | SPI_CS_HIGH | - SPI_TX_DUAL | SPI_RX_DUAL | SPI_TX_QUAD | SPI_RX_QUAD; - master->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) | SPI_BPW_MASK(8); - master->flags = SPI_CONTROLLER_HALF_DUPLEX; - master->setup = tegra_qspi_setup; - master->transfer_one_message = tegra_qspi_transfer_one_message; - master->num_chipselect = 1; - master->auto_runtime_pm = true; + host->mode_bits = SPI_MODE_0 | SPI_MODE_3 | SPI_CS_HIGH | + SPI_TX_DUAL | SPI_RX_DUAL | SPI_TX_QUAD | SPI_RX_QUAD; + host->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) | SPI_BPW_MASK(8); + host->flags = SPI_CONTROLLER_HALF_DUPLEX; + host->setup = tegra_qspi_setup; + host->transfer_one_message = tegra_qspi_transfer_one_message; + host->num_chipselect = 1; + host->auto_runtime_pm = true; bus_num = of_alias_get_id(pdev->dev.of_node, "spi"); if (bus_num >= 0) - master->bus_num = bus_num; + host->bus_num = bus_num; - tqspi->master = master; + tqspi->host = host; tqspi->dev = &pdev->dev; spin_lock_init(&tqspi->lock); tqspi->soc_data = device_get_match_data(&pdev->dev); - master->num_chipselect = tqspi->soc_data->cs_count; + host->num_chipselect = tqspi->soc_data->cs_count; tqspi->base = devm_platform_get_and_ioremap_resource(pdev, 0, &r); if (IS_ERR(tqspi->base)) return PTR_ERR(tqspi->base); @@ -1625,10 +1625,10 @@ static int tegra_qspi_probe(struct platform_device *pdev) goto exit_pm_disable; } - master->dev.of_node = pdev->dev.of_node; - ret = spi_register_master(master); + host->dev.of_node = pdev->dev.of_node; + ret = spi_register_controller(host); if (ret < 0) { - dev_err(&pdev->dev, "failed to register master: %d\n", ret); + dev_err(&pdev->dev, "failed to register host: %d\n", ret); goto exit_free_irq; } @@ -1644,10 +1644,10 @@ exit_pm_disable: static void tegra_qspi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct tegra_qspi *tqspi = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct tegra_qspi *tqspi = spi_controller_get_devdata(host); - spi_unregister_master(master); + spi_unregister_controller(host); free_irq(tqspi->irq, tqspi); pm_runtime_force_suspend(&pdev->dev); tegra_qspi_deinit_dma(tqspi); @@ -1655,15 +1655,15 @@ static void tegra_qspi_remove(struct platform_device *pdev) static int __maybe_unused tegra_qspi_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); + struct spi_controller *host = dev_get_drvdata(dev); - return spi_master_suspend(master); + return spi_controller_suspend(host); } static int __maybe_unused tegra_qspi_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_qspi *tqspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_qspi *tqspi = spi_controller_get_devdata(host); int ret; ret = pm_runtime_resume_and_get(dev); @@ -1676,13 +1676,13 @@ static int __maybe_unused tegra_qspi_resume(struct device *dev) tegra_qspi_writel(tqspi, tqspi->def_command2_reg, QSPI_COMMAND2); pm_runtime_put(dev); - return spi_master_resume(master); + return spi_controller_resume(host); } static int __maybe_unused tegra_qspi_runtime_suspend(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_qspi *tqspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_qspi *tqspi = spi_controller_get_devdata(host); /* Runtime pm disabled with ACPI */ if (has_acpi_companion(tqspi->dev)) @@ -1697,8 +1697,8 @@ static int __maybe_unused tegra_qspi_runtime_suspend(struct device *dev) static int __maybe_unused tegra_qspi_runtime_resume(struct device *dev) { - struct spi_master *master = dev_get_drvdata(dev); - struct tegra_qspi *tqspi = spi_master_get_devdata(master); + struct spi_controller *host = dev_get_drvdata(dev); + struct tegra_qspi *tqspi = spi_controller_get_devdata(host); int ret; /* Runtime pm disabled with ACPI */ From 9d93c8d97b4cdb5edddb4c5530881c90eecb7e44 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:20 +0800 Subject: [PATCH 0631/1562] spi: spi-ti-qspi: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-16-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-ti-qspi.c | 88 +++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c index a6a89c59c418..0fe6899e78dd 100644 --- a/drivers/spi/spi-ti-qspi.c +++ b/drivers/spi/spi-ti-qspi.c @@ -40,7 +40,7 @@ struct ti_qspi { /* list synchronization */ struct mutex list_lock; - struct spi_master *master; + struct spi_controller *host; void __iomem *base; void __iomem *mmap_base; size_t mmap_size; @@ -137,20 +137,20 @@ static inline void ti_qspi_write(struct ti_qspi *qspi, static int ti_qspi_setup(struct spi_device *spi) { - struct ti_qspi *qspi = spi_master_get_devdata(spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(spi->controller); int ret; - if (spi->master->busy) { - dev_dbg(qspi->dev, "master busy doing other transfers\n"); + if (spi->controller->busy) { + dev_dbg(qspi->dev, "host busy doing other transfers\n"); return -EBUSY; } - if (!qspi->master->max_speed_hz) { + if (!qspi->host->max_speed_hz) { dev_err(qspi->dev, "spi max frequency not defined\n"); return -EINVAL; } - spi->max_speed_hz = min(spi->max_speed_hz, qspi->master->max_speed_hz); + spi->max_speed_hz = min(spi->max_speed_hz, qspi->host->max_speed_hz); ret = pm_runtime_resume_and_get(qspi->dev); if (ret < 0) { @@ -526,7 +526,7 @@ static int ti_qspi_dma_xfer_sg(struct ti_qspi *qspi, struct sg_table rx_sg, static void ti_qspi_enable_memory_map(struct spi_device *spi) { - struct ti_qspi *qspi = spi_master_get_devdata(spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(spi->controller); ti_qspi_write(qspi, MM_SWITCH, QSPI_SPI_SWITCH_REG); if (qspi->ctrl_base) { @@ -540,7 +540,7 @@ static void ti_qspi_enable_memory_map(struct spi_device *spi) static void ti_qspi_disable_memory_map(struct spi_device *spi) { - struct ti_qspi *qspi = spi_master_get_devdata(spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(spi->controller); ti_qspi_write(qspi, 0, QSPI_SPI_SWITCH_REG); if (qspi->ctrl_base) @@ -554,7 +554,7 @@ static void ti_qspi_setup_mmap_read(struct spi_device *spi, u8 opcode, u8 data_nbits, u8 addr_width, u8 dummy_bytes) { - struct ti_qspi *qspi = spi_master_get_devdata(spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(spi->controller); u32 memval = opcode; switch (data_nbits) { @@ -576,7 +576,7 @@ static void ti_qspi_setup_mmap_read(struct spi_device *spi, u8 opcode, static int ti_qspi_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op) { - struct ti_qspi *qspi = spi_controller_get_devdata(mem->spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(mem->spi->controller); size_t max_len; if (op->data.dir == SPI_MEM_DATA_IN) { @@ -606,7 +606,7 @@ static int ti_qspi_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op) static int ti_qspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) { - struct ti_qspi *qspi = spi_master_get_devdata(mem->spi->master); + struct ti_qspi *qspi = spi_controller_get_devdata(mem->spi->controller); u32 from = 0; int ret = 0; @@ -633,10 +633,10 @@ static int ti_qspi_exec_mem_op(struct spi_mem *mem, struct sg_table sgt; if (virt_addr_valid(op->data.buf.in) && - !spi_controller_dma_map_mem_op_data(mem->spi->master, op, + !spi_controller_dma_map_mem_op_data(mem->spi->controller, op, &sgt)) { ret = ti_qspi_dma_xfer_sg(qspi, sgt, from); - spi_controller_dma_unmap_mem_op_data(mem->spi->master, + spi_controller_dma_unmap_mem_op_data(mem->spi->controller, op, &sgt); } else { ret = ti_qspi_dma_bounce_buffer(qspi, from, @@ -658,10 +658,10 @@ static const struct spi_controller_mem_ops ti_qspi_mem_ops = { .adjust_op_size = ti_qspi_adjust_op_size, }; -static int ti_qspi_start_transfer_one(struct spi_master *master, +static int ti_qspi_start_transfer_one(struct spi_controller *host, struct spi_message *m) { - struct ti_qspi *qspi = spi_master_get_devdata(master); + struct ti_qspi *qspi = spi_controller_get_devdata(host); struct spi_device *spi = m->spi; struct spi_transfer *t; int status = 0, ret; @@ -720,7 +720,7 @@ static int ti_qspi_start_transfer_one(struct spi_master *master, ti_qspi_write(qspi, qspi->cmd | QSPI_INVAL, QSPI_SPI_CMD_REG); m->status = status; - spi_finalize_current_message(master); + spi_finalize_current_message(host); return status; } @@ -756,33 +756,33 @@ MODULE_DEVICE_TABLE(of, ti_qspi_match); static int ti_qspi_probe(struct platform_device *pdev) { struct ti_qspi *qspi; - struct spi_master *master; + struct spi_controller *host; struct resource *r, *res_mmap; struct device_node *np = pdev->dev.of_node; u32 max_freq; int ret = 0, num_cs, irq; dma_cap_mask_t mask; - master = spi_alloc_master(&pdev->dev, sizeof(*qspi)); - if (!master) + host = spi_alloc_host(&pdev->dev, sizeof(*qspi)); + if (!host) return -ENOMEM; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_RX_DUAL | SPI_RX_QUAD; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_RX_DUAL | SPI_RX_QUAD; - master->flags = SPI_CONTROLLER_HALF_DUPLEX; - master->setup = ti_qspi_setup; - master->auto_runtime_pm = true; - master->transfer_one_message = ti_qspi_start_transfer_one; - master->dev.of_node = pdev->dev.of_node; - master->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) | - SPI_BPW_MASK(8); - master->mem_ops = &ti_qspi_mem_ops; + host->flags = SPI_CONTROLLER_HALF_DUPLEX; + host->setup = ti_qspi_setup; + host->auto_runtime_pm = true; + host->transfer_one_message = ti_qspi_start_transfer_one; + host->dev.of_node = pdev->dev.of_node; + host->bits_per_word_mask = SPI_BPW_MASK(32) | SPI_BPW_MASK(16) | + SPI_BPW_MASK(8); + host->mem_ops = &ti_qspi_mem_ops; if (!of_property_read_u32(np, "num-cs", &num_cs)) - master->num_chipselect = num_cs; + host->num_chipselect = num_cs; - qspi = spi_master_get_devdata(master); - qspi->master = master; + qspi = spi_controller_get_devdata(host); + qspi->host = host; qspi->dev = &pdev->dev; platform_set_drvdata(pdev, qspi); @@ -792,7 +792,7 @@ static int ti_qspi_probe(struct platform_device *pdev) if (r == NULL) { dev_err(&pdev->dev, "missing platform data\n"); ret = -ENODEV; - goto free_master; + goto free_host; } } @@ -812,7 +812,7 @@ static int ti_qspi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = irq; - goto free_master; + goto free_host; } mutex_init(&qspi->list_lock); @@ -820,7 +820,7 @@ static int ti_qspi_probe(struct platform_device *pdev) qspi->base = devm_ioremap_resource(&pdev->dev, r); if (IS_ERR(qspi->base)) { ret = PTR_ERR(qspi->base); - goto free_master; + goto free_host; } @@ -830,7 +830,7 @@ static int ti_qspi_probe(struct platform_device *pdev) "syscon-chipselects"); if (IS_ERR(qspi->ctrl_base)) { ret = PTR_ERR(qspi->ctrl_base); - goto free_master; + goto free_host; } ret = of_property_read_u32_index(np, "syscon-chipselects", @@ -838,7 +838,7 @@ static int ti_qspi_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "couldn't get ctrl_mod reg index\n"); - goto free_master; + goto free_host; } } @@ -853,7 +853,7 @@ static int ti_qspi_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); if (!of_property_read_u32(np, "spi-max-frequency", &max_freq)) - master->max_speed_hz = max_freq; + host->max_speed_hz = max_freq; dma_cap_zero(mask); dma_cap_set(DMA_MEMCPY, mask); @@ -876,7 +876,7 @@ static int ti_qspi_probe(struct platform_device *pdev) dma_release_channel(qspi->rx_chan); goto no_dma; } - master->dma_rx = qspi->rx_chan; + host->dma_rx = qspi->rx_chan; init_completion(&qspi->transfer_complete); if (res_mmap) qspi->mmap_phys_base = (dma_addr_t)res_mmap->start; @@ -889,21 +889,21 @@ no_dma: "mmap failed with error %ld using PIO mode\n", PTR_ERR(qspi->mmap_base)); qspi->mmap_base = NULL; - master->mem_ops = NULL; + host->mem_ops = NULL; } } qspi->mmap_enabled = false; qspi->current_cs = -1; - ret = devm_spi_register_master(&pdev->dev, master); + ret = devm_spi_register_controller(&pdev->dev, host); if (!ret) return 0; ti_qspi_dma_cleanup(qspi); pm_runtime_disable(&pdev->dev); -free_master: - spi_master_put(master); +free_host: + spi_controller_put(host); return ret; } @@ -912,9 +912,9 @@ static void ti_qspi_remove(struct platform_device *pdev) struct ti_qspi *qspi = platform_get_drvdata(pdev); int rc; - rc = spi_master_suspend(qspi->master); + rc = spi_controller_suspend(qspi->host); if (rc) { - dev_alert(&pdev->dev, "spi_master_suspend() failed (%pe)\n", + dev_alert(&pdev->dev, "spi_controller_suspend() failed (%pe)\n", ERR_PTR(rc)); return; } From d1d8b09d0a0a86fb785dbb0d69765fb98dde429c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:21 +0800 Subject: [PATCH 0632/1562] spi: wpcm-fiu: switch to use devm_spi_alloc_host() Switch to use modern name function devm_spi_alloc_host(). No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-17-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-wpcm-fiu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-wpcm-fiu.c b/drivers/spi/spi-wpcm-fiu.c index d76f7b5a9b97..6b16a22cc3a4 100644 --- a/drivers/spi/spi-wpcm-fiu.c +++ b/drivers/spi/spi-wpcm-fiu.c @@ -441,7 +441,7 @@ static int wpcm_fiu_probe(struct platform_device *pdev) struct wpcm_fiu_spi *fiu; struct resource *res; - ctrl = devm_spi_alloc_master(dev, sizeof(*fiu)); + ctrl = devm_spi_alloc_host(dev, sizeof(*fiu)); if (!ctrl) return -ENOMEM; From 40daed14705ee76b35717ffedc80a7f281023bca Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:22 +0800 Subject: [PATCH 0633/1562] spi: topcliff-pch: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-18-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-topcliff-pch.c | 226 ++++++++++++++++----------------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c index af5846cfe5e9..271f3e7f834b 100644 --- a/drivers/spi/spi-topcliff-pch.c +++ b/drivers/spi/spi-topcliff-pch.c @@ -124,7 +124,7 @@ struct pch_spi_dma_ctrl { * struct pch_spi_data - Holds the SPI channel specific details * @io_remap_addr: The remapped PCI base address * @io_base_addr: Base address - * @master: Pointer to the SPI master structure + * @host: Pointer to the SPI controller structure * @work: Reference to work queue handler * @wait: Wait queue for waking up upon receiving an * interrupt. @@ -161,7 +161,7 @@ struct pch_spi_dma_ctrl { struct pch_spi_data { void __iomem *io_remap_addr; unsigned long io_base_addr; - struct spi_master *master; + struct spi_controller *host; struct work_struct work; wait_queue_head_t wait; u8 transfer_complete; @@ -216,48 +216,48 @@ static const struct pci_device_id pch_spi_pcidev_id[] = { /** * pch_spi_writereg() - Performs register writes - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. * @idx: Register offset. * @val: Value to be written to register. */ -static inline void pch_spi_writereg(struct spi_master *master, int idx, u32 val) +static inline void pch_spi_writereg(struct spi_controller *host, int idx, u32 val) { - struct pch_spi_data *data = spi_master_get_devdata(master); + struct pch_spi_data *data = spi_controller_get_devdata(host); iowrite32(val, (data->io_remap_addr + idx)); } /** * pch_spi_readreg() - Performs register reads - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. * @idx: Register offset. */ -static inline u32 pch_spi_readreg(struct spi_master *master, int idx) +static inline u32 pch_spi_readreg(struct spi_controller *host, int idx) { - struct pch_spi_data *data = spi_master_get_devdata(master); + struct pch_spi_data *data = spi_controller_get_devdata(host); return ioread32(data->io_remap_addr + idx); } -static inline void pch_spi_setclr_reg(struct spi_master *master, int idx, +static inline void pch_spi_setclr_reg(struct spi_controller *host, int idx, u32 set, u32 clr) { - u32 tmp = pch_spi_readreg(master, idx); + u32 tmp = pch_spi_readreg(host, idx); tmp = (tmp & ~clr) | set; - pch_spi_writereg(master, idx, tmp); + pch_spi_writereg(host, idx, tmp); } -static void pch_spi_set_master_mode(struct spi_master *master) +static void pch_spi_set_host_mode(struct spi_controller *host) { - pch_spi_setclr_reg(master, PCH_SPCR, SPCR_MSTR_BIT, 0); + pch_spi_setclr_reg(host, PCH_SPCR, SPCR_MSTR_BIT, 0); } /** * pch_spi_clear_fifo() - Clears the Transmit and Receive FIFOs - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. */ -static void pch_spi_clear_fifo(struct spi_master *master) +static void pch_spi_clear_fifo(struct spi_controller *host) { - pch_spi_setclr_reg(master, PCH_SPCR, SPCR_FICLR_BIT, 0); - pch_spi_setclr_reg(master, PCH_SPCR, 0, SPCR_FICLR_BIT); + pch_spi_setclr_reg(host, PCH_SPCR, SPCR_FICLR_BIT, 0); + pch_spi_setclr_reg(host, PCH_SPCR, 0, SPCR_FICLR_BIT); } static void pch_spi_handler_sub(struct pch_spi_data *data, u32 reg_spsr_val, @@ -312,7 +312,7 @@ static void pch_spi_handler_sub(struct pch_spi_data *data, u32 reg_spsr_val, if (reg_spsr_val & SPSR_FI_BIT) { if ((tx_index == bpw_len) && (rx_index == tx_index)) { /* disable interrupts */ - pch_spi_setclr_reg(data->master, PCH_SPCR, 0, + pch_spi_setclr_reg(data->host, PCH_SPCR, 0, PCH_ALL); /* transfer is completed; @@ -321,7 +321,7 @@ static void pch_spi_handler_sub(struct pch_spi_data *data, u32 reg_spsr_val, data->transfer_active = false; wake_up(&data->wait); } else { - dev_vdbg(&data->master->dev, + dev_vdbg(&data->host->dev, "%s : Transfer is not completed", __func__); } @@ -383,10 +383,10 @@ static irqreturn_t pch_spi_handler(int irq, void *dev_id) /** * pch_spi_set_baud_rate() - Sets SPBR field in SPBRR - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. * @speed_hz: Baud rate. */ -static void pch_spi_set_baud_rate(struct spi_master *master, u32 speed_hz) +static void pch_spi_set_baud_rate(struct spi_controller *host, u32 speed_hz) { u32 n_spbr = PCH_CLOCK_HZ / (speed_hz * 2); @@ -394,21 +394,21 @@ static void pch_spi_set_baud_rate(struct spi_master *master, u32 speed_hz) if (n_spbr > PCH_MAX_SPBR) n_spbr = PCH_MAX_SPBR; - pch_spi_setclr_reg(master, PCH_SPBRR, n_spbr, MASK_SPBRR_SPBR_BITS); + pch_spi_setclr_reg(host, PCH_SPBRR, n_spbr, MASK_SPBRR_SPBR_BITS); } /** * pch_spi_set_bits_per_word() - Sets SIZE field in SPBRR - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. * @bits_per_word: Bits per word for SPI transfer. */ -static void pch_spi_set_bits_per_word(struct spi_master *master, +static void pch_spi_set_bits_per_word(struct spi_controller *host, u8 bits_per_word) { if (bits_per_word == 8) - pch_spi_setclr_reg(master, PCH_SPBRR, 0, SPBRR_SIZE_BIT); + pch_spi_setclr_reg(host, PCH_SPBRR, 0, SPBRR_SIZE_BIT); else - pch_spi_setclr_reg(master, PCH_SPBRR, SPBRR_SIZE_BIT, 0); + pch_spi_setclr_reg(host, PCH_SPBRR, SPBRR_SIZE_BIT, 0); } /** @@ -420,12 +420,12 @@ static void pch_spi_setup_transfer(struct spi_device *spi) u32 flags = 0; dev_dbg(&spi->dev, "%s SPBRR content =%x setting baud rate=%d\n", - __func__, pch_spi_readreg(spi->master, PCH_SPBRR), + __func__, pch_spi_readreg(spi->controller, PCH_SPBRR), spi->max_speed_hz); - pch_spi_set_baud_rate(spi->master, spi->max_speed_hz); + pch_spi_set_baud_rate(spi->controller, spi->max_speed_hz); /* set bits per word */ - pch_spi_set_bits_per_word(spi->master, spi->bits_per_word); + pch_spi_set_bits_per_word(spi->controller, spi->bits_per_word); if (!(spi->mode & SPI_LSB_FIRST)) flags |= SPCR_LSBF_BIT; @@ -433,29 +433,29 @@ static void pch_spi_setup_transfer(struct spi_device *spi) flags |= SPCR_CPOL_BIT; if (spi->mode & SPI_CPHA) flags |= SPCR_CPHA_BIT; - pch_spi_setclr_reg(spi->master, PCH_SPCR, flags, + pch_spi_setclr_reg(spi->controller, PCH_SPCR, flags, (SPCR_LSBF_BIT | SPCR_CPOL_BIT | SPCR_CPHA_BIT)); /* Clear the FIFO by toggling FICLR to 1 and back to 0 */ - pch_spi_clear_fifo(spi->master); + pch_spi_clear_fifo(spi->controller); } /** * pch_spi_reset() - Clears SPI registers - * @master: Pointer to struct spi_master. + * @host: Pointer to struct spi_controller. */ -static void pch_spi_reset(struct spi_master *master) +static void pch_spi_reset(struct spi_controller *host) { /* write 1 to reset SPI */ - pch_spi_writereg(master, PCH_SRST, 0x1); + pch_spi_writereg(host, PCH_SRST, 0x1); /* clear reset */ - pch_spi_writereg(master, PCH_SRST, 0x0); + pch_spi_writereg(host, PCH_SRST, 0x0); } static int pch_spi_transfer(struct spi_device *pspi, struct spi_message *pmsg) { - struct pch_spi_data *data = spi_master_get_devdata(pspi->master); + struct pch_spi_data *data = spi_controller_get_devdata(pspi->controller); int retval; unsigned long flags; @@ -524,15 +524,15 @@ static void pch_spi_set_tx(struct pch_spi_data *data, int *bpw) /* set baud rate if needed */ if (data->cur_trans->speed_hz) { - dev_dbg(&data->master->dev, "%s:setting baud rate\n", __func__); - pch_spi_set_baud_rate(data->master, data->cur_trans->speed_hz); + dev_dbg(&data->host->dev, "%s:setting baud rate\n", __func__); + pch_spi_set_baud_rate(data->host, data->cur_trans->speed_hz); } /* set bits per word if needed */ if (data->cur_trans->bits_per_word && (data->current_msg->spi->bits_per_word != data->cur_trans->bits_per_word)) { - dev_dbg(&data->master->dev, "%s:set bits per word\n", __func__); - pch_spi_set_bits_per_word(data->master, + dev_dbg(&data->host->dev, "%s:set bits per word\n", __func__); + pch_spi_set_bits_per_word(data->host, data->cur_trans->bits_per_word); *bpw = data->cur_trans->bits_per_word; } else { @@ -590,13 +590,13 @@ static void pch_spi_set_tx(struct pch_spi_data *data, int *bpw) if (n_writes > PCH_MAX_FIFO_DEPTH) n_writes = PCH_MAX_FIFO_DEPTH; - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "\n%s:Pulling down SSN low - writing 0x2 to SSNXCR\n", __func__); - pch_spi_writereg(data->master, PCH_SSNXCR, SSN_LOW); + pch_spi_writereg(data->host, PCH_SSNXCR, SSN_LOW); for (j = 0; j < n_writes; j++) - pch_spi_writereg(data->master, PCH_SPDWR, data->pkt_tx_buff[j]); + pch_spi_writereg(data->host, PCH_SPDWR, data->pkt_tx_buff[j]); /* update tx_index */ data->tx_index = j; @@ -609,13 +609,13 @@ static void pch_spi_set_tx(struct pch_spi_data *data, int *bpw) static void pch_spi_nomore_transfer(struct pch_spi_data *data) { struct spi_message *pmsg, *tmp; - dev_dbg(&data->master->dev, "%s called\n", __func__); + dev_dbg(&data->host->dev, "%s called\n", __func__); /* Invoke complete callback * [To the spi core..indicating end of transfer] */ data->current_msg->status = 0; if (data->current_msg->complete) { - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s:Invoking callback of SPI core\n", __func__); data->current_msg->complete(data->current_msg->context); } @@ -623,7 +623,7 @@ static void pch_spi_nomore_transfer(struct pch_spi_data *data) /* update status in global variable */ data->bcurrent_msg_processing = false; - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s:data->bcurrent_msg_processing = false\n", __func__); data->current_msg = NULL; @@ -638,11 +638,11 @@ static void pch_spi_nomore_transfer(struct pch_spi_data *data) * bpw;sfer requests in the current message or there are *more messages) */ - dev_dbg(&data->master->dev, "%s:Invoke queue_work\n", __func__); + dev_dbg(&data->host->dev, "%s:Invoke queue_work\n", __func__); schedule_work(&data->work); } else if (data->board_dat->suspend_sts || data->status == STATUS_EXITING) { - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s suspend/remove initiated, flushing queue\n", __func__); list_for_each_entry_safe(pmsg, tmp, data->queue.next, queue) { @@ -662,14 +662,14 @@ static void pch_spi_set_ir(struct pch_spi_data *data) /* enable interrupts, set threshold, enable SPI */ if ((data->bpw_len) > PCH_MAX_FIFO_DEPTH) /* set receive threshold to PCH_RX_THOLD */ - pch_spi_setclr_reg(data->master, PCH_SPCR, + pch_spi_setclr_reg(data->host, PCH_SPCR, PCH_RX_THOLD << SPCR_RFIC_FIELD | SPCR_FIE_BIT | SPCR_RFIE_BIT | SPCR_ORIE_BIT | SPCR_SPE_BIT, MASK_RFIC_SPCR_BITS | PCH_ALL); else /* set receive threshold to maximum */ - pch_spi_setclr_reg(data->master, PCH_SPCR, + pch_spi_setclr_reg(data->host, PCH_SPCR, PCH_RX_THOLD_MAX << SPCR_RFIC_FIELD | SPCR_FIE_BIT | SPCR_ORIE_BIT | SPCR_SPE_BIT, @@ -677,18 +677,18 @@ static void pch_spi_set_ir(struct pch_spi_data *data) /* Wait until the transfer completes; go to sleep after initiating the transfer. */ - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s:waiting for transfer to get over\n", __func__); wait_event_interruptible(data->wait, data->transfer_complete); /* clear all interrupts */ - pch_spi_writereg(data->master, PCH_SPSR, - pch_spi_readreg(data->master, PCH_SPSR)); + pch_spi_writereg(data->host, PCH_SPSR, + pch_spi_readreg(data->host, PCH_SPSR)); /* Disable interrupts and SPI transfer */ - pch_spi_setclr_reg(data->master, PCH_SPCR, 0, PCH_ALL | SPCR_SPE_BIT); + pch_spi_setclr_reg(data->host, PCH_SPCR, 0, PCH_ALL | SPCR_SPE_BIT); /* clear FIFO */ - pch_spi_clear_fifo(data->master); + pch_spi_clear_fifo(data->host); } static void pch_spi_copy_rx_data(struct pch_spi_data *data, int bpw) @@ -750,25 +750,25 @@ static int pch_spi_start_transfer(struct pch_spi_data *data) spin_lock_irqsave(&data->lock, flags); /* disable interrupts, SPI set enable */ - pch_spi_setclr_reg(data->master, PCH_SPCR, SPCR_SPE_BIT, PCH_ALL); + pch_spi_setclr_reg(data->host, PCH_SPCR, SPCR_SPE_BIT, PCH_ALL); spin_unlock_irqrestore(&data->lock, flags); /* Wait until the transfer completes; go to sleep after initiating the transfer. */ - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s:waiting for transfer to get over\n", __func__); rtn = wait_event_interruptible_timeout(data->wait, data->transfer_complete, msecs_to_jiffies(2 * HZ)); if (!rtn) - dev_err(&data->master->dev, + dev_err(&data->host->dev, "%s wait-event timeout\n", __func__); - dma_sync_sg_for_cpu(&data->master->dev, dma->sg_rx_p, dma->nent, + dma_sync_sg_for_cpu(&data->host->dev, dma->sg_rx_p, dma->nent, DMA_FROM_DEVICE); - dma_sync_sg_for_cpu(&data->master->dev, dma->sg_tx_p, dma->nent, + dma_sync_sg_for_cpu(&data->host->dev, dma->sg_tx_p, dma->nent, DMA_FROM_DEVICE); memset(data->dma.tx_buf_virt, 0, PAGE_SIZE); @@ -780,14 +780,14 @@ static int pch_spi_start_transfer(struct pch_spi_data *data) spin_lock_irqsave(&data->lock, flags); /* clear fifo threshold, disable interrupts, disable SPI transfer */ - pch_spi_setclr_reg(data->master, PCH_SPCR, 0, + pch_spi_setclr_reg(data->host, PCH_SPCR, 0, MASK_RFIC_SPCR_BITS | MASK_TFIC_SPCR_BITS | PCH_ALL | SPCR_SPE_BIT); /* clear all interrupts */ - pch_spi_writereg(data->master, PCH_SPSR, - pch_spi_readreg(data->master, PCH_SPSR)); + pch_spi_writereg(data->host, PCH_SPSR, + pch_spi_readreg(data->host, PCH_SPSR)); /* clear FIFO */ - pch_spi_clear_fifo(data->master); + pch_spi_clear_fifo(data->host); spin_unlock_irqrestore(&data->lock, flags); @@ -846,7 +846,7 @@ static void pch_spi_request_dma(struct pch_spi_data *data, int bpw) param->width = width; chan = dma_request_channel(mask, pch_spi_filter, param); if (!chan) { - dev_err(&data->master->dev, + dev_err(&data->host->dev, "ERROR: dma_request_channel FAILS(Tx)\n"); goto out; } @@ -860,7 +860,7 @@ static void pch_spi_request_dma(struct pch_spi_data *data, int bpw) param->width = width; chan = dma_request_channel(mask, pch_spi_filter, param); if (!chan) { - dev_err(&data->master->dev, + dev_err(&data->host->dev, "ERROR: dma_request_channel FAILS(Rx)\n"); dma_release_channel(dma->chan_tx); dma->chan_tx = NULL; @@ -913,9 +913,9 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) /* set baud rate if needed */ if (data->cur_trans->speed_hz) { - dev_dbg(&data->master->dev, "%s:setting baud rate\n", __func__); + dev_dbg(&data->host->dev, "%s:setting baud rate\n", __func__); spin_lock_irqsave(&data->lock, flags); - pch_spi_set_baud_rate(data->master, data->cur_trans->speed_hz); + pch_spi_set_baud_rate(data->host, data->cur_trans->speed_hz); spin_unlock_irqrestore(&data->lock, flags); } @@ -923,9 +923,9 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) if (data->cur_trans->bits_per_word && (data->current_msg->spi->bits_per_word != data->cur_trans->bits_per_word)) { - dev_dbg(&data->master->dev, "%s:set bits per word\n", __func__); + dev_dbg(&data->host->dev, "%s:set bits per word\n", __func__); spin_lock_irqsave(&data->lock, flags); - pch_spi_set_bits_per_word(data->master, + pch_spi_set_bits_per_word(data->host, data->cur_trans->bits_per_word); spin_unlock_irqrestore(&data->lock, flags); *bpw = data->cur_trans->bits_per_word; @@ -969,12 +969,12 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) size = data->bpw_len; rem = data->bpw_len; } - dev_dbg(&data->master->dev, "%s num=%d size=%d rem=%d\n", + dev_dbg(&data->host->dev, "%s num=%d size=%d rem=%d\n", __func__, num, size, rem); spin_lock_irqsave(&data->lock, flags); /* set receive fifo threshold and transmit fifo threshold */ - pch_spi_setclr_reg(data->master, PCH_SPCR, + pch_spi_setclr_reg(data->host, PCH_SPCR, ((size - 1) << SPCR_RFIC_FIELD) | (PCH_TX_THOLD << SPCR_TFIC_FIELD), MASK_RFIC_SPCR_BITS | MASK_TFIC_SPCR_BITS); @@ -1016,11 +1016,11 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) num, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_rx) { - dev_err(&data->master->dev, + dev_err(&data->host->dev, "%s:dmaengine_prep_slave_sg Failed\n", __func__); return; } - dma_sync_sg_for_device(&data->master->dev, sg, num, DMA_FROM_DEVICE); + dma_sync_sg_for_device(&data->host->dev, sg, num, DMA_FROM_DEVICE); desc_rx->callback = pch_dma_rx_complete; desc_rx->callback_param = data; dma->nent = num; @@ -1078,20 +1078,20 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) sg, num, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_tx) { - dev_err(&data->master->dev, + dev_err(&data->host->dev, "%s:dmaengine_prep_slave_sg Failed\n", __func__); return; } - dma_sync_sg_for_device(&data->master->dev, sg, num, DMA_TO_DEVICE); + dma_sync_sg_for_device(&data->host->dev, sg, num, DMA_TO_DEVICE); desc_tx->callback = NULL; desc_tx->callback_param = data; dma->nent = num; dma->desc_tx = desc_tx; - dev_dbg(&data->master->dev, "%s:Pulling down SSN low - writing 0x2 to SSNXCR\n", __func__); + dev_dbg(&data->host->dev, "%s:Pulling down SSN low - writing 0x2 to SSNXCR\n", __func__); spin_lock_irqsave(&data->lock, flags); - pch_spi_writereg(data->master, PCH_SSNXCR, SSN_LOW); + pch_spi_writereg(data->host, PCH_SSNXCR, SSN_LOW); desc_rx->tx_submit(desc_rx); desc_tx->tx_submit(desc_tx); spin_unlock_irqrestore(&data->lock, flags); @@ -1107,12 +1107,12 @@ static void pch_spi_process_messages(struct work_struct *pwork) int bpw; data = container_of(pwork, struct pch_spi_data, work); - dev_dbg(&data->master->dev, "%s data initialized\n", __func__); + dev_dbg(&data->host->dev, "%s data initialized\n", __func__); spin_lock(&data->lock); /* check if suspend has been initiated;if yes flush queue */ if (data->board_dat->suspend_sts || (data->status == STATUS_EXITING)) { - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s suspend/remove initiated, flushing queue\n", __func__); list_for_each_entry_safe(pmsg, tmp, data->queue.next, queue) { pmsg->status = -EIO; @@ -1132,7 +1132,7 @@ static void pch_spi_process_messages(struct work_struct *pwork) } data->bcurrent_msg_processing = true; - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s Set data->bcurrent_msg_processing= true\n", __func__); /* Get the message from the queue and delete it from there. */ @@ -1150,7 +1150,7 @@ static void pch_spi_process_messages(struct work_struct *pwork) if (data->use_dma) pch_spi_request_dma(data, data->current_msg->spi->bits_per_word); - pch_spi_writereg(data->master, PCH_SSNXCR, SSN_NO_CONTROL); + pch_spi_writereg(data->host, PCH_SSNXCR, SSN_NO_CONTROL); do { int cnt; /* If we are already processing a message get the next @@ -1161,14 +1161,14 @@ static void pch_spi_process_messages(struct work_struct *pwork) data->cur_trans = list_entry(data->current_msg->transfers.next, struct spi_transfer, transfer_list); - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s :Getting 1st transfer message\n", __func__); } else { data->cur_trans = list_entry(data->cur_trans->transfer_list.next, struct spi_transfer, transfer_list); - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s :Getting next transfer message\n", __func__); } @@ -1210,7 +1210,7 @@ static void pch_spi_process_messages(struct work_struct *pwork) data->cur_trans->len = data->save_total_len; data->current_msg->actual_length += data->cur_trans->len; - dev_dbg(&data->master->dev, + dev_dbg(&data->host->dev, "%s:data->current_msg->actual_length=%d\n", __func__, data->current_msg->actual_length); @@ -1229,7 +1229,7 @@ static void pch_spi_process_messages(struct work_struct *pwork) } while (data->cur_trans != NULL); out: - pch_spi_writereg(data->master, PCH_SSNXCR, SSN_HIGH); + pch_spi_writereg(data->host, PCH_SSNXCR, SSN_HIGH); if (data->use_dma) pch_spi_release_dma(data); } @@ -1248,7 +1248,7 @@ static int pch_spi_get_resources(struct pch_spi_board_data *board_dat, dev_dbg(&board_dat->pdev->dev, "%s ENTRY\n", __func__); /* reset PCH SPI h/w */ - pch_spi_reset(data->master); + pch_spi_reset(data->host); dev_dbg(&board_dat->pdev->dev, "%s pch_spi_reset invoked successfully\n", __func__); @@ -1297,22 +1297,22 @@ static int pch_alloc_dma_buf(struct pch_spi_board_data *board_dat, static int pch_spi_pd_probe(struct platform_device *plat_dev) { int ret; - struct spi_master *master; + struct spi_controller *host; struct pch_spi_board_data *board_dat = dev_get_platdata(&plat_dev->dev); struct pch_spi_data *data; dev_dbg(&plat_dev->dev, "%s:debug\n", __func__); - master = spi_alloc_master(&board_dat->pdev->dev, + host = spi_alloc_host(&board_dat->pdev->dev, sizeof(struct pch_spi_data)); - if (!master) { - dev_err(&plat_dev->dev, "spi_alloc_master[%d] failed.\n", + if (!host) { + dev_err(&plat_dev->dev, "spi_alloc_host[%d] failed.\n", plat_dev->id); return -ENOMEM; } - data = spi_master_get_devdata(master); - data->master = master; + data = spi_controller_get_devdata(host); + data->host = host; platform_set_drvdata(plat_dev, data); @@ -1330,13 +1330,13 @@ static int pch_spi_pd_probe(struct platform_device *plat_dev) dev_dbg(&plat_dev->dev, "[ch%d] remap_addr=%p\n", plat_dev->id, data->io_remap_addr); - /* initialize members of SPI master */ - master->num_chipselect = PCH_MAX_CS; - master->transfer = pch_spi_transfer; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST; - master->bits_per_word_mask = SPI_BPW_MASK(8) | SPI_BPW_MASK(16); - master->max_speed_hz = PCH_MAX_BAUDRATE; - master->flags = SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX; + /* initialize members of SPI host */ + host->num_chipselect = PCH_MAX_CS; + host->transfer = pch_spi_transfer; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST; + host->bits_per_word_mask = SPI_BPW_MASK(8) | SPI_BPW_MASK(16); + host->max_speed_hz = PCH_MAX_BAUDRATE; + host->flags = SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX; data->board_dat = board_dat; data->plat_dev = plat_dev; @@ -1365,25 +1365,25 @@ static int pch_spi_pd_probe(struct platform_device *plat_dev) } data->irq_reg_sts = true; - pch_spi_set_master_mode(master); + pch_spi_set_host_mode(host); if (use_dma) { dev_info(&plat_dev->dev, "Use DMA for data transfers\n"); ret = pch_alloc_dma_buf(board_dat, data); if (ret) - goto err_spi_register_master; + goto err_spi_register_controller; } - ret = spi_register_master(master); + ret = spi_register_controller(host); if (ret != 0) { dev_err(&plat_dev->dev, - "%s spi_register_master FAILED\n", __func__); - goto err_spi_register_master; + "%s spi_register_controller FAILED\n", __func__); + goto err_spi_register_controller; } return 0; -err_spi_register_master: +err_spi_register_controller: pch_free_dma_buf(board_dat, data); free_irq(board_dat->pdev->irq, data); err_request_irq: @@ -1391,7 +1391,7 @@ err_request_irq: err_spi_get_resources: pci_iounmap(board_dat->pdev, data->io_remap_addr); err_pci_iomap: - spi_master_put(master); + spi_controller_put(host); return ret; } @@ -1427,13 +1427,13 @@ static void pch_spi_pd_remove(struct platform_device *plat_dev) /* disable interrupts & free IRQ */ if (data->irq_reg_sts) { /* disable interrupts */ - pch_spi_setclr_reg(data->master, PCH_SPCR, 0, PCH_ALL); + pch_spi_setclr_reg(data->host, PCH_SPCR, 0, PCH_ALL); data->irq_reg_sts = false; free_irq(board_dat->pdev->irq, data); } pci_iounmap(board_dat->pdev, data->io_remap_addr); - spi_unregister_master(data->master); + spi_unregister_controller(data->host); } #ifdef CONFIG_PM static int pch_spi_pd_suspend(struct platform_device *pd_dev, @@ -1463,8 +1463,8 @@ static int pch_spi_pd_suspend(struct platform_device *pd_dev, /* Free IRQ */ if (data->irq_reg_sts) { /* disable all interrupts */ - pch_spi_setclr_reg(data->master, PCH_SPCR, 0, PCH_ALL); - pch_spi_reset(data->master); + pch_spi_setclr_reg(data->host, PCH_SPCR, 0, PCH_ALL); + pch_spi_reset(data->host); free_irq(board_dat->pdev->irq, data); data->irq_reg_sts = false; @@ -1498,8 +1498,8 @@ static int pch_spi_pd_resume(struct platform_device *pd_dev) } /* reset PCH SPI h/w */ - pch_spi_reset(data->master); - pch_spi_set_master_mode(data->master); + pch_spi_reset(data->host); + pch_spi_set_host_mode(data->host); data->irq_reg_sts = true; } return 0; From 4c2ee0991013ca8a32bb093a017d460204790112 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:23 +0800 Subject: [PATCH 0634/1562] spi: uniphier: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-19-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-uniphier.c | 194 ++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index f5344527af0b..4a18cf896194 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -26,7 +26,7 @@ struct uniphier_spi_priv { void __iomem *base; dma_addr_t base_dma_addr; struct clk *clk; - struct spi_master *master; + struct spi_controller *host; struct completion xfer_done; int error; @@ -127,7 +127,7 @@ static inline void uniphier_spi_irq_disable(struct uniphier_spi_priv *priv, static void uniphier_spi_set_mode(struct spi_device *spi) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(spi->master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(spi->controller); u32 val1, val2; /* @@ -180,7 +180,7 @@ static void uniphier_spi_set_mode(struct spi_device *spi) static void uniphier_spi_set_transfer_size(struct spi_device *spi, int size) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(spi->master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(spi->controller); u32 val; val = readl(priv->base + SSI_TXWDS); @@ -198,7 +198,7 @@ static void uniphier_spi_set_transfer_size(struct spi_device *spi, int size) static void uniphier_spi_set_baudrate(struct spi_device *spi, unsigned int speed) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(spi->master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(spi->controller); u32 val, ckdiv; /* @@ -217,7 +217,7 @@ static void uniphier_spi_set_baudrate(struct spi_device *spi, static void uniphier_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(spi->master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(spi->controller); u32 val; priv->error = 0; @@ -333,7 +333,7 @@ static void uniphier_spi_fill_tx_fifo(struct uniphier_spi_priv *priv) static void uniphier_spi_set_cs(struct spi_device *spi, bool enable) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(spi->master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(spi->controller); u32 val; val = readl(priv->base + SSI_FPS); @@ -346,16 +346,16 @@ static void uniphier_spi_set_cs(struct spi_device *spi, bool enable) writel(val, priv->base + SSI_FPS); } -static bool uniphier_spi_can_dma(struct spi_master *master, +static bool uniphier_spi_can_dma(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); unsigned int bpw = bytes_per_word(priv->bits_per_word); - if ((!master->dma_tx && !master->dma_rx) - || (!master->dma_tx && t->tx_buf) - || (!master->dma_rx && t->rx_buf)) + if ((!host->dma_tx && !host->dma_rx) + || (!host->dma_tx && t->tx_buf) + || (!host->dma_rx && t->rx_buf)) return false; return DIV_ROUND_UP(t->len, bpw) > SSI_FIFO_DEPTH; @@ -363,33 +363,33 @@ static bool uniphier_spi_can_dma(struct spi_master *master, static void uniphier_spi_dma_rxcb(void *data) { - struct spi_master *master = data; - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct spi_controller *host = data; + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); int state = atomic_fetch_andnot(SSI_DMA_RX_BUSY, &priv->dma_busy); uniphier_spi_irq_disable(priv, SSI_IE_RXRE); if (!(state & SSI_DMA_TX_BUSY)) - spi_finalize_current_transfer(master); + spi_finalize_current_transfer(host); } static void uniphier_spi_dma_txcb(void *data) { - struct spi_master *master = data; - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct spi_controller *host = data; + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); int state = atomic_fetch_andnot(SSI_DMA_TX_BUSY, &priv->dma_busy); uniphier_spi_irq_disable(priv, SSI_IE_TXRE); if (!(state & SSI_DMA_RX_BUSY)) - spi_finalize_current_transfer(master); + spi_finalize_current_transfer(host); } -static int uniphier_spi_transfer_one_dma(struct spi_master *master, +static int uniphier_spi_transfer_one_dma(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); struct dma_async_tx_descriptor *rxdesc = NULL, *txdesc = NULL; int buswidth; @@ -412,23 +412,23 @@ static int uniphier_spi_transfer_one_dma(struct spi_master *master, .src_maxburst = SSI_FIFO_BURST_NUM, }; - dmaengine_slave_config(master->dma_rx, &rxconf); + dmaengine_slave_config(host->dma_rx, &rxconf); rxdesc = dmaengine_prep_slave_sg( - master->dma_rx, + host->dma_rx, t->rx_sg.sgl, t->rx_sg.nents, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!rxdesc) goto out_err_prep; rxdesc->callback = uniphier_spi_dma_rxcb; - rxdesc->callback_param = master; + rxdesc->callback_param = host; uniphier_spi_irq_enable(priv, SSI_IE_RXRE); atomic_or(SSI_DMA_RX_BUSY, &priv->dma_busy); dmaengine_submit(rxdesc); - dma_async_issue_pending(master->dma_rx); + dma_async_issue_pending(host->dma_rx); } if (priv->tx_buf) { @@ -439,23 +439,23 @@ static int uniphier_spi_transfer_one_dma(struct spi_master *master, .dst_maxburst = SSI_FIFO_BURST_NUM, }; - dmaengine_slave_config(master->dma_tx, &txconf); + dmaengine_slave_config(host->dma_tx, &txconf); txdesc = dmaengine_prep_slave_sg( - master->dma_tx, + host->dma_tx, t->tx_sg.sgl, t->tx_sg.nents, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!txdesc) goto out_err_prep; txdesc->callback = uniphier_spi_dma_txcb; - txdesc->callback_param = master; + txdesc->callback_param = host; uniphier_spi_irq_enable(priv, SSI_IE_TXRE); atomic_or(SSI_DMA_TX_BUSY, &priv->dma_busy); dmaengine_submit(txdesc); - dma_async_issue_pending(master->dma_tx); + dma_async_issue_pending(host->dma_tx); } /* signal that we need to wait for completion */ @@ -463,17 +463,17 @@ static int uniphier_spi_transfer_one_dma(struct spi_master *master, out_err_prep: if (rxdesc) - dmaengine_terminate_sync(master->dma_rx); + dmaengine_terminate_sync(host->dma_rx); return -EINVAL; } -static int uniphier_spi_transfer_one_irq(struct spi_master *master, +static int uniphier_spi_transfer_one_irq(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); - struct device *dev = master->dev.parent; + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); + struct device *dev = host->dev.parent; unsigned long time_left; reinit_completion(&priv->xfer_done); @@ -495,11 +495,11 @@ static int uniphier_spi_transfer_one_irq(struct spi_master *master, return priv->error; } -static int uniphier_spi_transfer_one_poll(struct spi_master *master, +static int uniphier_spi_transfer_one_poll(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); int loop = SSI_POLL_TIMEOUT_US * 10; while (priv->tx_bytes) { @@ -520,14 +520,14 @@ static int uniphier_spi_transfer_one_poll(struct spi_master *master, return 0; irq_transfer: - return uniphier_spi_transfer_one_irq(master, spi, t); + return uniphier_spi_transfer_one_irq(host, spi, t); } -static int uniphier_spi_transfer_one(struct spi_master *master, +static int uniphier_spi_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); unsigned long threshold; bool use_dma; @@ -537,9 +537,9 @@ static int uniphier_spi_transfer_one(struct spi_master *master, uniphier_spi_setup_transfer(spi, t); - use_dma = master->can_dma ? master->can_dma(master, spi, t) : false; + use_dma = host->can_dma ? host->can_dma(host, spi, t) : false; if (use_dma) - return uniphier_spi_transfer_one_dma(master, spi, t); + return uniphier_spi_transfer_one_dma(host, spi, t); /* * If the transfer operation will take longer than @@ -548,33 +548,33 @@ static int uniphier_spi_transfer_one(struct spi_master *master, threshold = DIV_ROUND_UP(SSI_POLL_TIMEOUT_US * priv->speed_hz, USEC_PER_SEC * BITS_PER_BYTE); if (t->len > threshold) - return uniphier_spi_transfer_one_irq(master, spi, t); + return uniphier_spi_transfer_one_irq(host, spi, t); else - return uniphier_spi_transfer_one_poll(master, spi, t); + return uniphier_spi_transfer_one_poll(host, spi, t); } -static int uniphier_spi_prepare_transfer_hardware(struct spi_master *master) +static int uniphier_spi_prepare_transfer_hardware(struct spi_controller *host) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); writel(SSI_CTL_EN, priv->base + SSI_CTL); return 0; } -static int uniphier_spi_unprepare_transfer_hardware(struct spi_master *master) +static int uniphier_spi_unprepare_transfer_hardware(struct spi_controller *host) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); writel(0, priv->base + SSI_CTL); return 0; } -static void uniphier_spi_handle_err(struct spi_master *master, +static void uniphier_spi_handle_err(struct spi_controller *host, struct spi_message *msg) { - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); u32 val; /* stop running spi transfer */ @@ -587,12 +587,12 @@ static void uniphier_spi_handle_err(struct spi_master *master, uniphier_spi_irq_disable(priv, SSI_IE_ALL_MASK); if (atomic_read(&priv->dma_busy) & SSI_DMA_TX_BUSY) { - dmaengine_terminate_async(master->dma_tx); + dmaengine_terminate_async(host->dma_tx); atomic_andnot(SSI_DMA_TX_BUSY, &priv->dma_busy); } if (atomic_read(&priv->dma_busy) & SSI_DMA_RX_BUSY) { - dmaengine_terminate_async(master->dma_rx); + dmaengine_terminate_async(host->dma_rx); atomic_andnot(SSI_DMA_RX_BUSY, &priv->dma_busy); } } @@ -641,7 +641,7 @@ done: static int uniphier_spi_probe(struct platform_device *pdev) { struct uniphier_spi_priv *priv; - struct spi_master *master; + struct spi_controller *host; struct resource *res; struct dma_slave_caps caps; u32 dma_tx_burst = 0, dma_rx_burst = 0; @@ -649,20 +649,20 @@ static int uniphier_spi_probe(struct platform_device *pdev) int irq; int ret; - master = spi_alloc_master(&pdev->dev, sizeof(*priv)); - if (!master) + host = spi_alloc_host(&pdev->dev, sizeof(*priv)); + if (!host) return -ENOMEM; - platform_set_drvdata(pdev, master); + platform_set_drvdata(pdev, host); - priv = spi_master_get_devdata(master); - priv->master = master; + priv = spi_controller_get_devdata(host); + priv->host = host; priv->is_save_param = false; priv->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(priv->base)) { ret = PTR_ERR(priv->base); - goto out_master_put; + goto out_host_put; } priv->base_dma_addr = res->start; @@ -670,12 +670,12 @@ static int uniphier_spi_probe(struct platform_device *pdev) if (IS_ERR(priv->clk)) { dev_err(&pdev->dev, "failed to get clock\n"); ret = PTR_ERR(priv->clk); - goto out_master_put; + goto out_host_put; } ret = clk_prepare_enable(priv->clk); if (ret) - goto out_master_put; + goto out_host_put; irq = platform_get_irq(pdev, 0); if (irq < 0) { @@ -694,35 +694,35 @@ static int uniphier_spi_probe(struct platform_device *pdev) clk_rate = clk_get_rate(priv->clk); - master->max_speed_hz = DIV_ROUND_UP(clk_rate, SSI_MIN_CLK_DIVIDER); - master->min_speed_hz = DIV_ROUND_UP(clk_rate, SSI_MAX_CLK_DIVIDER); - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; - master->dev.of_node = pdev->dev.of_node; - master->bus_num = pdev->id; - master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32); + host->max_speed_hz = DIV_ROUND_UP(clk_rate, SSI_MIN_CLK_DIVIDER); + host->min_speed_hz = DIV_ROUND_UP(clk_rate, SSI_MAX_CLK_DIVIDER); + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; + host->dev.of_node = pdev->dev.of_node; + host->bus_num = pdev->id; + host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32); - master->set_cs = uniphier_spi_set_cs; - master->transfer_one = uniphier_spi_transfer_one; - master->prepare_transfer_hardware + host->set_cs = uniphier_spi_set_cs; + host->transfer_one = uniphier_spi_transfer_one; + host->prepare_transfer_hardware = uniphier_spi_prepare_transfer_hardware; - master->unprepare_transfer_hardware + host->unprepare_transfer_hardware = uniphier_spi_unprepare_transfer_hardware; - master->handle_err = uniphier_spi_handle_err; - master->can_dma = uniphier_spi_can_dma; + host->handle_err = uniphier_spi_handle_err; + host->can_dma = uniphier_spi_can_dma; - master->num_chipselect = 1; - master->flags = SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX; + host->num_chipselect = 1; + host->flags = SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX; - master->dma_tx = dma_request_chan(&pdev->dev, "tx"); - if (IS_ERR_OR_NULL(master->dma_tx)) { - if (PTR_ERR(master->dma_tx) == -EPROBE_DEFER) { + host->dma_tx = dma_request_chan(&pdev->dev, "tx"); + if (IS_ERR_OR_NULL(host->dma_tx)) { + if (PTR_ERR(host->dma_tx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto out_disable_clk; } - master->dma_tx = NULL; + host->dma_tx = NULL; dma_tx_burst = INT_MAX; } else { - ret = dma_get_slave_caps(master->dma_tx, &caps); + ret = dma_get_slave_caps(host->dma_tx, &caps); if (ret) { dev_err(&pdev->dev, "failed to get TX DMA capacities: %d\n", ret); @@ -731,16 +731,16 @@ static int uniphier_spi_probe(struct platform_device *pdev) dma_tx_burst = caps.max_burst; } - master->dma_rx = dma_request_chan(&pdev->dev, "rx"); - if (IS_ERR_OR_NULL(master->dma_rx)) { - if (PTR_ERR(master->dma_rx) == -EPROBE_DEFER) { + host->dma_rx = dma_request_chan(&pdev->dev, "rx"); + if (IS_ERR_OR_NULL(host->dma_rx)) { + if (PTR_ERR(host->dma_rx) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto out_release_dma; } - master->dma_rx = NULL; + host->dma_rx = NULL; dma_rx_burst = INT_MAX; } else { - ret = dma_get_slave_caps(master->dma_rx, &caps); + ret = dma_get_slave_caps(host->dma_rx, &caps); if (ret) { dev_err(&pdev->dev, "failed to get RX DMA capacities: %d\n", ret); @@ -749,41 +749,41 @@ static int uniphier_spi_probe(struct platform_device *pdev) dma_rx_burst = caps.max_burst; } - master->max_dma_len = min(dma_tx_burst, dma_rx_burst); + host->max_dma_len = min(dma_tx_burst, dma_rx_burst); - ret = devm_spi_register_master(&pdev->dev, master); + ret = devm_spi_register_controller(&pdev->dev, host); if (ret) goto out_release_dma; return 0; out_release_dma: - if (!IS_ERR_OR_NULL(master->dma_rx)) { - dma_release_channel(master->dma_rx); - master->dma_rx = NULL; + if (!IS_ERR_OR_NULL(host->dma_rx)) { + dma_release_channel(host->dma_rx); + host->dma_rx = NULL; } - if (!IS_ERR_OR_NULL(master->dma_tx)) { - dma_release_channel(master->dma_tx); - master->dma_tx = NULL; + if (!IS_ERR_OR_NULL(host->dma_tx)) { + dma_release_channel(host->dma_tx); + host->dma_tx = NULL; } out_disable_clk: clk_disable_unprepare(priv->clk); -out_master_put: - spi_master_put(master); +out_host_put: + spi_controller_put(host); return ret; } static void uniphier_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct uniphier_spi_priv *priv = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct uniphier_spi_priv *priv = spi_controller_get_devdata(host); - if (master->dma_tx) - dma_release_channel(master->dma_tx); - if (master->dma_rx) - dma_release_channel(master->dma_rx); + if (host->dma_tx) + dma_release_channel(host->dma_tx); + if (host->dma_rx) + dma_release_channel(host->dma_rx); clk_disable_unprepare(priv->clk); } From 4e4856e721041fb6e7386369433a4850b34dde1e Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:24 +0800 Subject: [PATCH 0635/1562] spi: xcomm: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-20-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-xcomm.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/spi/spi-xcomm.c b/drivers/spi/spi-xcomm.c index a3d57554f5ba..63354dd3110f 100644 --- a/drivers/spi/spi-xcomm.c +++ b/drivers/spi/spi-xcomm.c @@ -132,10 +132,10 @@ static int spi_xcomm_txrx_bufs(struct spi_xcomm *spi_xcomm, return t->len; } -static int spi_xcomm_transfer_one(struct spi_master *master, +static int spi_xcomm_transfer_one(struct spi_controller *host, struct spi_message *msg) { - struct spi_xcomm *spi_xcomm = spi_master_get_devdata(master); + struct spi_xcomm *spi_xcomm = spi_controller_get_devdata(host); unsigned int settings = spi_xcomm->settings; struct spi_device *spi = msg->spi; unsigned cs_change = 0; @@ -197,7 +197,7 @@ static int spi_xcomm_transfer_one(struct spi_master *master, spi_xcomm_chipselect(spi_xcomm, spi, false); msg->status = status; - spi_finalize_current_message(master); + spi_finalize_current_message(host); return status; } @@ -205,27 +205,27 @@ static int spi_xcomm_transfer_one(struct spi_master *master, static int spi_xcomm_probe(struct i2c_client *i2c) { struct spi_xcomm *spi_xcomm; - struct spi_master *master; + struct spi_controller *host; int ret; - master = spi_alloc_master(&i2c->dev, sizeof(*spi_xcomm)); - if (!master) + host = spi_alloc_host(&i2c->dev, sizeof(*spi_xcomm)); + if (!host) return -ENOMEM; - spi_xcomm = spi_master_get_devdata(master); + spi_xcomm = spi_controller_get_devdata(host); spi_xcomm->i2c = i2c; - master->num_chipselect = 16; - master->mode_bits = SPI_CPHA | SPI_CPOL | SPI_3WIRE; - master->bits_per_word_mask = SPI_BPW_MASK(8); - master->flags = SPI_CONTROLLER_HALF_DUPLEX; - master->transfer_one_message = spi_xcomm_transfer_one; - master->dev.of_node = i2c->dev.of_node; - i2c_set_clientdata(i2c, master); + host->num_chipselect = 16; + host->mode_bits = SPI_CPHA | SPI_CPOL | SPI_3WIRE; + host->bits_per_word_mask = SPI_BPW_MASK(8); + host->flags = SPI_CONTROLLER_HALF_DUPLEX; + host->transfer_one_message = spi_xcomm_transfer_one; + host->dev.of_node = i2c->dev.of_node; + i2c_set_clientdata(i2c, host); - ret = devm_spi_register_master(&i2c->dev, master); + ret = devm_spi_register_controller(&i2c->dev, host); if (ret < 0) - spi_master_put(master); + spi_controller_put(host); return ret; } From 709b785a377c06535a98663a227fa82f61b08aec Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:25 +0800 Subject: [PATCH 0636/1562] spi: xilinx: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-21-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-xilinx.c | 58 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c index 8e6e3876aa9a..12355957be97 100644 --- a/drivers/spi/spi-xilinx.c +++ b/drivers/spi/spi-xilinx.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Xilinx SPI controller driver (master mode only) + * Xilinx SPI controller driver (host mode only) * * Author: MontaVista Software, Inc. * source@mvista.com @@ -83,7 +83,7 @@ struct xilinx_spi { void __iomem *regs; /* virt. address of the control registers */ int irq; - bool force_irq; /* force irq to setup master inhibit */ + bool force_irq; /* force irq to setup host inhibit */ u8 *rx_ptr; /* pointer in the Tx buffer */ const u8 *tx_ptr; /* pointer in the Rx buffer */ u8 bytes_per_word; @@ -174,10 +174,10 @@ static void xspi_init_hw(struct xilinx_spi *xspi) regs_base + XIPIF_V123B_IIER_OFFSET); /* Disable the global IPIF interrupt */ xspi->write_fn(0, regs_base + XIPIF_V123B_DGIER_OFFSET); - /* Deselect the slave on the SPI bus */ + /* Deselect the Target on the SPI bus */ xspi->write_fn(0xffff, regs_base + XSPI_SSR_OFFSET); - /* Disable the transmitter, enable Manual Slave Select Assertion, - * put SPI controller into master mode, and enable it */ + /* Disable the transmitter, enable Manual Target Select Assertion, + * put SPI controller into host mode, and enable it */ xspi->write_fn(XSPI_CR_MANUAL_SSELECT | XSPI_CR_MASTER_MODE | XSPI_CR_ENABLE | XSPI_CR_TXFIFO_RESET | XSPI_CR_RXFIFO_RESET, regs_base + XSPI_CR_OFFSET); @@ -185,12 +185,12 @@ static void xspi_init_hw(struct xilinx_spi *xspi) static void xilinx_spi_chipselect(struct spi_device *spi, int is_on) { - struct xilinx_spi *xspi = spi_master_get_devdata(spi->master); + struct xilinx_spi *xspi = spi_controller_get_devdata(spi->controller); u16 cr; u32 cs; if (is_on == BITBANG_CS_INACTIVE) { - /* Deselect the slave on the SPI bus */ + /* Deselect the target on the SPI bus */ xspi->write_fn(xspi->cs_inactive, xspi->regs + XSPI_SSR_OFFSET); return; } @@ -225,7 +225,7 @@ static void xilinx_spi_chipselect(struct spi_device *spi, int is_on) static int xilinx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t) { - struct xilinx_spi *xspi = spi_master_get_devdata(spi->master); + struct xilinx_spi *xspi = spi_controller_get_devdata(spi->controller); if (spi->mode & SPI_CS_HIGH) xspi->cs_inactive &= ~BIT(spi_get_chipselect(spi, 0)); @@ -237,7 +237,7 @@ static int xilinx_spi_setup_transfer(struct spi_device *spi, static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t) { - struct xilinx_spi *xspi = spi_master_get_devdata(spi->master); + struct xilinx_spi *xspi = spi_controller_get_devdata(spi->controller); int remaining_words; /* the number of words left to transfer */ bool use_irq = false; u16 cr = 0; @@ -335,9 +335,9 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t) } -/* This driver supports single master mode only. Hence Tx FIFO Empty +/* This driver supports single host mode only. Hence Tx FIFO Empty * is the only interrupt we care about. - * Receive FIFO Overrun, Transmit FIFO Underrun, Mode Fault, and Slave Mode + * Receive FIFO Overrun, Transmit FIFO Underrun, Mode Fault, and Target Mode * Fault are not to happen. */ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id) @@ -393,7 +393,7 @@ static int xilinx_spi_probe(struct platform_device *pdev) struct xspi_platform_data *pdata; struct resource *res; int ret, num_cs = 0, bits_per_word; - struct spi_master *master; + struct spi_controller *host; bool force_irq = false; u32 tmp; u8 i; @@ -415,26 +415,26 @@ static int xilinx_spi_probe(struct platform_device *pdev) if (!num_cs) { dev_err(&pdev->dev, - "Missing slave select configuration data\n"); + "Missing target select configuration data\n"); return -EINVAL; } if (num_cs > XILINX_SPI_MAX_CS) { - dev_err(&pdev->dev, "Invalid number of spi slaves\n"); + dev_err(&pdev->dev, "Invalid number of spi targets\n"); return -EINVAL; } - master = devm_spi_alloc_master(&pdev->dev, sizeof(struct xilinx_spi)); - if (!master) + host = devm_spi_alloc_host(&pdev->dev, sizeof(struct xilinx_spi)); + if (!host) return -ENODEV; /* the spi->mode bits understood by this driver: */ - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST | SPI_LOOP | - SPI_CS_HIGH; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST | SPI_LOOP | + SPI_CS_HIGH; - xspi = spi_master_get_devdata(master); + xspi = spi_controller_get_devdata(host); xspi->cs_inactive = 0xffffffff; - xspi->bitbang.master = master; + xspi->bitbang.master = host; xspi->bitbang.chipselect = xilinx_spi_chipselect; xspi->bitbang.setup_transfer = xilinx_spi_setup_transfer; xspi->bitbang.txrx_bufs = xilinx_spi_txrx_bufs; @@ -444,9 +444,9 @@ static int xilinx_spi_probe(struct platform_device *pdev) if (IS_ERR(xspi->regs)) return PTR_ERR(xspi->regs); - master->bus_num = pdev->id; - master->num_chipselect = num_cs; - master->dev.of_node = pdev->dev.of_node; + host->bus_num = pdev->id; + host->num_chipselect = num_cs; + host->dev.of_node = pdev->dev.of_node; /* * Detect endianess on the IP via loop bit in CR. Detection @@ -466,7 +466,7 @@ static int xilinx_spi_probe(struct platform_device *pdev) xspi->write_fn = xspi_write32_be; } - master->bits_per_word_mask = SPI_BPW_MASK(bits_per_word); + host->bits_per_word_mask = SPI_BPW_MASK(bits_per_word); xspi->bytes_per_word = bits_per_word / 8; xspi->buffer_size = xilinx_spi_find_buffer_size(xspi); @@ -496,17 +496,17 @@ static int xilinx_spi_probe(struct platform_device *pdev) if (pdata) { for (i = 0; i < pdata->num_devices; i++) - spi_new_device(master, pdata->devices + i); + spi_new_device(host, pdata->devices + i); } - platform_set_drvdata(pdev, master); + platform_set_drvdata(pdev, host); return 0; } static void xilinx_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct xilinx_spi *xspi = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct xilinx_spi *xspi = spi_controller_get_devdata(host); void __iomem *regs_base = xspi->regs; spi_bitbang_stop(&xspi->bitbang); @@ -516,7 +516,7 @@ static void xilinx_spi_remove(struct platform_device *pdev) /* Disable the global IPIF interrupt */ xspi->write_fn(0, regs_base + XIPIF_V123B_DGIER_OFFSET); - spi_master_put(xspi->bitbang.master); + spi_controller_put(xspi->bitbang.master); } /* work with hotplug and coldplug */ From 1633ffd290c77eb6e2c5500a25faf9fc2640b0d1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:26 +0800 Subject: [PATCH 0637/1562] spi: xlp: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-22-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-xlp.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/spi/spi-xlp.c b/drivers/spi/spi-xlp.c index 3b91cdd5ae21..49302364b7bd 100644 --- a/drivers/spi/spi-xlp.c +++ b/drivers/spi/spi-xlp.c @@ -95,7 +95,7 @@ struct xlp_spi_priv { int rx_len; /* rx xfer length */ int txerrors; /* TXFIFO underflow count */ int rxerrors; /* RXFIFO overflow count */ - int cs; /* slave device chip select */ + int cs; /* target device chip select */ u32 spi_clk; /* spi clock frequency */ bool cmd_cont; /* cs active */ struct completion done; /* completion notification */ @@ -138,7 +138,7 @@ static int xlp_spi_setup(struct spi_device *spi) u32 fdiv, cfg; int cs; - xspi = spi_master_get_devdata(spi->master); + xspi = spi_controller_get_devdata(spi->controller); cs = spi_get_chipselect(spi, 0); /* * The value of fdiv must be between 4 and 65535. @@ -343,17 +343,17 @@ static int xlp_spi_txrx_bufs(struct xlp_spi_priv *xs, struct spi_transfer *t) return bytesleft; } -static int xlp_spi_transfer_one(struct spi_master *master, +static int xlp_spi_transfer_one(struct spi_controller *host, struct spi_device *spi, struct spi_transfer *t) { - struct xlp_spi_priv *xspi = spi_master_get_devdata(master); + struct xlp_spi_priv *xspi = spi_controller_get_devdata(host); int ret = 0; xspi->cs = spi_get_chipselect(spi, 0); xspi->dev = spi->dev; - if (spi_transfer_is_last(master, t)) + if (spi_transfer_is_last(host, t)) xspi->cmd_cont = 0; else xspi->cmd_cont = 1; @@ -361,13 +361,13 @@ static int xlp_spi_transfer_one(struct spi_master *master, if (xlp_spi_txrx_bufs(xspi, t)) ret = -EIO; - spi_finalize_current_transfer(master); + spi_finalize_current_transfer(host); return ret; } static int xlp_spi_probe(struct platform_device *pdev) { - struct spi_master *master; + struct spi_controller *host; struct xlp_spi_priv *xspi; struct clk *clk; int irq, err; @@ -398,28 +398,28 @@ static int xlp_spi_probe(struct platform_device *pdev) xspi->spi_clk = clk_get_rate(clk); - master = spi_alloc_master(&pdev->dev, 0); - if (!master) { - dev_err(&pdev->dev, "could not alloc master\n"); + host = spi_alloc_host(&pdev->dev, 0); + if (!host) { + dev_err(&pdev->dev, "could not alloc host\n"); return -ENOMEM; } - master->bus_num = 0; - master->num_chipselect = XLP_SPI_MAX_CS; - master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; - master->setup = xlp_spi_setup; - master->transfer_one = xlp_spi_transfer_one; - master->dev.of_node = pdev->dev.of_node; + host->bus_num = 0; + host->num_chipselect = XLP_SPI_MAX_CS; + host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + host->setup = xlp_spi_setup; + host->transfer_one = xlp_spi_transfer_one; + host->dev.of_node = pdev->dev.of_node; init_completion(&xspi->done); - spi_master_set_devdata(master, xspi); + spi_controller_set_devdata(host, xspi); xlp_spi_sysctl_setup(xspi); /* register spi controller */ - err = devm_spi_register_master(&pdev->dev, master); + err = devm_spi_register_controller(&pdev->dev, host); if (err) { - dev_err(&pdev->dev, "spi register master failed!\n"); - spi_master_put(master); + dev_err(&pdev->dev, "spi register host failed!\n"); + spi_controller_put(host); return err; } From 061851a0cc5dae1a992edd4d573a7dc514bb7fbe Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:27 +0800 Subject: [PATCH 0638/1562] spi: xtensa-xtfpga: switch to use modern name Change legacy name master to modern name host or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-23-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-xtensa-xtfpga.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/spi/spi-xtensa-xtfpga.c b/drivers/spi/spi-xtensa-xtfpga.c index dbd85d7a1526..3c7721894376 100644 --- a/drivers/spi/spi-xtensa-xtfpga.c +++ b/drivers/spi/spi-xtensa-xtfpga.c @@ -53,7 +53,7 @@ static inline void xtfpga_spi_wait_busy(struct xtfpga_spi *xspi) static u32 xtfpga_spi_txrx_word(struct spi_device *spi, unsigned nsecs, u32 v, u8 bits, unsigned flags) { - struct xtfpga_spi *xspi = spi_master_get_devdata(spi->master); + struct xtfpga_spi *xspi = spi_controller_get_devdata(spi->controller); xspi->data = (xspi->data << bits) | (v & GENMASK(bits - 1, 0)); xspi->data_sz += bits; @@ -71,7 +71,7 @@ static u32 xtfpga_spi_txrx_word(struct spi_device *spi, unsigned nsecs, static void xtfpga_spi_chipselect(struct spi_device *spi, int is_on) { - struct xtfpga_spi *xspi = spi_master_get_devdata(spi->master); + struct xtfpga_spi *xspi = spi_controller_get_devdata(spi->controller); WARN_ON(xspi->data_sz != 0); xspi->data_sz = 0; @@ -81,19 +81,19 @@ static int xtfpga_spi_probe(struct platform_device *pdev) { struct xtfpga_spi *xspi; int ret; - struct spi_master *master; + struct spi_controller *host; - master = devm_spi_alloc_master(&pdev->dev, sizeof(struct xtfpga_spi)); - if (!master) + host = devm_spi_alloc_host(&pdev->dev, sizeof(struct xtfpga_spi)); + if (!host) return -ENOMEM; - master->flags = SPI_CONTROLLER_NO_RX; - master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 16); - master->bus_num = pdev->dev.id; - master->dev.of_node = pdev->dev.of_node; + host->flags = SPI_CONTROLLER_NO_RX; + host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 16); + host->bus_num = pdev->dev.id; + host->dev.of_node = pdev->dev.of_node; - xspi = spi_master_get_devdata(master); - xspi->bitbang.master = master; + xspi = spi_controller_get_devdata(host); + xspi->bitbang.master = host; xspi->bitbang.chipselect = xtfpga_spi_chipselect; xspi->bitbang.txrx_word[SPI_MODE_0] = xtfpga_spi_txrx_word; xspi->regs = devm_platform_ioremap_resource(pdev, 0); @@ -113,17 +113,17 @@ static int xtfpga_spi_probe(struct platform_device *pdev) return ret; } - platform_set_drvdata(pdev, master); + platform_set_drvdata(pdev, host); return 0; } static void xtfpga_spi_remove(struct platform_device *pdev) { - struct spi_master *master = platform_get_drvdata(pdev); - struct xtfpga_spi *xspi = spi_master_get_devdata(master); + struct spi_controller *host = platform_get_drvdata(pdev); + struct xtfpga_spi *xspi = spi_controller_get_devdata(host); spi_bitbang_stop(&xspi->bitbang); - spi_master_put(master); + spi_controller_put(host); } MODULE_ALIAS("platform:" XTFPGA_SPI_NAME); From 178ebb0c505b0a35edb4fb2a0e23a1f29e1db14d Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:28 +0800 Subject: [PATCH 0639/1562] spi: zynq-qspi: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-24-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-zynq-qspi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/spi/spi-zynq-qspi.c b/drivers/spi/spi-zynq-qspi.c index 0db69a2a72ff..d6325c6be3d4 100644 --- a/drivers/spi/spi-zynq-qspi.c +++ b/drivers/spi/spi-zynq-qspi.c @@ -54,10 +54,10 @@ #define ZYNQ_QSPI_CONFIG_MSTREN_MASK BIT(0) /* Master Mode */ /* - * QSPI Configuration Register - Baud rate and slave select + * QSPI Configuration Register - Baud rate and target select * * These are the values used in the calculation of baud rate divisor and - * setting the slave select. + * setting the target select. */ #define ZYNQ_QSPI_CONFIG_BAUD_DIV_MAX GENMASK(2, 0) /* Baud rate maximum */ #define ZYNQ_QSPI_CONFIG_BAUD_DIV_SHIFT 3 /* Baud rate divisor shift */ @@ -164,14 +164,14 @@ static inline void zynq_qspi_write(struct zynq_qspi *xqspi, u32 offset, * * The default settings of the QSPI controller's configurable parameters on * reset are - * - Master mode + * - Host mode * - Baud rate divisor is set to 2 * - Tx threshold set to 1l Rx threshold set to 32 * - Flash memory interface mode enabled * - Size of the word to be transferred as 8 bit * This function performs the following actions * - Disable and clear all the interrupts - * - Enable manual slave select + * - Enable manual target select * - Enable manual start * - Deselect all the chip select lines * - Set the size of the word to be transferred as 32 bit @@ -289,7 +289,7 @@ static void zynq_qspi_txfifo_op(struct zynq_qspi *xqspi, unsigned int size) */ static void zynq_qspi_chipselect(struct spi_device *spi, bool assert) { - struct spi_controller *ctlr = spi->master; + struct spi_controller *ctlr = spi->controller; struct zynq_qspi *xqspi = spi_controller_get_devdata(ctlr); u32 config_reg; @@ -377,7 +377,7 @@ static int zynq_qspi_config_op(struct zynq_qspi *xqspi, struct spi_device *spi) */ static int zynq_qspi_setup_op(struct spi_device *spi) { - struct spi_controller *ctlr = spi->master; + struct spi_controller *ctlr = spi->controller; struct zynq_qspi *qspi = spi_controller_get_devdata(ctlr); if (ctlr->busy) @@ -525,7 +525,7 @@ static irqreturn_t zynq_qspi_irq(int irq, void *dev_id) static int zynq_qspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) { - struct zynq_qspi *xqspi = spi_controller_get_devdata(mem->spi->master); + struct zynq_qspi *xqspi = spi_controller_get_devdata(mem->spi->controller); int err = 0, i; u8 *tmpbuf; @@ -637,7 +637,7 @@ static int zynq_qspi_probe(struct platform_device *pdev) struct zynq_qspi *xqspi; u32 num_cs; - ctlr = spi_alloc_master(&pdev->dev, sizeof(*xqspi)); + ctlr = spi_alloc_host(&pdev->dev, sizeof(*xqspi)); if (!ctlr) return -ENOMEM; @@ -647,14 +647,14 @@ static int zynq_qspi_probe(struct platform_device *pdev) xqspi->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(xqspi->regs)) { ret = PTR_ERR(xqspi->regs); - goto remove_master; + goto remove_ctlr; } xqspi->pclk = devm_clk_get(&pdev->dev, "pclk"); if (IS_ERR(xqspi->pclk)) { dev_err(&pdev->dev, "pclk clock not found.\n"); ret = PTR_ERR(xqspi->pclk); - goto remove_master; + goto remove_ctlr; } init_completion(&xqspi->data_completion); @@ -663,13 +663,13 @@ static int zynq_qspi_probe(struct platform_device *pdev) if (IS_ERR(xqspi->refclk)) { dev_err(&pdev->dev, "ref_clk clock not found.\n"); ret = PTR_ERR(xqspi->refclk); - goto remove_master; + goto remove_ctlr; } ret = clk_prepare_enable(xqspi->pclk); if (ret) { dev_err(&pdev->dev, "Unable to enable APB clock.\n"); - goto remove_master; + goto remove_ctlr; } ret = clk_prepare_enable(xqspi->refclk); @@ -715,7 +715,7 @@ static int zynq_qspi_probe(struct platform_device *pdev) ret = devm_spi_register_controller(&pdev->dev, ctlr); if (ret) { - dev_err(&pdev->dev, "spi_register_master failed\n"); + dev_err(&pdev->dev, "devm_spi_register_controller failed\n"); goto clk_dis_all; } @@ -725,7 +725,7 @@ clk_dis_all: clk_disable_unprepare(xqspi->refclk); clk_dis_pclk: clk_disable_unprepare(xqspi->pclk); -remove_master: +remove_ctlr: spi_controller_put(ctlr); return ret; From ca6f114372ae4d05387f0ccb5d4b2b1320bf22b3 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:29 +0800 Subject: [PATCH 0640/1562] spi: zynqmp-gqspi: switch to use modern name Change legacy name master/slave to modern name host/target or controller. No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-25-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-zynqmp-gqspi.c | 50 +++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index 9a46b2478f4e..99524a3c9f38 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Xilinx Zynq UltraScale+ MPSoC Quad-SPI (QSPI) controller driver - * (master mode only) + * (host mode only) * * Copyright (C) 2009 - 2015 Xilinx, Inc. */ @@ -235,21 +235,21 @@ static inline void zynqmp_gqspi_write(struct zynqmp_qspi *xqspi, u32 offset, } /** - * zynqmp_gqspi_selectslave - For selection of slave device + * zynqmp_gqspi_selecttarget - For selection of target device * @instanceptr: Pointer to the zynqmp_qspi structure - * @slavecs: For chip select - * @slavebus: To check which bus is selected- upper or lower + * @targetcs: For chip select + * @targetbus: To check which bus is selected- upper or lower */ -static void zynqmp_gqspi_selectslave(struct zynqmp_qspi *instanceptr, - u8 slavecs, u8 slavebus) +static void zynqmp_gqspi_selecttarget(struct zynqmp_qspi *instanceptr, + u8 targetcs, u8 targetbus) { /* * Bus and CS lines selected here will be updated in the instance and * used for subsequent GENFIFO entries during transfer. */ - /* Choose slave select line */ - switch (slavecs) { + /* Choose target select line */ + switch (targetcs) { case GQSPI_SELECT_FLASH_CS_BOTH: instanceptr->genfifocs = GQSPI_GENFIFO_CS_LOWER | GQSPI_GENFIFO_CS_UPPER; @@ -261,11 +261,11 @@ static void zynqmp_gqspi_selectslave(struct zynqmp_qspi *instanceptr, instanceptr->genfifocs = GQSPI_GENFIFO_CS_LOWER; break; default: - dev_warn(instanceptr->dev, "Invalid slave select\n"); + dev_warn(instanceptr->dev, "Invalid target select\n"); } /* Choose the bus */ - switch (slavebus) { + switch (targetbus) { case GQSPI_SELECT_FLASH_BUS_BOTH: instanceptr->genfifobus = GQSPI_GENFIFO_BUS_LOWER | GQSPI_GENFIFO_BUS_UPPER; @@ -277,7 +277,7 @@ static void zynqmp_gqspi_selectslave(struct zynqmp_qspi *instanceptr, instanceptr->genfifobus = GQSPI_GENFIFO_BUS_LOWER; break; default: - dev_warn(instanceptr->dev, "Invalid slave bus\n"); + dev_warn(instanceptr->dev, "Invalid target bus\n"); } } @@ -337,13 +337,13 @@ static void zynqmp_qspi_set_tapdelay(struct zynqmp_qspi *xqspi, u32 baudrateval) * * The default settings of the QSPI controller's configurable parameters on * reset are - * - Master mode + * - Host mode * - TX threshold set to 1 * - RX threshold set to 1 * - Flash memory interface mode enabled * This function performs the following actions * - Disable and clear all the interrupts - * - Enable manual slave select + * - Enable manual target select * - Enable manual start * - Deselect all the chip select lines * - Set the little endian mode of TX FIFO @@ -426,9 +426,9 @@ static void zynqmp_qspi_init_hw(struct zynqmp_qspi *xqspi) GQSPI_RX_FIFO_THRESHOLD); zynqmp_gqspi_write(xqspi, GQSPI_GF_THRESHOLD_OFST, GQSPI_GEN_FIFO_THRESHOLD_RESET_VAL); - zynqmp_gqspi_selectslave(xqspi, - GQSPI_SELECT_FLASH_CS_LOWER, - GQSPI_SELECT_FLASH_BUS_LOWER); + zynqmp_gqspi_selecttarget(xqspi, + GQSPI_SELECT_FLASH_CS_LOWER, + GQSPI_SELECT_FLASH_BUS_LOWER); /* Initialize DMA */ zynqmp_gqspi_write(xqspi, GQSPI_QSPIDMA_DST_CTRL_OFST, @@ -459,7 +459,7 @@ static void zynqmp_qspi_copy_read_data(struct zynqmp_qspi *xqspi, */ static void zynqmp_qspi_chipselect(struct spi_device *qspi, bool is_high) { - struct zynqmp_qspi *xqspi = spi_master_get_devdata(qspi->master); + struct zynqmp_qspi *xqspi = spi_controller_get_devdata(qspi->controller); ulong timeout; u32 genfifoentry = 0, statusreg; @@ -594,7 +594,7 @@ static int zynqmp_qspi_config_op(struct zynqmp_qspi *xqspi, */ static int zynqmp_qspi_setup_op(struct spi_device *qspi) { - struct spi_controller *ctlr = qspi->master; + struct spi_controller *ctlr = qspi->controller; struct zynqmp_qspi *xqspi = spi_controller_get_devdata(ctlr); if (ctlr->busy) @@ -1048,7 +1048,7 @@ static int zynqmp_qspi_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) { struct zynqmp_qspi *xqspi = spi_controller_get_devdata - (mem->spi->master); + (mem->spi->controller); int err = 0, i; u32 genfifoentry = 0; u16 opcode = op->cmd.opcode; @@ -1224,7 +1224,7 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) u32 num_cs; const struct qspi_platform_data *p_data; - ctlr = spi_alloc_master(&pdev->dev, sizeof(*xqspi)); + ctlr = spi_alloc_host(&pdev->dev, sizeof(*xqspi)); if (!ctlr) return -ENOMEM; @@ -1240,27 +1240,27 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) xqspi->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(xqspi->regs)) { ret = PTR_ERR(xqspi->regs); - goto remove_master; + goto remove_ctlr; } xqspi->pclk = devm_clk_get(&pdev->dev, "pclk"); if (IS_ERR(xqspi->pclk)) { dev_err(dev, "pclk clock not found.\n"); ret = PTR_ERR(xqspi->pclk); - goto remove_master; + goto remove_ctlr; } xqspi->refclk = devm_clk_get(&pdev->dev, "ref_clk"); if (IS_ERR(xqspi->refclk)) { dev_err(dev, "ref_clk clock not found.\n"); ret = PTR_ERR(xqspi->refclk); - goto remove_master; + goto remove_ctlr; } ret = clk_prepare_enable(xqspi->pclk); if (ret) { dev_err(dev, "Unable to enable APB clock.\n"); - goto remove_master; + goto remove_ctlr; } ret = clk_prepare_enable(xqspi->refclk); @@ -1346,7 +1346,7 @@ clk_dis_all: clk_disable_unprepare(xqspi->refclk); clk_dis_pclk: clk_disable_unprepare(xqspi->pclk); -remove_master: +remove_ctlr: spi_controller_put(ctlr); return ret; From a23271718e767e8b701693b140fcc021a4e90b1b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:30 +0800 Subject: [PATCH 0641/1562] spi: cs42l43: switch to use devm_spi_alloc_host() Switch to use modern name function devm_spi_alloc_host(). No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-26-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index d239fc5a49cc..f13073e12593 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -213,7 +213,7 @@ static int cs42l43_spi_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->ctlr = devm_spi_alloc_master(&pdev->dev, sizeof(*priv->ctlr)); + priv->ctlr = devm_spi_alloc_host(&pdev->dev, sizeof(*priv->ctlr)); if (!priv->ctlr) return -ENOMEM; From 4ac9ed81aaaab128b98855cd6005a52fa65dd4da Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 28 Nov 2023 17:30:31 +0800 Subject: [PATCH 0642/1562] spi: ljca: switch to use devm_spi_alloc_host() Switch to use modern name function devm_spi_alloc_host(). No functional changed. Signed-off-by: Yang Yingliang Link: https://msgid.link/r/20231128093031.3707034-27-yangyingliang@huawei.com Signed-off-by: Mark Brown --- drivers/spi/spi-ljca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-ljca.c b/drivers/spi/spi-ljca.c index c5a066c73817..1cc1422ddba0 100644 --- a/drivers/spi/spi-ljca.c +++ b/drivers/spi/spi-ljca.c @@ -223,7 +223,7 @@ static int ljca_spi_probe(struct auxiliary_device *auxdev, struct ljca_spi_dev *ljca_spi; int ret; - controller = devm_spi_alloc_master(&auxdev->dev, sizeof(*ljca_spi)); + controller = devm_spi_alloc_host(&auxdev->dev, sizeof(*ljca_spi)); if (!controller) return -ENOMEM; From 4649620d9404d3aceb25891c24bab77143e3f21c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 20:13:44 +0100 Subject: [PATCH 0643/1562] thermal: core: Make thermal_zone_device_unregister() return after freeing the zone Make thermal_zone_device_unregister() wait until all of the references to the given thermal zone object have been dropped and free it before returning. This guarantees that when thermal_zone_device_unregister() returns, there is no leftover activity regarding the thermal zone in question which is required by some of its callers (for instance, modular driver code that wants to know when it is safe to let the module go away). Subsequently, this will allow some confusing device_is_registered() checks to be dropped from the thermal sysfs and core code. Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Lukasz Luba Acked-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 6 +++++- include/linux/thermal.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 625ba07cbe2f..70a294d12187 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -822,7 +822,7 @@ static void thermal_release(struct device *dev) tz = to_thermal_zone(dev); thermal_zone_destroy_device_groups(tz); mutex_destroy(&tz->lock); - kfree(tz); + complete(&tz->removal); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { cdev = to_cooling_device(dev); @@ -1315,6 +1315,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t INIT_LIST_HEAD(&tz->thermal_instances); ida_init(&tz->ida); mutex_init(&tz->lock); + init_completion(&tz->removal); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; @@ -1494,6 +1495,9 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) put_device(&tz->device); thermal_notify_tz_delete(tz_id); + + wait_for_completion(&tz->removal); + kfree(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_unregister); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0ea99f50d57c..bedbaec9a42e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -117,6 +117,7 @@ struct thermal_cooling_device { * @id: unique id number for each thermal zone * @type: the thermal zone device type * @device: &struct device for this thermal zone + * @removal: removal completion * @trip_temp_attrs: attributes for trip points for sysfs: trip temperature * @trip_type_attrs: attributes for trip points for sysfs: trip type * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis @@ -156,6 +157,7 @@ struct thermal_zone_device { int id; char type[THERMAL_NAME_LENGTH]; struct device device; + struct completion removal; struct attribute_group trips_attribute_group; struct thermal_attr *trip_temp_attrs; struct thermal_attr *trip_type_attrs; From eeae55ed9c0a74604a49789e36b7cdf0ee8bd69c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 2 Dec 2023 02:09:28 +0800 Subject: [PATCH 0644/1562] intel_idle: Add Meteorlake support Add intel_idle support for MeteorLake. C1 and C1E states on Meteorlake are mutually exclusive, like Alderlake and Raptorlake, but they have little latency difference with measureable power difference, so always enable "C1E promotion" bit and expose C1E only. Expose C6 because it has less power compared with C1E, and smaller latency compared with C8/C10. Ignore C8 and expose C10, because C8 does not show latency advantage compared with C10. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index dcda0afecfc5..cfd0b24fd7f1 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -923,6 +923,35 @@ static struct cpuidle_state adl_l_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state mtl_l_cstates[] __initdata = { + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 420, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 310, + .target_residency = 930, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state gmt_cstates[] __initdata = { { .name = "C1", @@ -1349,6 +1378,10 @@ static const struct idle_cpu idle_cpu_adl_l __initconst = { .state_table = adl_l_cstates, }; +static const struct idle_cpu idle_cpu_mtl_l __initconst = { + .state_table = mtl_l_cstates, +}; + static const struct idle_cpu idle_cpu_gmt __initconst = { .state_table = gmt_cstates, }; @@ -1423,6 +1456,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &idle_cpu_mtl_l), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &idle_cpu_gmt), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr), From a1ca8295ee53a2fc57085fae26df37228c655791 Mon Sep 17 00:00:00 2001 From: Wang chaodong Date: Fri, 20 Oct 2023 16:51:06 +0800 Subject: [PATCH 0645/1562] PM: hibernate: Drop unnecessary local variable initialization It is not necessary to intialize the error variable in create_basic_memory_bitmaps(), because it is only read after being assigned a value. Signed-off-by: Wang chaodong [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 50a15408c3fc..71b2f12ed3b5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; if (forbidden_pages_map && free_pages_map) return 0; From bbeaa4691fa8682e2fe2e87f28d5fce39805fa68 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Fri, 27 Oct 2023 09:55:33 +0800 Subject: [PATCH 0646/1562] PM: hibernate: Do not initialize error in swap_write_page() 'error' first receives the function result before it is used, and it does not need to be assigned a value during definition. Signed-off-by: Li zeming [ rjw: Subject rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a2cb0babb5ec..68973ca2cf07 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -451,7 +451,7 @@ err_close: static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) From 4ac934b1aaa99e00ca25875d55094a4fe34e212d Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 24 Oct 2023 10:04:34 +0800 Subject: [PATCH 0647/1562] PM: hibernate: Do not initialize error in snapshot_write_next() The error variable in snapshot_write_next() gets a value before it is used, so don't initialize it to 0 upfront. Signed-off-by: Li zeming [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71b2f12ed3b5..e3e8f1c6e75f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2778,7 +2778,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; next: /* Check if we have already loaded the entire image */ From bb6ec2e9fd8b83b2db68a449754c899a211bf84b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 23 Oct 2023 19:42:45 +0100 Subject: [PATCH 0648/1562] tools/nolibc: Use linux/wait.h rather than duplicating it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux defines a few custom flags for waitpid() which aren't currently provided by nolibc, make them available to nolibc based programs by just including linux/wait.h where they are defined instead of defining our own copy of the flags. Signed-off-by: Mark Brown Signed-off-by: Willy Tarreau Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/types.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 8cfc4c860fa4..ad0ddaa89e50 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -12,6 +12,7 @@ #include /* for LINUX_REBOOT_* */ #include #include +#include /* Only the generic macros and types may be defined here. The arch-specific @@ -108,9 +109,6 @@ #define WTERMSIG(status) ((status) & 0x7f) #define WIFSIGNALED(status) ((status) - 1 < 0xff) -/* waitpid() flags */ -#define WNOHANG 1 - /* standard exit() codes */ #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 From bdeeeaba83682225a7bf5f100fe8652a59590d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 31 Oct 2023 21:36:58 +0100 Subject: [PATCH 0649/1562] selftests/nolibc: use EFI -bios for LoongArch qemu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu for LoongArch does not work properly with direct kernel boot. The kernel will panic during initialization and hang without any output. When booting in EFI mode everything work correctly. While users most likely don't have the LoongArch EFI binary installed at least an explicit error about 'file not found' is better than a hanging test without output that can never succeed. Link: https://lore.kernel.org/loongarch/1738d60a-df3a-4102-b1da-d16a29b6e06a@t-8ch.de/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20231031-nolibc-out-of-tree-v1-1-47c92f73590a@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 6c7040a75d81..28e616d64612 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -88,6 +88,13 @@ QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH = $(QEMU_ARCH_$(XARCH)) +QEMU_BIOS_DIR = /usr/share/edk2/ +QEMU_BIOS_loongarch = $(QEMU_BIOS_DIR)/loongarch64/OVMF_CODE.fd + +ifneq ($(QEMU_BIOS_$(XARCH)),) +QEMU_ARGS_BIOS = -bios $(QEMU_BIOS_$(XARCH)) +endif + # QEMU_ARGS : some arch-specific args to pass to qemu QEMU_ARGS_i386 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -101,7 +108,7 @@ QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390 = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS = $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_EXTRA) +QEMU_ARGS = $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. From 7263c9d9b67a9412fcfc2c90b259a28d55d0e970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 31 Oct 2023 21:36:59 +0100 Subject: [PATCH 0650/1562] selftests/nolibc: anchor paths in $(srcdir) if possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is easier to recognize paths from their well-known location in the source tree than having to resolve the relative path in ones head. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20231031-nolibc-out-of-tree-v1-2-47c92f73590a@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 28e616d64612..be89a4fe0e23 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -174,7 +174,7 @@ sysroot: sysroot/$(ARCH)/include sysroot/$(ARCH)/include: $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot - $(Q)$(MAKE) -C ../../../include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone + $(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone $(Q)mv sysroot/sysroot sysroot/$(ARCH) ifneq ($(NOLIBC_SYSROOT),0) @@ -184,7 +184,7 @@ nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include else nolibc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include ../../../include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc endif libc-test: nolibc-test.c nolibc-test-linkage.c From 69620b3a5bc5e6798724ab9bf0dd1b3c980a4949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 31 Oct 2023 21:37:00 +0100 Subject: [PATCH 0651/1562] selftests/nolibc: support out-of-tree builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Out of tree builds are much more convenient when building for multiple architectures or configurations in parallel. Only absolute O= parameters are supported as Makefile.include will always resolve relative paths in relation to $(srctree) instead of the current directory. Add a call to "make outputmakefile" to verify that the sourcetree is clean. This is based on Zhangjins out-of-tree patch. It extends that work for get_init_cpio support and also drops relative O= specifications explicitly. Link: https://lore.kernel.org/lkml/06d96bd81fe812a9718098a383678ad3beba98b1.1691215074.git.falcon@tinylab.org/ Co-developed-by: Zhangjin Wu Signed-off-by: Zhangjin Wu Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20231031-nolibc-out-of-tree-v1-3-47c92f73590a@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index be89a4fe0e23..4818ae4bdff4 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,9 +1,16 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for nolibc tests include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak # We need this for the "cc-option" macro. include ../../../build/Build.include +ifneq ($(O),) +ifneq ($(call is-absolute,$(O)),y) +$(error Only absolute O= parameters are supported) +endif +endif + # we're in ".../tools/testing/selftests/nolibc" ifeq ($(srctree),) srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) @@ -14,6 +21,8 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif +objtree ?= $(srctree) + # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain # and the Qemu program used. It is copied as-is into ARCH except for @@ -52,7 +61,7 @@ IMAGE_ppc64le = arch/powerpc/boot/zImage IMAGE_riscv = arch/riscv/boot/Image IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi -IMAGE = $(IMAGE_$(XARCH)) +IMAGE = $(objtree)/$(IMAGE_$(XARCH)) IMAGE_NAME = $(notdir $(IMAGE)) # default kernel configurations that appear to be usable @@ -174,6 +183,7 @@ sysroot: sysroot/$(ARCH)/include sysroot/$(ARCH)/include: $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot + $(Q)$(MAKE) -C $(srctree) outputmakefile $(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone $(Q)mv sysroot/sysroot sysroot/$(ARCH) @@ -206,7 +216,7 @@ run-user: nolibc-test $(Q)$(REPORT) $(CURDIR)/run.out initramfs.cpio: kernel nolibc-test - $(QUIET_GEN)echo 'file /init nolibc-test 755 0 0' | $(srctree)/usr/gen_init_cpio - > initramfs.cpio + $(QUIET_GEN)echo 'file /init nolibc-test 755 0 0' | $(objtree)/usr/gen_init_cpio - > initramfs.cpio initramfs: nolibc-test $(QUIET_MKDIR)mkdir -p initramfs @@ -224,12 +234,12 @@ kernel-standalone: initramfs # run the tests after building the kernel run: kernel initramfs.cpio - $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" $(Q)$(REPORT) $(CURDIR)/run.out # re-run the tests from an existing kernel rerun: - $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" $(Q)$(REPORT) $(CURDIR)/run.out # report with existing test log From 91f16451593b4709036e72a6aaccadc16d87a339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 5 Nov 2023 10:23:05 +0100 Subject: [PATCH 0652/1562] selftests/nolibc: add script to run testsuite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The script can run the testsuite for multiple architectures and provides an overall test report. Furthermore it can automatically download crosstools from mirrors.kernel.org if requested by the user. Example execution: $ ./run-tests.sh i386: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success x86_64: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success arm64: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success arm: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success mips: 162 test(s): 161 passed, 1 skipped, 0 failed => status: warning ppc: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success ppc64: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success ppc64le: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success riscv: 162 test(s): 162 passed, 0 skipped, 0 failed => status: success s390: 162 test(s): 161 passed, 1 skipped, 0 failed => status: warning loongarch: 162 test(s): 161 passed, 1 skipped, 0 failed => status: warning Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20231105-nolibc-run-tests-v1-1-b59ff770a978@weissschuh.net --- tools/testing/selftests/nolibc/.gitignore | 1 + tools/testing/selftests/nolibc/run-tests.sh | 153 ++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100755 tools/testing/selftests/nolibc/run-tests.sh diff --git a/tools/testing/selftests/nolibc/.gitignore b/tools/testing/selftests/nolibc/.gitignore index 5119f9f7afd2..35d247a0d5bd 100644 --- a/tools/testing/selftests/nolibc/.gitignore +++ b/tools/testing/selftests/nolibc/.gitignore @@ -3,4 +3,5 @@ /libc-test /nolibc-test /run.out +/run.out.* /sysroot/ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh new file mode 100755 index 000000000000..1bf020d49f54 --- /dev/null +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test runner for nolibc tests + +set -e + +trap 'echo Aborting...' 'ERR' + +crosstool_version=13.2.0 +hostarch=x86_64 +nproc=$(( $(nproc) + 2)) +cache_dir="${XDG_CACHE_HOME:-"$HOME"/.cache}" +download_location="${cache_dir}/crosstools/" +build_location="$(realpath "${cache_dir}"/nolibc-tests/)" +perform_download=0 +archs="i386 x86_64 arm64 arm mips ppc ppc64 ppc64le riscv s390 loongarch" + +TEMP=$(getopt -o 'j:d:c:b:a:ph' -n "$0" -- "$@") + +eval set -- "$TEMP" +unset TEMP + +print_usage() { + cat < + +Known architectures: + ${archs} + +Options: + -j [N] Allow N jobs at once (default: ${nproc}) + -p Allow download of toolchains + -d [DIR] Download location for toolchains (default: ${download_location}) + -c [VERSION] Version of toolchains to use (default: ${crosstool_version}) + -a [ARCH] Host architecture of toolchains to use (default: ${hostarch}) + -b [DIR] Build location (default: ${build_location}) +EOF +} + +while true; do + case "$1" in + '-j') + nproc="$2" + shift 2; continue ;; + '-p') + perform_download=1 + shift; continue ;; + '-d') + download_location="$2" + shift 2; continue ;; + '-c') + crosstool_version="$2" + shift 2; continue ;; + '-a') + hostarch="$2" + shift 2; continue ;; + '-b') + build_location="$(realpath "$2")" + shift 2; continue ;; + '-h') + print_usage + exit 0 + ;; + '--') + shift; break ;; + *) + echo 'Internal error!' >&2; exit 1 ;; + esac +done + +if [[ -n "$*" ]]; then + archs="$*" +fi + +crosstool_arch() { + case "$1" in + arm64) echo aarch64;; + ppc) echo powerpc;; + ppc64) echo powerpc64;; + ppc64le) echo powerpc64;; + riscv) echo riscv64;; + loongarch) echo loongarch64;; + mips*) echo mips;; + *) echo "$1";; + esac +} + +crosstool_abi() { + case "$1" in + arm) echo linux-gnueabi;; + *) echo linux;; + esac +} + +download_crosstool() { + arch="$(crosstool_arch "$1")" + abi="$(crosstool_abi "$1")" + + archive_name="${hostarch}-gcc-${crosstool_version}-nolibc-${arch}-${abi}.tar.gz" + url="https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/${hostarch}/${crosstool_version}/${archive_name}" + archive="${download_location}${archive_name}" + stamp="${archive}.stamp" + + [ -f "${stamp}" ] && return + + echo "Downloading crosstools ${arch} ${crosstool_version}" + mkdir -p "${download_location}" + curl -o "${archive}" --fail --continue-at - "${url}" + tar -C "${download_location}" -xf "${archive}" + touch "${stamp}" +} + +# capture command output, print it on failure +# mimics chronic(1) from moreutils +function swallow_output() { + if ! OUTPUT="$("$@" 2>&1)"; then + echo "$OUTPUT" + return 1 + fi + return 0 +} + +test_arch() { + arch=$1 + ct_arch=$(crosstool_arch "$arch") + ct_abi=$(crosstool_abi "$1") + cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-") + build_dir="${build_location}/${arch}" + MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") + + mkdir -p "$build_dir" + if [ ! -f "${build_dir}/.config" ]; then + swallow_output "${MAKE[@]}" defconfig + fi + printf '%-15s' "$arch:" + swallow_output "${MAKE[@]}" run V=1 + cp run.out run.out."${arch}" + "${MAKE[@]}" report | grep passed +} + +if [ "$perform_download" -ne 0 ]; then + for arch in $archs; do + download_crosstool "$arch" + done +fi + +for arch in $archs; do + test_arch "$arch" +done From 48946c5aa7a848c7dfc2151267af92956f492f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 5 Nov 2023 11:07:05 +0100 Subject: [PATCH 0653/1562] tools/nolibc: error out on unsupported architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an architecture is unsupported arch.h would silently continue. This leads to a lot of followup errors because my_syscallX() is not defined and the startup code is missing. Avoid these confusing errors and fail the build early with a clear error message and location. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/include/nolibc/arch.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index e276fb0680af..2f72ccac0378 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -33,6 +33,8 @@ #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" +#else +#error Unsupported Architecture #endif #endif /* _NOLIBC_ARCH_H */ From aa68a5a83a0acc4c1babcb4f8be49261514ab65c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 20 Oct 2023 09:30:33 +0200 Subject: [PATCH 0654/1562] tools/nolibc: move MIPS ABI validation into arch-mips.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When installing nolibc to a sysroot arch.h is not used so its ABI check is bypassed. This makes is possible to compile nolibc with a non O32 ABI which may build but can not run. Move the check into arch-mips.h so it will always be evaluated. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/include/nolibc/arch-mips.h | 4 ++++ tools/include/nolibc/arch.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 4ab6fa54beee..3a2c76716b83 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -10,6 +10,10 @@ #include "compiler.h" #include "crt.h" +#if !defined(_ABIO32) +#error Unsupported MIPS ABI +#endif + /* Syscalls for MIPS ABI O32 : * - WARNING! there's always a delayed slot! * - WARNING again, the syntax is different, registers take a '$' and numbers diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index 2f72ccac0378..c8f4e5d3add9 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -23,7 +23,7 @@ #include "arch-arm.h" #elif defined(__aarch64__) #include "arch-aarch64.h" -#elif defined(__mips__) && defined(_ABIO32) +#elif defined(__mips__) #include "arch-mips.h" #elif defined(__powerpc__) #include "arch-powerpc.h" From c4c20a7d6ef9d5a4330a63c1fd4553dac5f93c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 20 Oct 2023 09:36:18 +0200 Subject: [PATCH 0655/1562] selftests/nolibc: use XARCH for MIPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS has many different configurations prepare the support of additional ones by moving the build of MIPS to the generic XARCH infrastructure. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/testing/selftests/nolibc/Makefile | 12 +++++++----- tools/testing/selftests/nolibc/run-tests.sh | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 4818ae4bdff4..07c94df50dd1 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -40,12 +40,14 @@ objtree ?= $(srctree) # configure default variants for target kernel supported architectures XARCH_powerpc = ppc +XARCH_mips = mips32le XARCH = $(or $(XARCH_$(ARCH)),$(ARCH)) # map from user input variants to their kernel supported architectures ARCH_ppc = powerpc ARCH_ppc64 = powerpc ARCH_ppc64le = powerpc +ARCH_mips32le = mips ARCH := $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -54,7 +56,7 @@ IMAGE_x86_64 = arch/x86/boot/bzImage IMAGE_x86 = arch/x86/boot/bzImage IMAGE_arm64 = arch/arm64/boot/Image IMAGE_arm = arch/arm/boot/zImage -IMAGE_mips = vmlinuz +IMAGE_mips32le = vmlinuz IMAGE_ppc = vmlinux IMAGE_ppc64 = vmlinux IMAGE_ppc64le = arch/powerpc/boot/zImage @@ -70,7 +72,7 @@ DEFCONFIG_x86_64 = defconfig DEFCONFIG_x86 = defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig -DEFCONFIG_mips = malta_defconfig +DEFCONFIG_mips32le = malta_defconfig DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig @@ -88,7 +90,7 @@ QEMU_ARCH_x86_64 = x86_64 QEMU_ARCH_x86 = x86_64 QEMU_ARCH_arm64 = aarch64 QEMU_ARCH_arm = arm -QEMU_ARCH_mips = mipsel # works with malta_defconfig +QEMU_ARCH_mips32le = mipsel # works with malta_defconfig QEMU_ARCH_ppc = ppc QEMU_ARCH_ppc64 = ppc64 QEMU_ARCH_ppc64le = ppc64 @@ -110,7 +112,7 @@ QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $( QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_mips = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_mips32le = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64 = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -134,7 +136,7 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390 = -m64 -CFLAGS_mips = -EL +CFLAGS_mips32le = -EL CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ $(call cc-option,-fno-stack-protector) \ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 1bf020d49f54..8f2c3bc572cb 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -14,7 +14,7 @@ cache_dir="${XDG_CACHE_HOME:-"$HOME"/.cache}" download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 -archs="i386 x86_64 arm64 arm mips ppc ppc64 ppc64le riscv s390 loongarch" +archs="i386 x86_64 arm64 arm mips32le ppc ppc64 ppc64le riscv s390 loongarch" TEMP=$(getopt -o 'j:d:c:b:a:ph' -n "$0" -- "$@") From bb503f5f01546c65fc510787b2964de3b62b6646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 20 Oct 2023 13:39:39 +0200 Subject: [PATCH 0656/1562] selftests/nolibc: explicitly specify ABI for MIPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit More ABIs exist, for better clarity specify it explicitly everywhere. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 07c94df50dd1..6c7bc6ad3387 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -136,7 +136,7 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390 = -m64 -CFLAGS_mips32le = -EL +CFLAGS_mips32le = -EL -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ $(call cc-option,-fno-stack-protector) \ From 3ab1e9db098a41dcfc0d93ae964bd5901e4ef1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 20 Oct 2023 13:34:27 +0200 Subject: [PATCH 0657/1562] selftests/nolibc: extraconfig support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow some postprocessing of defconfig files. Suggested-by: Zhangjin Wu Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/testing/selftests/nolibc/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 6c7bc6ad3387..4983718866b6 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -81,6 +81,8 @@ DEFCONFIG_s390 = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG = $(DEFCONFIG_$(XARCH)) +EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) + # optional tests to run (default = all) TEST = @@ -227,6 +229,10 @@ initramfs: nolibc-test defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ + $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ + $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ + fi kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null From b4b9fb91da99035ce59ac74c9a27562afddfc21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 20 Oct 2023 11:04:10 +0200 Subject: [PATCH 0658/1562] selftests/nolibc: add configuration for mipso32be MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow testing MIPS O32 big endian. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/testing/selftests/nolibc/Makefile | 7 +++++++ tools/testing/selftests/nolibc/run-tests.sh | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 4983718866b6..546859c9f7ac 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -48,6 +48,7 @@ ARCH_ppc = powerpc ARCH_ppc64 = powerpc ARCH_ppc64le = powerpc ARCH_mips32le = mips +ARCH_mips32be = mips ARCH := $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -57,6 +58,7 @@ IMAGE_x86 = arch/x86/boot/bzImage IMAGE_arm64 = arch/arm64/boot/Image IMAGE_arm = arch/arm/boot/zImage IMAGE_mips32le = vmlinuz +IMAGE_mips32be = vmlinuz IMAGE_ppc = vmlinux IMAGE_ppc64 = vmlinux IMAGE_ppc64le = arch/powerpc/boot/zImage @@ -73,6 +75,7 @@ DEFCONFIG_x86 = defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig @@ -81,6 +84,7 @@ DEFCONFIG_s390 = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG = $(DEFCONFIG_$(XARCH)) +EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) # optional tests to run (default = all) @@ -93,6 +97,7 @@ QEMU_ARCH_x86 = x86_64 QEMU_ARCH_arm64 = aarch64 QEMU_ARCH_arm = arm QEMU_ARCH_mips32le = mipsel # works with malta_defconfig +QEMU_ARCH_mips32be = mips QEMU_ARCH_ppc = ppc QEMU_ARCH_ppc64 = ppc64 QEMU_ARCH_ppc64le = ppc64 @@ -115,6 +120,7 @@ QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $( QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_mips32le = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_mips32be = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64 = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -139,6 +145,7 @@ CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390 = -m64 CFLAGS_mips32le = -EL -mabi=32 +CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ $(call cc-option,-fno-stack-protector) \ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 8f2c3bc572cb..3a1eaccfbd8d 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -14,7 +14,7 @@ cache_dir="${XDG_CACHE_HOME:-"$HOME"/.cache}" download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 -archs="i386 x86_64 arm64 arm mips32le ppc ppc64 ppc64le riscv s390 loongarch" +archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" TEMP=$(getopt -o 'j:d:c:b:a:ph' -n "$0" -- "$@") From 07f679b50252dc9e3d0c19aca5801f82c230c527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 5 Nov 2023 15:16:38 +0100 Subject: [PATCH 0659/1562] selftests/nolibc: fix testcase status alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Center-align all possible status reports. Before OK and FAIL were center-aligned in relation to each other but SKIPPED and FAILED would be left-aligned. Before: 7 environ_addr = <0x7fffef3e7c50> [OK] 8 environ_envp = <0x7fffef3e7c58> [FAIL] 9 environ_auxv [SKIPPED] 10 environ_total [SKIPPED] 11 environ_HOME = <0x7fffef3e99bd> [OK] 12 auxv_addr [SKIPPED] 13 auxv_AT_UID = 1000 [OK] After: 7 environ_addr = <0x7ffff13b00a0> [OK] 8 environ_envp = <0x7ffff13b00a8> [FAIL] 9 environ_auxv [SKIPPED] 10 environ_total [SKIPPED] 11 environ_HOME = <0x7ffff13b19bd> [OK] 12 auxv_addr [SKIPPED] 13 auxv_AT_UID = 1000 [OK] Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 2f10541e6f38..e173014f6b66 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -150,11 +150,11 @@ static void result(int llen, enum RESULT r) const char *msg; if (r == OK) - msg = " [OK]"; + msg = " [OK]"; else if (r == SKIPPED) msg = "[SKIPPED]"; else - msg = "[FAIL]"; + msg = " [FAIL]"; if (llen < 64) putcharn(' ', 64 - llen); From d7233e2b758b927695e63e078fe55abcc6ecd3a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 15 Sep 2077 02:13:52 +0200 Subject: [PATCH 0660/1562] selftests/nolibc: introduce QEMU_ARCH_USER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While ppc64le shares the same executable with regular ppc64 the user variant needs has a dedicated executable. Introduce a new QEMU_ARCH_USER Makefile variable to accommodate that. Fixes: 17362f3d0bd3 ("selftests/nolibc: use qemu-system-ppc64 for ppc64le") Link: https://lore.kernel.org/r/20770915-nolibc-run-user-v1-1-3caec61726dc@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 546859c9f7ac..47f0fb52fe55 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -106,6 +106,9 @@ QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH = $(QEMU_ARCH_$(XARCH)) +QEMU_ARCH_USER_ppc64le = ppc64le +QEMU_ARCH_USER = $(or $(QEMU_ARCH_USER_$(XARCH)),$(QEMU_ARCH_$(XARCH))) + QEMU_BIOS_DIR = /usr/share/edk2/ QEMU_BIOS_loongarch = $(QEMU_BIOS_DIR)/loongarch64/OVMF_CODE.fd @@ -223,7 +226,7 @@ run-nolibc-test: nolibc-test # qemu user-land test run-user: nolibc-test - $(Q)qemu-$(QEMU_ARCH) ./nolibc-test > "$(CURDIR)/run.out" || : + $(Q)qemu-$(QEMU_ARCH_USER) ./nolibc-test > "$(CURDIR)/run.out" || : $(Q)$(REPORT) $(CURDIR)/run.out initramfs.cpio: kernel nolibc-test From 8bcf9a485541fe0079483162496db1add932689b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 15 Sep 2077 02:13:53 +0200 Subject: [PATCH 0661/1562] selftests/nolibc: run-tests.sh: enable testing via qemu-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu-user is faster than a full system test. Link: https://lore.kernel.org/r/20770915-nolibc-run-user-v1-2-3caec61726dc@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 3a1eaccfbd8d..c0a5a7cea9fa 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -14,9 +14,10 @@ cache_dir="${XDG_CACHE_HOME:-"$HOME"/.cache}" download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 +test_mode=system archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" -TEMP=$(getopt -o 'j:d:c:b:a:ph' -n "$0" -- "$@") +TEMP=$(getopt -o 'j:d:c:b:a:m:ph' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -38,6 +39,7 @@ Options: -c [VERSION] Version of toolchains to use (default: ${crosstool_version}) -a [ARCH] Host architecture of toolchains to use (default: ${hostarch}) -b [DIR] Build location (default: ${build_location}) + -m [MODE] Test mode user/system (default: ${test_mode}) EOF } @@ -61,6 +63,9 @@ while true; do '-b') build_location="$(realpath "$2")" shift 2; continue ;; + '-m') + test_mode="$2" + shift 2; continue ;; '-h') print_usage exit 0 @@ -133,11 +138,22 @@ test_arch() { MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") mkdir -p "$build_dir" - if [ ! -f "${build_dir}/.config" ]; then + if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then swallow_output "${MAKE[@]}" defconfig fi + case "$test_mode" in + 'system') + test_target=run + ;; + 'user') + test_target=run-user + ;; + *) + echo "Unknown mode $test_mode" + exit 1 + esac printf '%-15s' "$arch:" - swallow_output "${MAKE[@]}" run V=1 + swallow_output "${MAKE[@]}" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed } From 544102458a8d1c33f9f5f99f9bda8e2b858bcb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 8 Nov 2023 19:14:43 +0100 Subject: [PATCH 0662/1562] tools/nolibc: mips: add support for PIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS requires some extra instructions to set up the $gp register for the with a pointer to the global data area. This isn't needed for non-PIC builds, but this patch enables the code unconditionally to prevent bitrot. Also enable PIC in one of the test configurations for ongoing validation. Link: https://lore.kernel.org/r/20231108-nolibc-pic-v2-1-4fb0d6284757@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-mips.h | 7 ++++++- tools/testing/selftests/nolibc/Makefile | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 3a2c76716b83..62cc50ef3288 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -184,8 +184,13 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ __asm__ volatile ( ".set push\n" ".set noreorder\n" - ".option pic0\n" + "bal 1f\n" /* prime $ra for .cpload */ + "nop\n" + "1:\n" + ".cpload $ra\n" "move $a0, $sp\n" /* save stack pointer to $a0, as arg1 of _start_c */ + "addiu $sp, $sp, -4\n" /* space for .cprestore to store $gp */ + ".cprestore 0\n" "li $t0, -8\n" "and $sp, $sp, $t0\n" /* $sp must be 8-byte aligned */ "addiu $sp, $sp, -16\n" /* the callee expects to save a0..a3 there */ diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 47f0fb52fe55..40dd95228051 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -147,7 +147,7 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390 = -m64 -CFLAGS_mips32le = -EL -mabi=32 +CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ From b9e64724cd8aeca9e7ab4523a92ccf2ba0cd1de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Nov 2023 08:27:59 +0100 Subject: [PATCH 0663/1562] selftests/nolibc: make result alignment more robust MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the check of the existing length into the function so it can't be forgotten by the caller. Also hardcode the padding character as only spaces are ever used. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index e173014f6b66..2b71fb5fae4e 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -130,11 +130,17 @@ static const char *errorname(int err) } } -static void putcharn(char c, size_t n) +static void align_result(size_t llen) { - char buf[64]; + const size_t align = 64; + char buf[align]; + size_t n; - memset(buf, c, n); + if (llen >= align) + return; + + n = align - llen; + memset(buf, ' ', n); buf[n] = '\0'; fputs(buf, stdout); } @@ -156,8 +162,7 @@ static void result(int llen, enum RESULT r) else msg = " [FAIL]"; - if (llen < 64) - putcharn(' ', 64 - llen); + align_result(llen); puts(msg); } From dece8476d6dda087cacb8e105bb70e02a9ef9387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Nov 2023 22:53:13 +0100 Subject: [PATCH 0664/1562] tools/nolibc: annotate va_list printf formats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __attribute__(format(printf)) can also be used for functions that take a va_list argument. As per the GCC docs: For functions where the arguments are not available to be checked (such as vprintf), specify the third parameter as zero. Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stdio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index d7ef43973916..16cd4d807251 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -212,7 +212,7 @@ char *fgets(char *s, int size, FILE *stream) * - %s * - unknown modifiers are ignored. */ -static __attribute__((unused)) +static __attribute__((unused, format(printf, 2, 0))) int vfprintf(FILE *stream, const char *fmt, va_list args) { char escape, lpref, c; @@ -318,7 +318,7 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) return written; } -static __attribute__((unused)) +static __attribute__((unused, format(printf, 1, 0))) int vprintf(const char *fmt, va_list args) { return vfprintf(stdout, fmt, args); From 825f404776b4f9d5f4a35545ea2d258bb16c0d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 25 Nov 2023 11:54:02 +0100 Subject: [PATCH 0665/1562] tools/nolibc: drop duplicated testcase ioctl_tiocinq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same testcase is present on the line above. Fixes: b4844fa0bdb4 ("selftests/nolibc: implement a few tests for various syscalls") Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 2b71fb5fae4e..783c2a97c4e3 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -910,7 +910,6 @@ int run_syscall(int min, int max) CASE_TEST(gettimeofday_tv_tz);EXPECT_SYSZR(1, gettimeofday(&tv, &tz)); break; CASE_TEST(getpagesize); EXPECT_SYSZR(1, test_getpagesize()); break; CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; - CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; CASE_TEST(link_dir); EXPECT_SYSER(euid0, link("/", "/blah"), -1, EPERM); break; From 7b20478b777c3be39a2b69b08a6c0b50c10105f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Nov 2023 23:22:52 +0100 Subject: [PATCH 0666/1562] tools/nolibc: drop custom definition of struct rusage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A future commit will include linux/resource.h, which will conflict with the private definition of struct rusage in nolibc. Avoid the conflict by dropping the private definition and use the one from the UAPI headers. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/lkml/20231123-nolibc-rlimit-v1-1-a428b131de2a@weissschuh.net/ Acked-by: Willy Tarreau --- tools/include/nolibc/types.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index ad0ddaa89e50..b26a5d0c417c 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -13,6 +13,7 @@ #include #include #include +#include /* Only the generic macros and types may be defined here. The arch-specific @@ -178,26 +179,6 @@ struct linux_dirent64 { char d_name[]; }; -/* needed by wait4() */ -struct rusage { - struct timeval ru_utime; - struct timeval ru_stime; - long ru_maxrss; - long ru_ixrss; - long ru_idrss; - long ru_isrss; - long ru_minflt; - long ru_majflt; - long ru_nswap; - long ru_inblock; - long ru_oublock; - long ru_msgsnd; - long ru_msgrcv; - long ru_nsignals; - long ru_nvcsw; - long ru_nivcsw; -}; - /* The format of the struct as returned by the libc to the application, which * significantly differs from the format returned by the stat() syscall flavours. */ From a0bb5f88fc3d72bc92c24a631f2c7794362efac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Nov 2023 23:49:43 +0100 Subject: [PATCH 0667/1562] tools/nolibc: add support for getrlimit/setrlimit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The implementation uses the prlimit64 systemcall as that is available on all architectures. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/lkml/20231123-nolibc-rlimit-v1-2-a428b131de2a@weissschuh.net/ Acked-by: Willy Tarreau --- tools/include/nolibc/sys.h | 38 ++++++++++++++++++++ tools/testing/selftests/nolibc/nolibc-test.c | 29 +++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 2f359cb03d10..dda9dffd1d74 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -21,6 +21,7 @@ #include /* for O_* and AT_* */ #include /* for statx() */ #include +#include #include "arch.h" #include "errno.h" @@ -898,6 +899,43 @@ int reboot(int cmd) } +/* + * int getrlimit(int resource, struct rlimit *rlim); + * int setrlimit(int resource, const struct rlimit *rlim); + */ + +static __attribute__((unused)) +int sys_prlimit64(pid_t pid, int resource, + const struct rlimit64 *new_limit, struct rlimit64 *old_limit) +{ + return my_syscall4(__NR_prlimit64, pid, resource, new_limit, old_limit); +} + +static __attribute__((unused)) +int getrlimit(int resource, struct rlimit *rlim) +{ + struct rlimit64 rlim64; + int ret; + + ret = __sysret(sys_prlimit64(0, resource, NULL, &rlim64)); + rlim->rlim_cur = rlim64.rlim_cur; + rlim->rlim_max = rlim64.rlim_max; + + return ret; +} + +static __attribute__((unused)) +int setrlimit(int resource, const struct rlimit *rlim) +{ + struct rlimit64 rlim64 = { + .rlim_cur = rlim->rlim_cur, + .rlim_max = rlim->rlim_max, + }; + + return __sysret(sys_prlimit64(0, resource, &rlim64, NULL)); +} + + /* * int sched_yield(void); */ diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 783c2a97c4e3..a0271ac313ee 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -839,6 +840,33 @@ int test_pipe(void) return !!memcmp(buf, msg, len); } +int test_rlimit(void) +{ + struct rlimit rlim = { + .rlim_cur = 1 << 20, + .rlim_max = 1 << 21, + }; + int ret; + + ret = setrlimit(RLIMIT_CORE, &rlim); + if (ret) + return -1; + + rlim.rlim_cur = 0; + rlim.rlim_max = 0; + + ret = getrlimit(RLIMIT_CORE, &rlim); + if (ret) + return -1; + + if (rlim.rlim_cur != 1 << 20) + return -1; + if (rlim.rlim_max != 1 << 21) + return -1; + + return 0; +} + /* Run syscall tests between IDs and . * Return 0 on success, non-zero on failure. @@ -928,6 +956,7 @@ int run_syscall(int min, int max) CASE_TEST(poll_fault); EXPECT_SYSER(1, poll(NULL, 1, 0), -1, EFAULT); break; CASE_TEST(prctl); EXPECT_SYSER(1, prctl(PR_SET_NAME, (unsigned long)NULL, 0, 0, 0), -1, EFAULT); break; CASE_TEST(read_badf); EXPECT_SYSER(1, read(-1, &tmp, 1), -1, EBADF); break; + CASE_TEST(rlimit); EXPECT_SYSZR(1, test_rlimit()); break; CASE_TEST(rmdir_blah); EXPECT_SYSER(1, rmdir("/blah"), -1, ENOENT); break; CASE_TEST(sched_yield); EXPECT_SYSZR(1, sched_yield()); break; CASE_TEST(select_null); EXPECT_SYSZR(1, ({ struct timeval tv = { 0 }; select(0, NULL, NULL, NULL, &tv); })); break; From d543d9ddf593b1f4cb1d57d9ac0ad279fe18adaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Nov 2023 23:49:52 +0100 Subject: [PATCH 0668/1562] selftests/nolibc: disable coredump via setrlimit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu-user does has its own implementation of coredumping. That implementation does not respect the call to prctl(PR_SET_DUMPABLE, 0) in run_protection(). This leads to a coredump for every test run under qemu-user. Use also setrlimit() to inhibit coredump creation which is respected by qemu-user. Link: https://lore.kernel.org/qemu-devel/20231115-qemu-user-dumpable-v1-2-edbe7f0fbb02@t-8ch.de/ Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/lkml/20231123-nolibc-rlimit-v1-3-a428b131de2a@weissschuh.net/ Acked-by: Willy Tarreau --- tools/testing/selftests/nolibc/nolibc-test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index a0271ac313ee..6ba4f8275ac4 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1166,6 +1166,7 @@ static int run_protection(int min __attribute__((unused)), { pid_t pid; int llen = 0, status; + struct rlimit rlimit = { 0, 0 }; llen += printf("0 -fstackprotector "); @@ -1197,6 +1198,7 @@ static int run_protection(int min __attribute__((unused)), close(STDERR_FILENO); prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); + setrlimit(RLIMIT_CORE, &rlimit); smash_stack(); return 1; From c3ffdfff978a089f2d678571664eecd41953253d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 20:19:03 +0100 Subject: [PATCH 0669/1562] thermal: Drop redundant and confusing device_is_registered() checks Multiple places in the thermal subsystem (most importantly, sysfs attribute callback functions) check if the given thermal zone device is still registered in order to return early in case the device_del() in thermal_zone_device_unregister() has run already. However, after thermal_zone_device_unregister() has been made wait for all of the zone-related activity to complete before returning, it is not necessary to do that any more, because all of the code holding a reference to the thermal zone device object will be waited for even if it does not do anything special to enforce this. Accordingly, drop all of the device_is_registered() checks that are now redundant and get rid of the zone locking that is not necessary any more after dropping them. Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Lukasz Luba Acked-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 9 ----- drivers/thermal/thermal_helpers.c | 5 +-- drivers/thermal/thermal_hwmon.c | 5 +-- drivers/thermal/thermal_sysfs.c | 60 +++---------------------------- 4 files changed, 7 insertions(+), 72 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 70a294d12187..2cf6caff6784 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -203,9 +203,6 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *tz, mutex_lock(&thermal_governor_lock); mutex_lock(&tz->lock); - if (!device_is_registered(&tz->device)) - goto exit; - gov = __find_governor(strim(policy)); if (!gov) goto exit; @@ -471,12 +468,6 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, return ret; } - if (!device_is_registered(&tz->device)) { - mutex_unlock(&tz->lock); - - return -ENODEV; - } - if (tz->ops->change_mode) ret = tz->ops->change_mode(tz, mode); diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index 69e8ea4aa908..d0afb623e475 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -139,10 +139,7 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp) goto unlock; } - if (device_is_registered(&tz->device)) - ret = __thermal_zone_get_temp(tz, temp); - else - ret = -ENODEV; + ret = __thermal_zone_get_temp(tz, temp); unlock: mutex_unlock(&tz->lock); diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c index c3ae44659b81..252116f1e535 100644 --- a/drivers/thermal/thermal_hwmon.c +++ b/drivers/thermal/thermal_hwmon.c @@ -80,10 +80,7 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, char *buf) mutex_lock(&tz->lock); - if (device_is_registered(&tz->device)) - ret = tz->ops->get_crit_temp(tz, &temperature); - else - ret = -ENODEV; + ret = tz->ops->get_crit_temp(tz, &temperature); mutex_unlock(&tz->lock); diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index 9e3d8fa01eea..f52af8a3b4b5 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -83,24 +83,12 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - enum thermal_trip_type type; int trip_id; if (sscanf(attr->attr.name, "trip_point_%d_type", &trip_id) != 1) return -EINVAL; - mutex_lock(&tz->lock); - - if (!device_is_registered(dev)) { - mutex_unlock(&tz->lock); - return -ENODEV; - } - - type = tz->trips[trip_id].type; - - mutex_unlock(&tz->lock); - - switch (type) { + switch (tz->trips[trip_id].type) { case THERMAL_TRIP_CRITICAL: return sprintf(buf, "critical\n"); case THERMAL_TRIP_HOT: @@ -132,11 +120,6 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, mutex_lock(&tz->lock); - if (!device_is_registered(dev)) { - ret = -ENODEV; - goto unlock; - } - trip = &tz->trips[trip_id]; if (temp != trip->temperature) { @@ -162,23 +145,12 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip_id, temp; + int trip_id; if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip_id) != 1) return -EINVAL; - mutex_lock(&tz->lock); - - if (!device_is_registered(dev)) { - mutex_unlock(&tz->lock); - return -ENODEV; - } - - temp = tz->trips[trip_id].temperature; - - mutex_unlock(&tz->lock); - - return sprintf(buf, "%d\n", temp); + return sprintf(buf, "%d\n", tz->trips[trip_id].temperature); } static ssize_t @@ -199,11 +171,6 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr, mutex_lock(&tz->lock); - if (!device_is_registered(dev)) { - ret = -ENODEV; - goto unlock; - } - trip = &tz->trips[trip_id]; if (hyst != trip->hysteresis) { @@ -229,23 +196,12 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip_id, hyst; + int trip_id; if (sscanf(attr->attr.name, "trip_point_%d_hyst", &trip_id) != 1) return -EINVAL; - mutex_lock(&tz->lock); - - if (!device_is_registered(dev)) { - mutex_unlock(&tz->lock); - return -ENODEV; - } - - hyst = tz->trips[trip_id].hysteresis; - - mutex_unlock(&tz->lock); - - return sprintf(buf, "%d\n", hyst); + return sprintf(buf, "%d\n", tz->trips[trip_id].hysteresis); } static ssize_t @@ -294,11 +250,6 @@ emul_temp_store(struct device *dev, struct device_attribute *attr, mutex_lock(&tz->lock); - if (!device_is_registered(dev)) { - ret = -ENODEV; - goto unlock; - } - if (!tz->ops->set_emul_temp) tz->emul_temperature = temperature; else @@ -307,7 +258,6 @@ emul_temp_store(struct device *dev, struct device_attribute *attr, if (!ret) __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); -unlock: mutex_unlock(&tz->lock); return ret ? ret : count; From b38aa87f67931e23ebc32c0ca00a86dfa4688719 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 20:20:00 +0100 Subject: [PATCH 0670/1562] thermal: core: Rework thermal zone availability check In order to avoid running __thermal_zone_device_update() for thermal zones going away, the thermal zone lock is held around device_del() in thermal_zone_device_unregister() and thermal_zone_device_update() passes the given thermal zone device to device_is_registered(). This allows thermal_zone_device_update() to skip the __thermal_zone_device_update() if device_del() has already run for the thermal zone at hand. However, instead of looking at driver core internals, the thermal subsystem may as well rely on its own data structures for this purpose. Namely, if the thermal zone is not present in thermal_tz_list, it can be regarded as unavailable, which in fact is already the case in thermal_zone_device_unregister(). Accordingly, the device_is_registered() check in thermal_zone_device_update() can be replaced with checking whether or not the node list_head in struct thermal_zone_device is empty, in which case it is not there in thermal_tz_list. To make this work, though, it is necessary to initialize tz->node in thermal_zone_device_register_with_trips() before registering the thermal zone device and it needs to be added to thermal_tz_list and deleted from it under its zone lock. After the above modifications, the zone lock does not need to be held around device_del() in thermal_zone_device_unregister() any more. Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Lukasz Luba Acked-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 2cf6caff6784..e5434cdbf23b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -505,11 +505,16 @@ int thermal_zone_device_is_enabled(struct thermal_zone_device *tz) return tz->mode == THERMAL_DEVICE_ENABLED; } +static bool thermal_zone_is_present(struct thermal_zone_device *tz) +{ + return !list_empty(&tz->node); +} + void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { mutex_lock(&tz->lock); - if (device_is_registered(&tz->device)) + if (thermal_zone_is_present(tz)) __thermal_zone_device_update(tz, event); mutex_unlock(&tz->lock); } @@ -1304,6 +1309,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t } INIT_LIST_HEAD(&tz->thermal_instances); + INIT_LIST_HEAD(&tz->node); ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); @@ -1369,7 +1375,9 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t } mutex_lock(&thermal_list_lock); + mutex_lock(&tz->lock); list_add_tail(&tz->node, &thermal_tz_list); + mutex_unlock(&tz->lock); mutex_unlock(&thermal_list_lock); /* Bind cooling devices for this zone */ @@ -1460,7 +1468,10 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) mutex_unlock(&thermal_list_lock); return; } + + mutex_lock(&tz->lock); list_del(&tz->node); + mutex_unlock(&tz->lock); /* Unbind all cdevs associated with 'this' thermal zone */ list_for_each_entry(cdev, &thermal_cdev_list, node) @@ -1477,9 +1488,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) ida_free(&thermal_tz_ida, tz->id); ida_destroy(&tz->ida); - mutex_lock(&tz->lock); device_del(&tz->device); - mutex_unlock(&tz->lock); kfree(tz->tzp); From e5c7bcb499840551cfbe85c6df177ebc50432bf0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 12 Dec 2023 09:12:38 +0100 Subject: [PATCH 0671/1562] spi: sh-msiof: Enforce fixed DTDL for R-Car H3 Documentation says only DTDL of 200 is allowed for this SoC. Fixes: 4286db8456f4 ("spi: sh-msiof: Add R-Car Gen 2 and 3 fallback bindings") Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Reviewed-by: Yoshihiro Shimoda Link: https://msgid.link/r/20231212081239.14254-1-wsa+renesas@sang-engineering.com Signed-off-by: Mark Brown --- drivers/spi/spi-sh-msiof.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index fb452bc78372..cfc3b1ddbd22 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -29,12 +29,15 @@ #include +#define SH_MSIOF_FLAG_FIXED_DTDL_200 BIT(0) + struct sh_msiof_chipdata { u32 bits_per_word_mask; u16 tx_fifo_size; u16 rx_fifo_size; u16 ctlr_flags; u16 min_div_pow; + u32 flags; }; struct sh_msiof_spi_priv { @@ -1072,6 +1075,16 @@ static const struct sh_msiof_chipdata rcar_gen3_data = { .min_div_pow = 1, }; +static const struct sh_msiof_chipdata rcar_r8a7795_data = { + .bits_per_word_mask = SPI_BPW_MASK(8) | SPI_BPW_MASK(16) | + SPI_BPW_MASK(24) | SPI_BPW_MASK(32), + .tx_fifo_size = 64, + .rx_fifo_size = 64, + .ctlr_flags = SPI_CONTROLLER_MUST_TX, + .min_div_pow = 1, + .flags = SH_MSIOF_FLAG_FIXED_DTDL_200, +}; + static const struct of_device_id sh_msiof_match[] __maybe_unused = { { .compatible = "renesas,sh-mobile-msiof", .data = &sh_data }, { .compatible = "renesas,msiof-r8a7743", .data = &rcar_gen2_data }, @@ -1082,6 +1095,7 @@ static const struct of_device_id sh_msiof_match[] __maybe_unused = { { .compatible = "renesas,msiof-r8a7793", .data = &rcar_gen2_data }, { .compatible = "renesas,msiof-r8a7794", .data = &rcar_gen2_data }, { .compatible = "renesas,rcar-gen2-msiof", .data = &rcar_gen2_data }, + { .compatible = "renesas,msiof-r8a7795", .data = &rcar_r8a7795_data }, { .compatible = "renesas,msiof-r8a7796", .data = &rcar_gen3_data }, { .compatible = "renesas,rcar-gen3-msiof", .data = &rcar_gen3_data }, { .compatible = "renesas,rcar-gen4-msiof", .data = &rcar_gen3_data }, @@ -1279,6 +1293,9 @@ static int sh_msiof_spi_probe(struct platform_device *pdev) return -ENXIO; } + if (chipdata->flags & SH_MSIOF_FLAG_FIXED_DTDL_200) + info->dtdl = 200; + if (info->mode == MSIOF_SPI_TARGET) ctlr = spi_alloc_target(&pdev->dev, sizeof(struct sh_msiof_spi_priv)); From c90b5c4e6554c1194d5f7cfe13dfd710a7661cab Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:13 +0200 Subject: [PATCH 0672/1562] irqchip/renesas-rzg2l: Use tabs instead of spaces Use tabs instead of spaces in definition of TINT_EXTRACT_HWIRQ() and TINT_EXTRACT_GPIOINT() macros to align with coding style requirements described in Documentation/process/coding-style.rst, "Indentation" chapter. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231120111820.87398-3-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index fe8d516f3614..cc42cbd05762 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -53,8 +53,8 @@ #define IITSR_IITSEL_EDGE_BOTH 3 #define IITSR_IITSEL_MASK(n) IITSR_IITSEL((n), 3) -#define TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) -#define TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) +#define TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) +#define TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) struct rzg2l_irqc_priv { void __iomem *base; From 02f6507640173addeeb3af035d2c6f0b3cff1567 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:14 +0200 Subject: [PATCH 0673/1562] irqchip/renesas-rzg2l: Align struct member names to tabs Align struct member names to tabs to follow the requirements from maintainer-tip file. 3 tabs were used at the moment as the next commits will add a new member which requires 3 tabs for a better view. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231120111820.87398-4-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index cc42cbd05762..90971ab06f0c 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -57,9 +57,9 @@ #define TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) struct rzg2l_irqc_priv { - void __iomem *base; - struct irq_fwspec fwspec[IRQC_NUM_IRQ]; - raw_spinlock_t lock; + void __iomem *base; + struct irq_fwspec fwspec[IRQC_NUM_IRQ]; + raw_spinlock_t lock; }; static struct rzg2l_irqc_priv *irq_data_to_priv(struct irq_data *data) From b94f455372ad6e6b4da8e8ed9864d9c7daaf54b8 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:15 +0200 Subject: [PATCH 0674/1562] irqchip/renesas-rzg2l: Document structure members Document structure members to follow the requirements specified in maintainer-tip, section 4.3.7. Struct declarations and initializers. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231120111820.87398-5-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 90971ab06f0c..0a77927b678b 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -56,6 +56,12 @@ #define TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) #define TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) +/** + * struct rzg2l_irqc_priv - IRQ controller private data structure + * @base: Controller's base address + * @fwspec: IRQ firmware specific data + * @lock: Lock to serialize access to hardware registers + */ struct rzg2l_irqc_priv { void __iomem *base; struct irq_fwspec fwspec[IRQC_NUM_IRQ]; From ef88eefb1a81a8701eabb7d5ced761a66a465a49 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:16 +0200 Subject: [PATCH 0675/1562] irqchip/renesas-rzg2l: Implement restriction when writing ISCR register The RZ/G2L manual (chapter "IRQ Status Control Register (ISCR)") describes the operation to clear interrupts through the ISCR register as follows: [Write operation] When "Falling-edge detection", "Rising-edge detection" or "Falling/Rising-edge detection" is set in IITSR: - In case ISTAT is 1 0: IRQn interrupt detection status is cleared. 1: Invalid to write. - In case ISTAT is 0 Invalid to write. When "Low-level detection" is set in IITSR.: Invalid to write. Take the interrupt type into account when clearing interrupts through the ISCR register to avoid writing the ISCR when the interrupt type is level. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231120111820.87398-6-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 0a77927b678b..d450417948e4 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -78,11 +78,17 @@ static void rzg2l_irq_eoi(struct irq_data *d) unsigned int hw_irq = irqd_to_hwirq(d) - IRQC_IRQ_START; struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); u32 bit = BIT(hw_irq); - u32 reg; + u32 iitsr, iscr; - reg = readl_relaxed(priv->base + ISCR); - if (reg & bit) - writel_relaxed(reg & ~bit, priv->base + ISCR); + iscr = readl_relaxed(priv->base + ISCR); + iitsr = readl_relaxed(priv->base + IITSR); + + /* + * ISCR can only be cleared if the type is falling-edge, rising-edge or + * falling/rising-edge. + */ + if ((iscr & bit) && (iitsr & IITSR_IITSEL_MASK(hw_irq))) + writel_relaxed(iscr & ~bit, priv->base + ISCR); } static void rzg2l_tint_eoi(struct irq_data *d) From 2eca4731cc66563b3919d8753dbd74d18c39f662 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:17 +0200 Subject: [PATCH 0676/1562] irqchip/renesas-rzg2l: Add macro to retrieve TITSR register offset based on register's index There are 2 TITSR registers available on the IA55 interrupt controller. Add a macro that retrieves the TITSR register offset based on it's index. This macro is useful in when adding suspend/resume support so both TITSR registers can be accessed in a for loop. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231120111820.87398-7-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index d450417948e4..34add75080e0 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -28,8 +28,7 @@ #define ISCR 0x10 #define IITSR 0x14 #define TSCR 0x20 -#define TITSR0 0x24 -#define TITSR1 0x28 +#define TITSR(n) (0x24 + (n) * 4) #define TITSR0_MAX_INT 16 #define TITSEL_WIDTH 0x2 #define TSSR(n) (0x30 + ((n) * 4)) @@ -200,8 +199,7 @@ static int rzg2l_tint_set_edge(struct irq_data *d, unsigned int type) struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); unsigned int hwirq = irqd_to_hwirq(d); u32 titseln = hwirq - IRQC_TINT_START; - u32 offset; - u8 sense; + u8 index, sense; u32 reg; switch (type & IRQ_TYPE_SENSE_MASK) { @@ -217,17 +215,17 @@ static int rzg2l_tint_set_edge(struct irq_data *d, unsigned int type) return -EINVAL; } - offset = TITSR0; + index = 0; if (titseln >= TITSR0_MAX_INT) { titseln -= TITSR0_MAX_INT; - offset = TITSR1; + index = 1; } raw_spin_lock(&priv->lock); - reg = readl_relaxed(priv->base + offset); + reg = readl_relaxed(priv->base + TITSR(index)); reg &= ~(IRQ_MASK << (titseln * TITSEL_WIDTH)); reg |= sense << (titseln * TITSEL_WIDTH); - writel_relaxed(reg, priv->base + offset); + writel_relaxed(reg, priv->base + TITSR(index)); raw_spin_unlock(&priv->lock); return 0; From 74d2ef5f6f4b2437e6292ab2502400e8048db4aa Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:18 +0200 Subject: [PATCH 0677/1562] irqchip/renesas-rzg2l: Add support for suspend to RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The irqchip-renesas-rzg2l driver is used on RZ/G3S SoC. RZ/G3S can go into deep sleep states where power to different SoC's parts is cut off and RAM is switched to self-refresh. The resume from these states is done with the help of the bootloader. The IA55 IRQ controller needs to be reconfigured when resuming from deep sleep state. For this the IA55 registers are cached in suspend and restored in resume. The IA55 IRQ controller is connected to GPIO controller and GIC as follows: ┌──────────┐ ┌──────────┐ │ │ SPIX │ │ │ ├─────────►│ │ │ │ │ │ │ │ │ │ ┌────────┐IRQ0-7 │ IA55 │ │ GIC │ Pin0 ───────►│ ├─────────────►│ │ │ │ │ │ │ │ PPIY │ │ ... │ GPIO │ │ ├─────────►│ │ │ │GPIOINT0-127 │ │ │ │ PinN ───────►│ ├─────────────►│ │ │ │ └────────┘ └──────────┘ └──────────┘ where: - Pin0 is the first GPIO controller pin - PinN is the last GPIO controller pin - SPIX is the SPI interrupt with identifier X - PPIY is the PPI interrupt with identifier Y Implement suspend/resume functionality with syscore_ops to be able to cache/restore the registers after/before the GPIO controller suspend/resume functions are invoked. As the syscore_ops suspend/resume functions do not take any argument make the driver private data static so it can be accessed from the suspend/resume functions. The IA55 interrupt controller is resumed before the GPIO controller. As GPIO pins could be in an a state which causes spurious interrupts, the reconfiguration of the interrupt controller is restricted to restore the interrupt type and leave them disabled. An eventually required interrupt enable operation will be done as part of the GPIO controller resume function after restoring the GPIO state. [ tglx: Massaged changelog ] Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231120111820.87398-8-claudiu.beznea.uj@bp.renesas.com --- drivers/irqchip/irq-renesas-rzg2l.c | 68 ++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 34add75080e0..9494fc26259c 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -18,6 +18,7 @@ #include #include #include +#include #define IRQC_IRQ_START 1 #define IRQC_IRQ_COUNT 8 @@ -55,17 +56,29 @@ #define TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) #define TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) +/** + * struct rzg2l_irqc_reg_cache - registers cache (necessary for suspend/resume) + * @iitsr: IITSR register + * @titsr: TITSR registers + */ +struct rzg2l_irqc_reg_cache { + u32 iitsr; + u32 titsr[2]; +}; + /** * struct rzg2l_irqc_priv - IRQ controller private data structure * @base: Controller's base address * @fwspec: IRQ firmware specific data * @lock: Lock to serialize access to hardware registers + * @cache: Registers cache for suspend/resume */ -struct rzg2l_irqc_priv { +static struct rzg2l_irqc_priv { void __iomem *base; struct irq_fwspec fwspec[IRQC_NUM_IRQ]; raw_spinlock_t lock; -}; + struct rzg2l_irqc_reg_cache cache; +} *rzg2l_irqc_data; static struct rzg2l_irqc_priv *irq_data_to_priv(struct irq_data *data) { @@ -246,6 +259,38 @@ static int rzg2l_irqc_set_type(struct irq_data *d, unsigned int type) return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); } +static int rzg2l_irqc_irq_suspend(void) +{ + struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; + void __iomem *base = rzg2l_irqc_data->base; + + cache->iitsr = readl_relaxed(base + IITSR); + for (u8 i = 0; i < 2; i++) + cache->titsr[i] = readl_relaxed(base + TITSR(i)); + + return 0; +} + +static void rzg2l_irqc_irq_resume(void) +{ + struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; + void __iomem *base = rzg2l_irqc_data->base; + + /* + * Restore only interrupt type. TSSRx will be restored at the + * request of pin controller to avoid spurious interrupts due + * to invalid PIN states. + */ + for (u8 i = 0; i < 2; i++) + writel_relaxed(cache->titsr[i], base + TITSR(i)); + writel_relaxed(cache->iitsr, base + IITSR); +} + +static struct syscore_ops rzg2l_irqc_syscore_ops = { + .suspend = rzg2l_irqc_irq_suspend, + .resume = rzg2l_irqc_irq_resume, +}; + static const struct irq_chip irqc_chip = { .name = "rzg2l-irqc", .irq_eoi = rzg2l_irqc_eoi, @@ -331,7 +376,6 @@ static int rzg2l_irqc_init(struct device_node *node, struct device_node *parent) struct irq_domain *irq_domain, *parent_domain; struct platform_device *pdev; struct reset_control *resetn; - struct rzg2l_irqc_priv *priv; int ret; pdev = of_find_device_by_node(node); @@ -344,15 +388,15 @@ static int rzg2l_irqc_init(struct device_node *node, struct device_node *parent) return -ENODEV; } - priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); - if (!priv) + rzg2l_irqc_data = devm_kzalloc(&pdev->dev, sizeof(*rzg2l_irqc_data), GFP_KERNEL); + if (!rzg2l_irqc_data) return -ENOMEM; - priv->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); - if (IS_ERR(priv->base)) - return PTR_ERR(priv->base); + rzg2l_irqc_data->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); + if (IS_ERR(rzg2l_irqc_data->base)) + return PTR_ERR(rzg2l_irqc_data->base); - ret = rzg2l_irqc_parse_interrupts(priv, node); + ret = rzg2l_irqc_parse_interrupts(rzg2l_irqc_data, node); if (ret) { dev_err(&pdev->dev, "cannot parse interrupts: %d\n", ret); return ret; @@ -375,17 +419,19 @@ static int rzg2l_irqc_init(struct device_node *node, struct device_node *parent) goto pm_disable; } - raw_spin_lock_init(&priv->lock); + raw_spin_lock_init(&rzg2l_irqc_data->lock); irq_domain = irq_domain_add_hierarchy(parent_domain, 0, IRQC_NUM_IRQ, node, &rzg2l_irqc_domain_ops, - priv); + rzg2l_irqc_data); if (!irq_domain) { dev_err(&pdev->dev, "failed to add irq domain\n"); ret = -ENOMEM; goto pm_put; } + register_syscore_ops(&rzg2l_irqc_syscore_ops); + return 0; pm_put: From 1cf0697a24ef60b3ce8be47090a6e8e79329d962 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 20 Nov 2023 13:18:19 +0200 Subject: [PATCH 0678/1562] dt-bindings: interrupt-controller: renesas,rzg2l-irqc: Document RZ/G3S Document the RZ/G3S (R9108G045) interrupt controller. This has few extra functionalities compared with RZ/G2UL but the already existing driver can still be used. Signed-off-by: Claudiu Beznea Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231120111820.87398-9-claudiu.beznea.uj@bp.renesas.com --- .../bindings/interrupt-controller/renesas,rzg2l-irqc.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzg2l-irqc.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzg2l-irqc.yaml index 2ef3081eaaf3..d3b5aec0a3f7 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzg2l-irqc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzg2l-irqc.yaml @@ -26,6 +26,7 @@ properties: - renesas,r9a07g043u-irqc # RZ/G2UL - renesas,r9a07g044-irqc # RZ/G2{L,LC} - renesas,r9a07g054-irqc # RZ/V2L + - renesas,r9a08g045-irqc # RZ/G3S - const: renesas,rzg2l-irqc '#interrupt-cells': @@ -167,7 +168,9 @@ allOf: properties: compatible: contains: - const: renesas,r9a07g043u-irqc + enum: + - renesas,r9a07g043u-irqc + - renesas,r9a08g045-irqc then: properties: interrupts: From ca596295f4c9ec803d3379635ad175897993f121 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 27 Nov 2023 16:52:12 +0100 Subject: [PATCH 0679/1562] dt-bindings: interrupt-controller: mpm: Pass MSG RAM slice through phandle Due to the wild nature of the Qualcomm RPM Message RAM, the kernel can't really use 'reg' to point to the MPM's slice of Message RAM without cutting into an already-defined RPM MSG RAM node used for GLINK and SMEM. Document passing the register space as a slice of SRAM through the qcom,rpm-msg-ram property. This also makes 'reg' deprecated. Signed-off-by: Konrad Dybcio Signed-off-by: Thomas Gleixner Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230328-topic-msgram_mpm-v7-1-6ee2bfeaac2c@linaro.org --- .../interrupt-controller/qcom,mpm.yaml | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/qcom,mpm.yaml b/Documentation/devicetree/bindings/interrupt-controller/qcom,mpm.yaml index 509d20c091af..4ce7912d8047 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/qcom,mpm.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/qcom,mpm.yaml @@ -29,6 +29,12 @@ properties: maxItems: 1 description: Specifies the base address and size of vMPM registers in RPM MSG RAM. + deprecated: true + + qcom,rpm-msg-ram: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Phandle to the APSS MPM slice of the RPM Message RAM interrupts: maxItems: 1 @@ -64,33 +70,45 @@ properties: required: - compatible - - reg - interrupts - mboxes - interrupt-controller - '#interrupt-cells' - qcom,mpm-pin-count - qcom,mpm-pin-map + - qcom,rpm-msg-ram additionalProperties: false examples: - | #include - mpm: interrupt-controller@45f01b8 { - compatible = "qcom,mpm"; - interrupts = ; - reg = <0x45f01b8 0x1000>; - mboxes = <&apcs_glb 1>; - interrupt-controller; - #interrupt-cells = <2>; - interrupt-parent = <&intc>; - qcom,mpm-pin-count = <96>; - qcom,mpm-pin-map = <2 275>, - <5 296>, - <12 422>, - <24 79>, - <86 183>, - <90 260>, - <91 260>; + + remoteproc-rpm { + compatible = "qcom,msm8998-rpm-proc", "qcom,rpm-proc"; + + glink-edge { + compatible = "qcom,glink-rpm"; + + interrupts = ; + qcom,rpm-msg-ram = <&rpm_msg_ram>; + mboxes = <&apcs_glb 0>; + }; + + mpm: interrupt-controller { + compatible = "qcom,mpm"; + qcom,rpm-msg-ram = <&apss_mpm>; + interrupts = ; + mboxes = <&apcs_glb 1>; + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&intc>; + qcom,mpm-pin-count = <96>; + qcom,mpm-pin-map = <2 275>, + <5 296>, + <12 422>, + <24 79>, + <86 183>, + <91 260>; + }; }; From 221b110d87c2d3ea113ad784b2c6505726a3e157 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 27 Nov 2023 16:52:13 +0100 Subject: [PATCH 0680/1562] irqchip/qcom-mpm: Support passing a slice of SRAM as reg space The MPM hardware is accessible from the ARM CPUs through a shared memory region (RPM MSG RAM) which is also concurrently accessed by other kinds of cores on the system like modem, ADSP etc. Modeling this relation in a (somewhat) sane manner in the device tree requires to - either present the MPM as a child of said memory region, which makes little sense, as a mapped memory carveout is not a bus. - define nodes which bleed their register spaces into one another - or passing their slice of the MSG RAM through a property Go with the third option and add a way to map a region passed through the "qcom,rpm-msg-ram" property as register space for the MPM interrupt controller. The current way of using 'reg' is preserved for backwards compatibility reasons. [ tglx: Massaged changelog ] Signed-off-by: Konrad Dybcio Signed-off-by: Thomas Gleixner Reviewed-by: Bryan O'Donoghue Acked-by: Shawn Guo Link: https://lore.kernel.org/r/20230328-topic-msgram_mpm-v7-2-6ee2bfeaac2c@linaro.org --- drivers/irqchip/irq-qcom-mpm.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c index 7124565234a5..cda5838d2232 100644 --- a/drivers/irqchip/irq-qcom-mpm.c +++ b/drivers/irqchip/irq-qcom-mpm.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -322,8 +323,10 @@ static int qcom_mpm_init(struct device_node *np, struct device_node *parent) struct device *dev = &pdev->dev; struct irq_domain *parent_domain; struct generic_pm_domain *genpd; + struct device_node *msgram_np; struct qcom_mpm_priv *priv; unsigned int pin_cnt; + struct resource res; int i, irq; int ret; @@ -374,9 +377,26 @@ static int qcom_mpm_init(struct device_node *np, struct device_node *parent) raw_spin_lock_init(&priv->lock); - priv->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(priv->base)) - return PTR_ERR(priv->base); + /* If we have a handle to an RPM message ram partition, use it. */ + msgram_np = of_parse_phandle(np, "qcom,rpm-msg-ram", 0); + if (msgram_np) { + ret = of_address_to_resource(msgram_np, 0, &res); + if (ret) { + of_node_put(msgram_np); + return ret; + } + + /* Don't use devm_ioremap_resource, as we're accessing a shared region. */ + priv->base = devm_ioremap(dev, res.start, resource_size(&res)); + of_node_put(msgram_np); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + } else { + /* Otherwise, fall back to simple MMIO. */ + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + } for (i = 0; i < priv->reg_stride; i++) { qcom_mpm_write(priv, MPM_REG_ENABLE, i, 0); From 69ffab9b9e698248cbb4042e47f82afb00dc1bb4 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 8 Dec 2023 08:38:57 -0800 Subject: [PATCH 0681/1562] irqchip/irq-xtensa-pic: Clean up - get rid of the cached_irq_mask variable - use BIT() macro instead of bit shifts - drop .disable and .enable as they are equivalent to the default implementations Signed-off-by: Max Filippov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231208163857.82644-1-jcmvbkbc@gmail.com --- drivers/irqchip/irq-xtensa-pic.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/irqchip/irq-xtensa-pic.c b/drivers/irqchip/irq-xtensa-pic.c index 0c18d1f1e264..f9d6fce4da33 100644 --- a/drivers/irqchip/irq-xtensa-pic.c +++ b/drivers/irqchip/irq-xtensa-pic.c @@ -12,6 +12,7 @@ * Kevin Chea */ +#include #include #include #include @@ -19,8 +20,6 @@ #include #include -unsigned int cached_irq_mask; - /* * Device Tree IRQ specifier translation function which works with one or * two cell bindings. First cell value maps directly to the hwirq number. @@ -44,34 +43,30 @@ static const struct irq_domain_ops xtensa_irq_domain_ops = { static void xtensa_irq_mask(struct irq_data *d) { - cached_irq_mask &= ~(1 << d->hwirq); - xtensa_set_sr(cached_irq_mask, intenable); + u32 irq_mask; + + irq_mask = xtensa_get_sr(intenable); + irq_mask &= ~BIT(d->hwirq); + xtensa_set_sr(irq_mask, intenable); } static void xtensa_irq_unmask(struct irq_data *d) { - cached_irq_mask |= 1 << d->hwirq; - xtensa_set_sr(cached_irq_mask, intenable); -} + u32 irq_mask; -static void xtensa_irq_enable(struct irq_data *d) -{ - xtensa_irq_unmask(d); -} - -static void xtensa_irq_disable(struct irq_data *d) -{ - xtensa_irq_mask(d); + irq_mask = xtensa_get_sr(intenable); + irq_mask |= BIT(d->hwirq); + xtensa_set_sr(irq_mask, intenable); } static void xtensa_irq_ack(struct irq_data *d) { - xtensa_set_sr(1 << d->hwirq, intclear); + xtensa_set_sr(BIT(d->hwirq), intclear); } static int xtensa_irq_retrigger(struct irq_data *d) { - unsigned int mask = 1u << d->hwirq; + unsigned int mask = BIT(d->hwirq); if (WARN_ON(mask & ~XCHAL_INTTYPE_MASK_SOFTWARE)) return 0; @@ -81,8 +76,6 @@ static int xtensa_irq_retrigger(struct irq_data *d) static struct irq_chip xtensa_irq_chip = { .name = "xtensa", - .irq_enable = xtensa_irq_enable, - .irq_disable = xtensa_irq_disable, .irq_mask = xtensa_irq_mask, .irq_unmask = xtensa_irq_unmask, .irq_ack = xtensa_irq_ack, From 37a8ab24d3d4c465b070bd704e2ad2fa277df9d7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:18 -0400 Subject: [PATCH 0682/1562] maple_tree: remove unnecessary default labels from switch statements Patch series "maple_tree: iterator state changes". These patches have some general cleanup and a change to separate the maple state status tracking from the maple state node. The maple state status change allows for walks to continue from previous places when the status needs to be recorded to make logical sense for the next call to the maple state. For instance, it allows for prev/next to function in a way that better resembles the linked list. It also allows switch statements to be used to detect missed states during compile, and the addition of fast-path "active" state is cleaner as an enum. While making the status change, perf showed some very small (one line) functions that were not inlined even with the inline key word. Making these small functions __always_inline is less expensive according to perf. As part of that change, some inlines have been dropped from larger functions. Perf also showed that the commonly used mas_for_each() iterator was spending a lot of time finding the end of the node. This series introduces caching of the end of the node in the maple state (and updating it during writes). This caching along with the inline changes yielded at 23.25% improvement on the BENCH_MAS_FOR_EACH maple tree test framework benchmark. I've also included a change to mtree_range_walk and mtree_lookup_walk to take advantage of Peng's change [1] to the initial pivot setup. mmtests did not produce any significant gains. [1] https://lore.kernel.org/all/20230711035444.526-1-zhangpeng.00@bytedance.com/T/#u This patch (of 12): Removing the default types from the switch statements will cause compile warnings on missing cases. Link: https://lkml.kernel.org/r/20231101171629.3612299-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Andrew Morton Signed-off-by: Andrew Morton --- lib/maple_tree.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4439469442c7..fff94a510fa8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -771,7 +771,6 @@ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, BUG_ON(piv >= mt_pivots[type]); switch (type) { - default: case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; @@ -795,7 +794,6 @@ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { - default: case maple_arange_64: return mn->ma64.slot; case maple_range_64: @@ -804,6 +802,8 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) case maple_dense: return mn->slot; } + + return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) @@ -7002,7 +7002,6 @@ static void mt_dump_range(unsigned long min, unsigned long max, else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; - default: case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); @@ -7042,7 +7041,6 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, case mt_dump_hex: pr_cont("%p %lX ", node->slot[i], node->pivot[i]); break; - default: case mt_dump_dec: pr_cont("%p %lu ", node->slot[i], node->pivot[i]); } @@ -7072,7 +7070,6 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; - default: case mt_dump_dec: pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); @@ -7097,7 +7094,6 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; - default: case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } @@ -7108,7 +7104,6 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, case mt_dump_hex: pr_cont("%p %lX ", node->slot[i], node->pivot[i]); break; - default: case mt_dump_dec: pr_cont("%p %lu ", node->slot[i], node->pivot[i]); } From f7a59018953910032231c0a019208c4b0a4a8bc3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:19 -0400 Subject: [PATCH 0683/1562] maple_tree: make mas_erase() more robust mas_erase() may not deal correctly with all maple states. Make the function more robust by ensuring the state is in one of the two acceptable states. Link: https://lkml.kernel.org/r/20231101171629.3612299-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fff94a510fa8..8d379d34ea0a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6173,7 +6173,7 @@ void *mas_erase(struct ma_state *mas) void *entry; MA_WR_STATE(wr_mas, mas, NULL); - if (mas_is_none(mas) || mas_is_paused(mas)) + if (!mas_is_active(mas) || !mas_is_start(mas)) mas->node = MAS_START; /* Retry unnecessary when holding the write lock. */ From bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:20 -0400 Subject: [PATCH 0684/1562] maple_tree: move debug check to __mas_set_range() __mas_set_range() was created to shortcut resetting the maple state and a debug check was added to the caller (the vma iterator) to ensure the internal maple state remains safe to use. Move the debug check from the vma iterator into the maple tree itself so other users do not incorrectly use the advanced maple state modification. Fallout from this change include a large amount of debug setup needed to be moved to earlier in the header, and the maple_tree.h radix-tree test code needed to move the inclusion of the header to after the atomic define. None of those changes have functional changes. Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 315 ++++++++++---------- mm/internal.h | 2 - tools/testing/radix-tree/linux/maple_tree.h | 2 +- 3 files changed, 160 insertions(+), 159 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a452dd8a1e5c..b5d5992578c9 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -557,162 +557,6 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) -/** - * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the - * current location. - * @mas: Maple Tree operation state. - * @start: New start of range in the Maple Tree. - * @last: New end of range in the Maple Tree. - * - * set the internal maple state values to a sub-range. - * Please use mas_set_range() if you do not know where you are in the tree. - */ -static inline void __mas_set_range(struct ma_state *mas, unsigned long start, - unsigned long last) -{ - mas->index = start; - mas->last = last; -} - -/** - * mas_set_range() - Set up Maple Tree operation state for a different index. - * @mas: Maple Tree operation state. - * @start: New start of range in the Maple Tree. - * @last: New end of range in the Maple Tree. - * - * Move the operation state to refer to a different range. This will - * have the effect of starting a walk from the top; see mas_next() - * to move to an adjacent index. - */ -static inline -void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) -{ - __mas_set_range(mas, start, last); - mas->node = MAS_START; -} - -/** - * mas_set() - Set up Maple Tree operation state for a different index. - * @mas: Maple Tree operation state. - * @index: New index into the Maple Tree. - * - * Move the operation state to refer to a different index. This will - * have the effect of starting a walk from the top; see mas_next() - * to move to an adjacent index. - */ -static inline void mas_set(struct ma_state *mas, unsigned long index) -{ - - mas_set_range(mas, index, index); -} - -static inline bool mt_external_lock(const struct maple_tree *mt) -{ - return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; -} - -/** - * mt_init_flags() - Initialise an empty maple tree with flags. - * @mt: Maple Tree - * @flags: maple tree flags. - * - * If you need to initialise a Maple Tree with special flags (eg, an - * allocation tree), use this function. - * - * Context: Any context. - */ -static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) -{ - mt->ma_flags = flags; - if (!mt_external_lock(mt)) - spin_lock_init(&mt->ma_lock); - rcu_assign_pointer(mt->ma_root, NULL); -} - -/** - * mt_init() - Initialise an empty maple tree. - * @mt: Maple Tree - * - * An empty Maple Tree. - * - * Context: Any context. - */ -static inline void mt_init(struct maple_tree *mt) -{ - mt_init_flags(mt, 0); -} - -static inline bool mt_in_rcu(struct maple_tree *mt) -{ -#ifdef CONFIG_MAPLE_RCU_DISABLED - return false; -#endif - return mt->ma_flags & MT_FLAGS_USE_RCU; -} - -/** - * mt_clear_in_rcu() - Switch the tree to non-RCU mode. - * @mt: The Maple Tree - */ -static inline void mt_clear_in_rcu(struct maple_tree *mt) -{ - if (!mt_in_rcu(mt)) - return; - - if (mt_external_lock(mt)) { - WARN_ON(!mt_lock_is_held(mt)); - mt->ma_flags &= ~MT_FLAGS_USE_RCU; - } else { - mtree_lock(mt); - mt->ma_flags &= ~MT_FLAGS_USE_RCU; - mtree_unlock(mt); - } -} - -/** - * mt_set_in_rcu() - Switch the tree to RCU safe mode. - * @mt: The Maple Tree - */ -static inline void mt_set_in_rcu(struct maple_tree *mt) -{ - if (mt_in_rcu(mt)) - return; - - if (mt_external_lock(mt)) { - WARN_ON(!mt_lock_is_held(mt)); - mt->ma_flags |= MT_FLAGS_USE_RCU; - } else { - mtree_lock(mt); - mt->ma_flags |= MT_FLAGS_USE_RCU; - mtree_unlock(mt); - } -} - -static inline unsigned int mt_height(const struct maple_tree *mt) -{ - return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; -} - -void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); -void *mt_find_after(struct maple_tree *mt, unsigned long *index, - unsigned long max); -void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); -void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); - -/** - * mt_for_each - Iterate over each entry starting at index until max. - * @__tree: The Maple Tree - * @__entry: The current entry - * @__index: The index to start the search from. Subsequently used as iterator. - * @__max: The maximum limit for @index - * - * This iterator skips all entries, which resolve to a NULL pointer, - * e.g. entries which has been reserved with XA_ZERO_ENTRY. - */ -#define mt_for_each(__tree, __entry, __index, __max) \ - for (__entry = mt_find(__tree, &(__index), __max); \ - __entry; __entry = mt_find_after(__tree, &(__index), __max)) - #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { @@ -838,4 +682,163 @@ void mt_cache_shrink(void); #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. + */ +static inline void __mas_set_range(struct ma_state *mas, unsigned long start, + unsigned long last) +{ + /* Ensure the range starts within the current slot */ + MAS_WARN_ON(mas, mas_is_active(mas) && + (mas->index > start || mas->last < start)); + mas->index = start; + mas->last = last; +} + +/** + * mas_set_range() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * Move the operation state to refer to a different range. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline +void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) +{ + mas->node = MAS_START; + __mas_set_range(mas, start, last); +} + +/** + * mas_set() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @index: New index into the Maple Tree. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline void mas_set(struct ma_state *mas, unsigned long index) +{ + + mas_set_range(mas, index, index); +} + +static inline bool mt_external_lock(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; +} + +/** + * mt_init_flags() - Initialise an empty maple tree with flags. + * @mt: Maple Tree + * @flags: maple tree flags. + * + * If you need to initialise a Maple Tree with special flags (eg, an + * allocation tree), use this function. + * + * Context: Any context. + */ +static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt->ma_flags = flags; + if (!mt_external_lock(mt)) + spin_lock_init(&mt->ma_lock); + rcu_assign_pointer(mt->ma_root, NULL); +} + +/** + * mt_init() - Initialise an empty maple tree. + * @mt: Maple Tree + * + * An empty Maple Tree. + * + * Context: Any context. + */ +static inline void mt_init(struct maple_tree *mt) +{ + mt_init_flags(mt, 0); +} + +static inline bool mt_in_rcu(struct maple_tree *mt) +{ +#ifdef CONFIG_MAPLE_RCU_DISABLED + return false; +#endif + return mt->ma_flags & MT_FLAGS_USE_RCU; +} + +/** + * mt_clear_in_rcu() - Switch the tree to non-RCU mode. + * @mt: The Maple Tree + */ +static inline void mt_clear_in_rcu(struct maple_tree *mt) +{ + if (!mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + WARN_ON(!mt_lock_is_held(mt)); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +/** + * mt_set_in_rcu() - Switch the tree to RCU safe mode. + * @mt: The Maple Tree + */ +static inline void mt_set_in_rcu(struct maple_tree *mt) +{ + if (mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + WARN_ON(!mt_lock_is_held(mt)); + mt->ma_flags |= MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags |= MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +static inline unsigned int mt_height(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; +} + +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max); +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); + +/** + * mt_for_each - Iterate over each entry starting at index until max. + * @__tree: The Maple Tree + * @__entry: The current entry + * @__index: The index to start the search from. Subsequently used as iterator. + * @__max: The maximum limit for @index + * + * This iterator skips all entries, which resolve to a NULL pointer, + * e.g. entries which has been reserved with XA_ZERO_ENTRY. + */ +#define mt_for_each(__tree, __entry, __index, __max) \ + for (__entry = mt_find(__tree, &(__index), __max); \ + __entry; __entry = mt_find_after(__tree, &(__index), __max)) + #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/mm/internal.h b/mm/internal.h index 2bc9ff8db393..0005b8adbd5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1135,8 +1135,6 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { - MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START && - (vmi->mas.index > index || vmi->mas.last < index)); __mas_set_range(&vmi->mas, index, last - 1); } diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h index 7d8d1f445b89..06c89bdcc515 100644 --- a/tools/testing/radix-tree/linux/maple_tree.h +++ b/tools/testing/radix-tree/linux/maple_tree.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ */ #define atomic_t int32_t -#include "../../../../include/linux/maple_tree.h" #define atomic_inc(x) uatomic_inc(x) #define atomic_read(x) uatomic_read(x) #define atomic_set(x, y) do {} while (0) #define U8_MAX UCHAR_MAX +#include "../../../../include/linux/maple_tree.h" From 31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:21 -0400 Subject: [PATCH 0685/1562] maple_tree: add end of node tracking to the maple state Analysis of the mas_for_each() iteration showed that there is a significant time spent finding the end of a node. This time can be greatly reduced if the end of the node is cached in the maple state. Care must be taken to update & invalidate as necessary. Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 7 +++++++ tools/testing/radix-tree/maple.c | 1 + 3 files changed, 9 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b5d5992578c9..0b82efe0cf1e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -393,6 +393,7 @@ struct ma_state { unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; + unsigned char end; /* The end of the node */ }; struct ma_wr_state { diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8d379d34ea0a..ea0a36341fed 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2841,6 +2841,7 @@ next: goto dead_node; } while (!ma_is_leaf(type)); + mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; @@ -3507,6 +3508,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); + wr_mas->mas->end = b_end; return 1; } @@ -4010,6 +4012,7 @@ done: } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); + mas->end = new_end; return true; } @@ -4190,6 +4193,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); + mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return true; } @@ -4428,6 +4432,7 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) if (unlikely(mte_dead_node(mas->node))) return 1; + mas->end = mas->offset; return 0; no_entry: @@ -5074,6 +5079,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); @@ -5134,6 +5140,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->last = max; mas->index = mas->last - size + 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cb5358674521..7095fb0ec026 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -945,6 +945,7 @@ retry: goto retry; } + mas->end = mas_data_end(mas); return ret; not_found: From e9c52d8940cbfd94b36035bbebce7f55954e7728 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:22 -0400 Subject: [PATCH 0686/1562] maple_tree: use cached node end in mas_next() When looking for the next entry, don't recalculate the node end as it is now tracked in the maple state. Link: https://lkml.kernel.org/r/20231101171629.3612299-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ea0a36341fed..1e617be77dfb 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4539,6 +4539,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long min; unsigned long *pivots; struct maple_enode *enode; + struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; @@ -4591,6 +4592,10 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); + tmp = mte_to_node(enode); + mt = mte_node_type(enode); + pivots = ma_pivots(tmp, mt); + mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; @@ -4625,7 +4630,6 @@ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty, unsigned long pivot; enum maple_type type; struct maple_node *node; - unsigned char data_end; unsigned long save_point = mas->last; void *entry; @@ -4633,12 +4637,11 @@ retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); - data_end = ma_data_end(node, type, pivots, mas->max); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { - if (likely(mas->offset < data_end)) + if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else goto overflow; @@ -4650,11 +4653,11 @@ retry: goto overflow; } - if (likely(mas->offset < data_end)) { + if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; - if (likely(mas->offset < data_end)) + if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; @@ -4691,7 +4694,6 @@ again: goto overflow; mas->index = mas->last + 1; - /* Node cannot end on NULL, so it's safe to short-cut here */ goto again; } From 1f41ef12abf8538b3d82cdae14c06aa171cb71ce Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:23 -0400 Subject: [PATCH 0687/1562] maple_tree: use cached node end in mas_destroy() The node end is set during the walk, so use the resulting end instead of re-fetching it. Link: https://lkml.kernel.org/r/20231101171629.3612299-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1e617be77dfb..216f54177151 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5576,7 +5576,7 @@ void mas_destroy(struct ma_state *mas) mas_start(mas); mtree_range_walk(mas); - end = mas_data_end(mas) + 1; + end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); From 271f61a8b41dcd86e1ecc2e0455bcc071bc7dde4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:24 -0400 Subject: [PATCH 0688/1562] maple_tree: clean up inlines for some functions There are a few functions which were inlined but are somewhat too large to inline, so remove the inline key word. There are also several very small functions which are used in critical code sections which gcc was not inlining, so make this more strict and use __always_line for these functions. Link: https://lkml.kernel.org/r/20231101171629.3612299-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 78 ++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 216f54177151..f0d2aea91351 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -217,23 +217,24 @@ static inline unsigned int mt_attr(struct maple_tree *mt) return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } -static inline enum maple_type mte_node_type(const struct maple_enode *entry) +static __always_inline enum maple_type mte_node_type( + const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } -static inline bool ma_is_dense(const enum maple_type type) +static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } -static inline bool ma_is_leaf(const enum maple_type type) +static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } -static inline bool mte_is_leaf(const struct maple_enode *entry) +static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } @@ -242,7 +243,7 @@ static inline bool mte_is_leaf(const struct maple_enode *entry) * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ -static inline bool mt_is_reserved(const void *entry) +static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); @@ -295,7 +296,8 @@ static inline bool mas_searchable(struct ma_state *mas) return true; } -static inline struct maple_node *mte_to_node(const struct maple_enode *entry) +static __always_inline struct maple_node *mte_to_node( + const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } @@ -372,12 +374,12 @@ static inline bool mte_has_null(const struct maple_enode *node) return (unsigned long)node & MAPLE_ENODE_NULL; } -static inline bool ma_is_root(struct maple_node *node) +static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } -static inline bool mte_is_root(const struct maple_enode *node) +static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } @@ -387,7 +389,7 @@ static inline bool mas_is_root_limits(const struct ma_state *mas) return !mas->min && mas->max == ULONG_MAX; } -static inline bool mt_is_alloc(struct maple_tree *mt) +static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } @@ -526,11 +528,12 @@ void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, * * Return: The slot in the parent node where @enode resides. */ -static inline unsigned int mte_parent_slot(const struct maple_enode *enode) +static __always_inline +unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; - if (val & MA_ROOT_PARENT) + if (unlikely(val & MA_ROOT_PARENT)) return 0; /* @@ -546,7 +549,8 @@ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) * * Return: The parent maple node. */ -static inline struct maple_node *mte_parent(const struct maple_enode *enode) +static __always_inline +struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); @@ -558,7 +562,7 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode) * * Return: true if dead, false otherwise. */ -static inline bool ma_dead_node(const struct maple_node *node) +static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; @@ -574,7 +578,7 @@ static inline bool ma_dead_node(const struct maple_node *node) * * Return: true if dead, false otherwise. */ -static inline bool mte_dead_node(const struct maple_enode *enode) +static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *parent, *node; @@ -730,7 +734,7 @@ static inline unsigned long mas_pivot(struct ma_state *mas, unsigned char piv) * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ -static inline unsigned long +static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { @@ -812,20 +816,20 @@ static inline bool mt_write_locked(const struct maple_tree *mt) lockdep_is_held(&mt->ma_lock); } -static inline bool mt_locked(const struct maple_tree *mt) +static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } -static inline void *mt_slot(const struct maple_tree *mt, +static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } -static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, - unsigned char offset) +static __always_inline void *mt_slot_locked(struct maple_tree *mt, + void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } @@ -837,8 +841,8 @@ static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, * * Return: The entry stored in @slots at the @offset. */ -static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, - unsigned char offset) +static __always_inline void *mas_slot_locked(struct ma_state *mas, + void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } @@ -851,8 +855,8 @@ static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, * * Return: The entry stored in @slots at the @offset */ -static inline void *mas_slot(struct ma_state *mas, void __rcu **slots, - unsigned char offset) +static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, + unsigned char offset) { return mt_slot(mas->tree, slots, offset); } @@ -863,7 +867,7 @@ static inline void *mas_slot(struct ma_state *mas, void __rcu **slots, * * Return: The pointer to the root of the tree */ -static inline void *mas_root(struct ma_state *mas) +static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } @@ -1437,10 +1441,8 @@ retry: * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ -static inline unsigned char ma_data_end(struct maple_node *node, - enum maple_type type, - unsigned long *pivots, - unsigned long max) +static __always_inline unsigned char ma_data_end(struct maple_node *node, + enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; @@ -4344,7 +4346,7 @@ exists: } -static inline void mas_rewalk(struct ma_state *mas, unsigned long index) +static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); @@ -4353,7 +4355,7 @@ retry: goto retry; } -static inline bool mas_rewalk_if_dead(struct ma_state *mas, +static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { @@ -4372,7 +4374,7 @@ static inline bool mas_rewalk_if_dead(struct ma_state *mas, * The prev node value will be mas->node[mas->offset] or MAS_NONE. * Return: 1 if the node is dead, 0 otherwise. */ -static inline int mas_prev_node(struct ma_state *mas, unsigned long min) +static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; @@ -4533,8 +4535,8 @@ underflow: * The next value will be mas->node[mas->offset] or MAS_NONE. * Return: 1 on dead node, 0 otherwise. */ -static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, - unsigned long max) +static int mas_next_node(struct ma_state *mas, struct maple_node *node, + unsigned long max) { unsigned long min; unsigned long *pivots; @@ -5664,7 +5666,7 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) } EXPORT_SYMBOL_GPL(mas_expected_entries); -static inline bool mas_next_setup(struct ma_state *mas, unsigned long max, +static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); @@ -5780,8 +5782,7 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) } EXPORT_SYMBOL_GPL(mt_next); -static inline bool mas_prev_setup(struct ma_state *mas, unsigned long min, - void **entry) +static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->node = MAS_UNDERFLOW; @@ -5930,8 +5931,7 @@ EXPORT_SYMBOL_GPL(mas_pause); * * Returns: True if entry is the answer, false otherwise. */ -static inline bool mas_find_setup(struct ma_state *mas, unsigned long max, - void **entry) +static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { if (mas_is_active(mas)) { if (mas->last < max) @@ -6047,7 +6047,7 @@ EXPORT_SYMBOL_GPL(mas_find_range); * * Returns: True if entry is the answer, false otherwise. */ -static inline bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, +static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (mas_is_active(mas)) { From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:25 -0400 Subject: [PATCH 0689/1562] maple_tree: separate ma_state node from status The maple tree node is overloaded to keep status as well as the active node. This, unfortunately, results in a re-walk on underflow or overflow. Since the maple state has room, the status can be placed in its own enum in the structure. Once an underflow/overflow is detected, certain modes can restore the status to active and others may need to re-walk just that one node to see the entry. The status being an enum has the benefit of detecting unhandled status in switch statements. [Liam.Howlett@oracle.com: fix comments about MAS_*] Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: update forking to separate maple state and node] Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: fix mas_prev() state separation code] Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 89 +++--- include/linux/mm_types.h | 3 +- lib/maple_tree.c | 457 +++++++++++++++++++------------ lib/test_maple_tree.c | 199 +++++++------- mm/internal.h | 8 +- tools/testing/radix-tree/maple.c | 26 +- 6 files changed, 450 insertions(+), 332 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0b82efe0cf1e..4dd668f7b111 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt) /* Advanced API */ +/* + * Maple State Status + * ma_active means the maple state is pointing to a node and offset and can + * continue operating on the tree. + * ma_start means we have not searched the tree. + * ma_root means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * ma_none means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * ma_pause means the data within the maple state may be stale, restart the + * operation + * ma_overflow means the search has reached the upper limit of the search + * ma_underflow means the search has reached the lower limit of the search + * ma_error means there was an error, check the node for the error number. + */ +enum maple_status { + ma_active, + ma_start, + ma_root, + ma_none, + ma_pause, + ma_overflow, + ma_underflow, + ma_error, +}; + /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the @@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt) * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. + * + * The status of the state is used to determine how the next action should treat + * the state. For instance, if the status is ma_start then the next action + * should start at the root of the tree and walk down. If the status is + * ma_pause then the node may be stale data and should be discarded. If the + * status is ma_overflow, then the last action hit the upper limit. + * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ @@ -390,6 +427,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ + enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; @@ -416,28 +454,12 @@ struct ma_wr_state { spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) - /* * Special values for ma_state.node. - * MAS_START means we have not searched the tree. - * MAS_ROOT means we have searched the tree and the entry we found lives in - * the root of the tree (ie it has index 0, length 1 and is the only entry in - * the tree). - * MAS_NONE means we have searched the tree and there is no node in the - * tree for this entry. For example, we searched for index 1 in an empty - * tree. Or we have a tree which points to a full leaf node and we - * searched for an entry which is larger than can be contained in that - * leaf node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ -#define MAS_START ((struct maple_enode *)1UL) -#define MAS_ROOT ((struct maple_enode *)5UL) -#define MAS_NONE ((struct maple_enode *)9UL) -#define MAS_PAUSE ((struct maple_enode *)17UL) -#define MAS_OVERFLOW ((struct maple_enode *)33UL) -#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) @@ -446,7 +468,8 @@ struct ma_wr_state { .tree = mt, \ .index = first, \ .last = end, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ @@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); -bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); @@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } -/* Checks if a mas has not found anything */ -static inline bool mas_is_none(const struct ma_state *mas) -{ - return mas->node == MAS_NONE; -} - -/* Checks if a mas has been paused */ -static inline bool mas_is_paused(const struct ma_state *mas) -{ - return mas->node == MAS_PAUSE; -} - -/* Check if the mas is pointing to a node or not */ static inline bool mas_is_active(struct ma_state *mas) { - if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) - return true; + return mas->status == ma_active; +} - return false; +static inline bool mas_is_err(struct ma_state *mas) +{ + return mas->status == ma_error; } /** @@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas) * * Context: Any context. */ -static inline void mas_reset(struct ma_state *mas) +static __always_inline void mas_reset(struct ma_state *mas) { - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } /** @@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->node = MAS_START; + mas_reset(mas); __mas_set_range(mas, start, last); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ef18d2b25378..a66534c78c4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1071,7 +1071,8 @@ struct vma_iterator { .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ }, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f0d2aea91351..187a9796188e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -249,40 +249,40 @@ static __always_inline bool mt_is_reserved(const void *entry) xa_is_internal(entry); } -static inline void mas_set_err(struct ma_state *mas, long err) +static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); + mas->status = ma_error; } -static inline bool mas_is_ptr(const struct ma_state *mas) +static __always_inline bool mas_is_ptr(const struct ma_state *mas) { - return mas->node == MAS_ROOT; + return mas->status == ma_root; } -static inline bool mas_is_start(const struct ma_state *mas) +static __always_inline bool mas_is_start(const struct ma_state *mas) { - return mas->node == MAS_START; + return mas->status == ma_start; } -bool mas_is_err(struct ma_state *mas) +static __always_inline bool mas_is_none(const struct ma_state *mas) { - return xa_is_err(mas->node); + return mas->status == ma_none; +} + +static __always_inline bool mas_is_paused(const struct ma_state *mas) +{ + return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { - if (unlikely(mas->node == MAS_OVERFLOW)) - return true; - - return false; + return mas->status == ma_overflow; } -static __always_inline bool mas_is_underflow(struct ma_state *mas) +static inline bool mas_is_underflow(struct ma_state *mas) { - if (unlikely(mas->node == MAS_UNDERFLOW)) - return true; - - return false; + return mas->status == ma_underflow; } static inline bool mas_searchable(struct ma_state *mas) @@ -1274,6 +1274,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; + BUG_ON(!allocated); WARN_ON(!allocated); } @@ -1379,14 +1380,14 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max and depth to + * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: - * - If mas->node is an error or not MAS_START, return NULL. - * - If it's an empty tree: NULL & mas->node == MAS_NONE - * - If it's a single entry: The entry & mas->node == MAS_ROOT - * - If it's a tree: NULL & mas->node == safe root node. + * - If mas->node is an error or not mas_start, return NULL. + * - If it's an empty tree: NULL & mas->status == ma_none + * - If it's a single entry: The entry & mas->status == mas_root + * - If it's a tree: NULL & mas->status == safe root node. */ static inline struct maple_enode *mas_start(struct ma_state *mas) { @@ -1402,6 +1403,7 @@ retry: /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; + mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) @@ -1412,13 +1414,14 @@ retry: /* empty tree */ if (unlikely(!root)) { - mas->node = MAS_NONE; + mas->node = NULL; + mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ - mas->node = MAS_ROOT; + mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ @@ -2225,19 +2228,21 @@ static inline bool mas_next_sibling(struct ma_state *mas) } /* - * mte_node_or_node() - Return the encoded node or MAS_NONE. + * mte_node_or_none() - Set the enode and state. * @enode: The encoded maple node. * - * Shorthand to avoid setting %NULLs in the tree or maple_subtree_state. - * - * Return: @enode or MAS_NONE + * Set the node to the enode and the status. */ -static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) +static inline void mas_node_or_none(struct ma_state *mas, + struct maple_enode *enode) { - if (enode) - return enode; - - return ma_enode_ptr(MAS_NONE); + if (enode) { + mas->node = enode; + mas->status = ma_active; + } else { + mas->node = NULL; + mas->status = ma_none; + } } /* @@ -2557,13 +2562,15 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, - struct maple_enode *enode, bool in_rcu) + struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; + struct maple_enode *enode; - if (enode == MAS_NONE) + if (mas_is_none(tmp_mas)) return; + enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) @@ -2603,8 +2610,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { @@ -2624,7 +2631,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; @@ -2637,8 +2644,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; @@ -2653,7 +2660,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); - tmp_next[n].node = MAS_NONE; + tmp_next[n].status = ma_none; } else { n++; } @@ -2664,16 +2671,16 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } @@ -2712,9 +2719,9 @@ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, { bool new_lmax = true; - mast->l->node = mte_node_or_none(left); - mast->m->node = mte_node_or_none(middle); - mast->r->node = mte_node_or_none(right); + mas_node_or_none(mast->l, left); + mas_node_or_none(mast->m, middle); + mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { @@ -2894,7 +2901,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; - l_mas.node = r_mas.node = m_mas.node = MAS_NONE; + l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && @@ -3421,7 +3428,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; - /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; @@ -3537,6 +3543,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; if (mas->index) { if (contents) { @@ -3569,7 +3576,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; } } @@ -3801,7 +3808,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; goto done; } @@ -3814,6 +3821,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; @@ -4367,11 +4375,13 @@ static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, /* * mas_prev_node() - Find the prev non-null entry at the same level in the - * tree. The prev value will be mas->node[mas->offset] or MAS_NONE. + * tree. The prev value will be mas->node[mas->offset] or the status will be + * ma_none. * @mas: The maple state * @min: The lower limit to search * - * The prev node value will be mas->node[mas->offset] or MAS_NONE. + * The prev node value will be mas->node[mas->offset] or the status will be + * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) @@ -4441,7 +4451,7 @@ no_entry: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_underflow; return 0; } @@ -4455,8 +4465,7 @@ no_entry: * * Return: The entry in the previous slot which is possibly NULL */ -static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty, - bool set_underflow) +static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; @@ -4489,13 +4498,16 @@ again: mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { + if (mas->index <= min) + goto underflow; + if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } - if (mas_is_none(mas)) - goto underflow; + if (WARN_ON_ONCE(mas_is_underflow(mas))) + return NULL; mas->last = mas->max; node = mas_mn(mas); @@ -4509,12 +4521,15 @@ again: if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; + if (likely(entry)) return entry; if (!empty) { - if (mas->index <= min) - goto underflow; + if (mas->index <= min) { + mas->status = ma_underflow; + return NULL; + } goto again; } @@ -4522,8 +4537,7 @@ again: return entry; underflow: - if (set_underflow) - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; return NULL; } @@ -4532,7 +4546,8 @@ underflow: * @mas: The maple state * @max: The maximum pivot value to check. * - * The next value will be mas->node[mas->offset] or MAS_NONE. + * The next value will be mas->node[mas->offset] or the status will have + * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, @@ -4548,13 +4563,13 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, void __rcu **slots; if (mas->max >= max) - goto no_entry; + goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) - goto no_entry; + goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) @@ -4605,11 +4620,11 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, mas->min = min; return 0; -no_entry: +overflow: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_overflow; return 0; } @@ -4624,8 +4639,7 @@ no_entry: * * Return: The entry in the next slot which is possibly NULL */ -static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty, - bool set_overflow) +static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; @@ -4646,13 +4660,15 @@ retry: if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else - goto overflow; + pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; - if (pivot >= max) - goto overflow; + if (pivot >= max) { /* Was at the limit, next will extend beyond */ + mas->status = ma_overflow; + return NULL; + } } if (likely(mas->offset < mas->end)) { @@ -4664,16 +4680,18 @@ again: else mas->last = mas->max; } else { + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } + if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } - if (WARN_ON_ONCE(mas_is_none(mas))) { - mas->node = MAS_OVERFLOW; + if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; - goto overflow; - } mas->offset = 0; mas->index = mas->min; @@ -4691,20 +4709,18 @@ again: if (entry) return entry; + if (!empty) { - if (mas->last >= max) - goto overflow; + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } mas->index = mas->last + 1; goto again; } return entry; - -overflow: - if (set_overflow) - mas->node = MAS_OVERFLOW; - return NULL; } /* @@ -4723,11 +4739,11 @@ overflow: static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { if (mas->last >= limit) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return NULL; } - return mas_next_slot(mas, limit, false, true); + return mas_next_slot(mas, limit, false); } /* @@ -4895,7 +4911,7 @@ done: * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If - * mas->node is MAS_NONE, reset to MAS_START. + * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ @@ -4904,7 +4920,7 @@ void *mas_walk(struct ma_state *mas) void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { @@ -4920,7 +4936,7 @@ retry: mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return NULL; } @@ -5672,27 +5688,40 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_overflow(mas)) { + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + fallthrough; + case ma_start: + mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + break; + case ma_overflow: /* Overflowed before, but the max changed */ - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: + /* The user expects the mas to be one before where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; + break; + case ma_root: + break; + case ma_error: + return true; } - if (mas_is_start(mas)) - *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + if (likely(mas_is_active(mas))) /* Fast path */ + return false; if (mas_is_ptr(mas)) { *entry = NULL; @@ -5702,7 +5731,7 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, } mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -5731,7 +5760,7 @@ void *mas_next(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, true); + return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); @@ -5754,7 +5783,7 @@ void *mas_next_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, true); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); @@ -5785,33 +5814,45 @@ EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_overflow(mas)) { - mas->node = MAS_START; + case ma_start: + break; + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + break; + case ma_underflow: + /* underflowed before but the min changed */ + mas->status = ma_active; + break; + case ma_overflow: + /* User expects mas to be one after where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; - } - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - /* underflowed before but the min changed */ - mas->node = MAS_START; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { - if (!mas->index) - goto none; + if (!mas->index) { + mas->status = ma_none; + return true; + } mas->index = mas->last = 0; *entry = mas_root(mas); return true; @@ -5821,7 +5862,7 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } @@ -5829,10 +5870,6 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry } return false; - -none: - mas->node = MAS_NONE; - return true; } /** @@ -5841,7 +5878,7 @@ none: * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. @@ -5853,7 +5890,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, false, true); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); @@ -5864,7 +5901,7 @@ EXPORT_SYMBOL_GPL(mas_prev); * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. @@ -5876,7 +5913,7 @@ void *mas_prev_range(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, true, true); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); @@ -5919,7 +5956,8 @@ EXPORT_SYMBOL_GPL(mt_prev); */ void mas_pause(struct ma_state *mas) { - mas->node = MAS_PAUSE; + mas->status = ma_pause; + mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); @@ -5933,32 +5971,52 @@ EXPORT_SYMBOL_GPL(mas_pause); */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { - if (mas_is_active(mas)) { + switch (mas->status) { + case ma_active: if (mas->last < max) return false; - return true; - } - - if (mas_is_paused(mas)) { + case ma_start: + break; + case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; - mas->node = MAS_START; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; - mas->node = MAS_START; - } else if (mas_is_overflow(mas) || mas_is_underflow(mas)) { - if (mas->index > max) { - mas->node = MAS_OVERFLOW; + mas->status = ma_start; + break; + case ma_underflow: + /* mas is pointing at entry before unable to go lower */ + if (unlikely(mas->index >= max)) { + mas->status = ma_overflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_overflow: + if (unlikely(mas->last >= max)) + return true; + + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -5985,7 +6043,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m return false; ptr_out_of_range: - mas->node = MAS_NONE; + mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; @@ -5999,7 +6057,7 @@ ptr_out_of_range: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6011,7 +6069,10 @@ void *mas_find(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, false); + entry = mas_next_slot(mas, max, false); + /* Ignore overflow */ + mas->status = ma_active; + return entry; } EXPORT_SYMBOL_GPL(mas_find); @@ -6023,7 +6084,7 @@ EXPORT_SYMBOL_GPL(mas_find); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6035,7 +6096,7 @@ void *mas_find_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, false); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); @@ -6050,33 +6111,45 @@ EXPORT_SYMBOL_GPL(mas_find_range); static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { - if (mas_is_active(mas)) { - if (mas->index > min) - return false; - return true; - } - - if (mas_is_paused(mas)) { + switch (mas->status) { + case ma_active: + goto active; + case ma_start: + break; + case ma_pause: if (unlikely(mas->index <= min)) { - mas->node = MAS_NONE; + mas->status = ma_underflow; return true; } - mas->node = MAS_START; mas->last = --mas->index; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; - mas->node = MAS_START; - } else if (mas_is_underflow(mas) || mas_is_overflow(mas)) { - if (mas->last <= min) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_start; + break; + case ma_overflow: /* user expects the mas to be one after where it is */ + if (unlikely(mas->index <= min)) { + mas->status = ma_underflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: /* user expects the mas to be one before where it is */ + if (unlikely(mas->index <= min)) + return true; + + mas->status = ma_active; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -6099,19 +6172,20 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, * previous location is 0. */ mas->last = mas->index = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } } +active: if (mas->index < min) return true; return false; none: - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -6124,7 +6198,7 @@ none: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ @@ -6136,7 +6210,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, false, false); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); @@ -6150,7 +6224,7 @@ EXPORT_SYMBOL_GPL(mas_find_rev); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ @@ -6162,7 +6236,7 @@ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, true, false); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); @@ -6183,7 +6257,7 @@ void *mas_erase(struct ma_state *mas) MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; /* Retry unnecessary when holding the write lock. */ entry = mas_state_walk(mas); @@ -6228,7 +6302,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) if (!mas_allocated(mas)) return false; - mas->node = MAS_START; + mas->status = ma_start; return true; } @@ -6627,7 +6701,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, node = mt_alloc_one(gfp); if (!node) { - new_mas->node = MAS_NONE; + new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } @@ -6971,11 +7045,11 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { - struct maple_enode *p = MAS_NONE, *mn = mas->node; + struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); - if (!mas_is_none(mas)) + if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) @@ -6988,7 +7062,7 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); - } while (!mas_is_none(mas)); + } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; @@ -7443,7 +7517,7 @@ static void mt_validate_nulls(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); mas_start(&mas); - if (mas_is_none(&mas) || (mas.node == MAS_ROOT)) + if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) @@ -7460,7 +7534,7 @@ static void mt_validate_nulls(struct maple_tree *mt) last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); - if (mas_is_none(&mas)) + if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), @@ -7469,7 +7543,7 @@ static void mt_validate_nulls(struct maple_tree *mt) offset++; } - } while (!mas_is_none(&mas)); + } while (!mas_is_overflow(&mas)); } /* @@ -7490,7 +7564,7 @@ void mt_validate(struct maple_tree *mt) while (!mte_is_leaf(mas.node)) mas_descend(&mas); - while (!mas_is_none(&mas)) { + while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && @@ -7515,16 +7589,35 @@ EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); - if (mas_is_none(mas)) - pr_err("(MAS_NONE) "); - else if (mas_is_ptr(mas)) - pr_err("(MAS_ROOT) "); - else if (mas_is_start(mas)) - pr_err("(MAS_START) "); - else if (mas_is_paused(mas)) - pr_err("(MAS_PAUSED) "); + switch (mas->status) { + case ma_active: + pr_err("(ma_active)"); + break; + case ma_none: + pr_err("(ma_none)"); + break; + case ma_root: + pr_err("(ma_root)"); + break; + case ma_start: + pr_err("(ma_start) "); + break; + case ma_pause: + pr_err("(ma_pause) "); + break; + case ma_overflow: + pr_err("(ma_overflow) "); + break; + case ma_underflow: + pr_err("(ma_underflow) "); + break; + case ma_error: + pr_err("(ma_error) "); + break; + } - pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last); + pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, + mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3e4597fb49d3..e7a5d688c9e0 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -54,6 +54,11 @@ atomic_t maple_tree_tests_passed; #else #define cond_resched() do {} while (0) #endif + +#define mas_is_none(x) ((x)->status == ma_none) +#define mas_is_overflow(x) ((x)->status == ma_overflow) +#define mas_is_underflow(x) ((x)->status == ma_underflow) + static int __init mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp) { @@ -582,7 +587,7 @@ static noinline void __init check_find(struct maple_tree *mt) MT_BUG_ON(mt, last != mas.last); - mas.node = MAS_NONE; + mas.status = ma_none; mas.index = ULONG_MAX; mas.last = ULONG_MAX; entry2 = mas_prev(&mas, 0); @@ -2178,7 +2183,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt) MT_BUG_ON(mt, val != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 5); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); mas.index = 0; mas.last = 5; @@ -3042,10 +3047,6 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt) * DNE active active range of NULL */ -#define mas_active(x) (((x).node != MAS_ROOT) && \ - ((x).node != MAS_START) && \ - ((x).node != MAS_PAUSE) && \ - ((x).node != MAS_NONE)) static noinline void __init check_state_handling(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); @@ -3060,7 +3061,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) /* prev: Start -> underflow*/ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, mas.status != ma_underflow); /* prev: Start -> root */ mas_set(&mas, 10); @@ -3068,7 +3069,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* prev: pause -> root */ mas_set(&mas, 10); @@ -3077,7 +3078,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* next: start -> none */ mas_set(&mas, 0); @@ -3085,7 +3086,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* next: start -> none*/ mas_set(&mas, 10); @@ -3093,7 +3094,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> root */ mas_set(&mas, 0); @@ -3101,21 +3102,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find: root -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: none -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> none */ mas_set(&mas, 10); @@ -3123,14 +3124,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> root */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: start -> root */ mas_set(&mas, 0); @@ -3138,21 +3139,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: root -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: start -> root */ mas_set(&mas, 10); @@ -3160,7 +3161,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: start -> none */ mas_set(&mas, 10); @@ -3168,7 +3169,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: pause -> none*/ mas_set(&mas, 10); @@ -3177,7 +3178,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ mas.index = mas.last = 10; @@ -3185,14 +3186,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: start -> root */ mas_set(&mas, 0); @@ -3200,7 +3201,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: pause -> root */ mas_set(&mas, 0); @@ -3209,22 +3210,22 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: none -> root */ - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> root */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> none */ mas_set(&mas, 10); @@ -3232,7 +3233,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> root */ mas.index = mas.last = 0; @@ -3240,7 +3241,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); mas_unlock(&mas); @@ -3258,7 +3259,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: pause ->active */ mas_set(&mas, 0); @@ -3267,126 +3268,132 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none ->active */ mas.index = mas.last = 0; mas.offset = 0; - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active ->active */ - entry = mas_next(&mas, ULONG_MAX); + /* next:active ->active (spanning limit) */ + entry = mas_next(&mas, 0x2100); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active beyond data */ + /* next:active -> overflow (limit reached) beyond data */ entry = mas_next(&mas, 0x2999); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); - /* Continue after last range ends after max */ + /* next:overflow -> active (limit changed) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active continued */ + /* next:active -> overflow (limit reached) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); - - /* next:active -> overflow */ - entry = mas_next(&mas, ULONG_MAX); - MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.index != 0x3501); - MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* next:overflow -> overflow */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* prev:overflow -> active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none -> active, skip value at location */ mas_set(&mas, 0); entry = mas_next(&mas, ULONG_MAX); - mas.node = MAS_NONE; + mas.status = ma_none; mas.offset = 0; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:active ->active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning end range */ + /* prev:active -> underflow (span limit) */ + mas_next(&mas, ULONG_MAX); + entry = mas_prev(&mas, 0x1200); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* spanning limit */ + entry = mas_prev(&mas, 0x1200); /* underflow */ + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); + + /* prev:underflow -> underflow (lower limit) spanning end range */ entry = mas_prev(&mas, 0x0100); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); - - /* prev:active -> underflow */ - entry = mas_prev(&mas, 0); - MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.index != 0); - MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev:underflow -> underflow */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); + + /* prev:underflow -> underflow */ + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0x0FFF); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* next:underflow -> active */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:first value -> underflow */ entry = mas_prev(&mas, 0x1000); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find:underflow -> first value */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev: pause ->active */ mas_set(&mas, 0x3600); @@ -3397,21 +3404,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning min */ + /* prev:active -> underflow spanning min */ entry = mas_prev(&mas, 0x1600); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev: active ->active, continue */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active */ mas_set(&mas, 0); @@ -3419,7 +3426,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: pause ->active */ mas_set(&mas, 0); @@ -3428,7 +3435,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active on value */; mas_set(&mas, 1200); @@ -3436,14 +3443,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active ->active */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL)*/ @@ -3451,35 +3458,35 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find: overflow ->active */ entry = mas_find(&mas, 0x5000); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL) end*/ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find_rev: active (END) ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev:active ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev: pause ->active */ mas_pause(&mas); @@ -3487,14 +3494,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* find_rev:active -> active */ + /* find_rev:active -> underflow */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find_rev: start ->active */ mas_set(&mas, 0x1200); @@ -3502,7 +3509,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1200); @@ -3510,7 +3517,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1600); @@ -3518,7 +3525,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause ->active */ mas_set(&mas, 0x1200); @@ -3527,7 +3534,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause -> active */ mas_set(&mas, 0x1600); @@ -3536,25 +3543,25 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1200); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1600); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1200; @@ -3564,7 +3571,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1600; @@ -3573,7 +3580,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); mas_unlock(&mas); } diff --git a/mm/internal.h b/mm/internal.h index 0005b8adbd5c..8450562744cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1163,13 +1163,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, @@ -1177,7 +1177,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, } #endif - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); @@ -1188,7 +1188,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 7095fb0ec026..857c439e6bbc 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -118,6 +118,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc == NULL); MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* free */ mtree_unlock(mt); @@ -141,7 +142,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - mas.node = MAS_START; + mas.status = ma_start; mas_nomem(&mas, GFP_KERNEL); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); @@ -158,6 +159,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) /* Ensure we counted 3. */ MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Set allocation request to 1. */ @@ -272,6 +274,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) ma_free_rcu(mn); MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); } + mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); } @@ -294,6 +297,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) smn = smn->slot[0]; /* next. */ } MT_BUG_ON(mt, mas_allocated(&mas) != total); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Free. */ MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -441,7 +445,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10); mas_destroy(&mas); @@ -452,7 +456,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); mas_destroy(&mas); @@ -941,7 +945,7 @@ retry: ret = mas_descend_walk(mas, range_min, range_max); if (unlikely(mte_dead_node(mas->node))) { - mas->node = MAS_START; + mas->status = ma_start; goto retry; } @@ -961,10 +965,10 @@ static inline void *mas_range_load(struct ma_state *mas, unsigned long index = mas->index; if (mas_is_none(mas) || mas_is_paused(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: if (mas_tree_walk(mas, range_min, range_max)) - if (unlikely(mas->node == MAS_ROOT)) + if (unlikely(mas->status == ma_root)) return mas_root(mas); if (likely(mas->offset != MAPLE_NODE_SLOTS)) @@ -35337,7 +35341,7 @@ static void mas_dfs_preorder(struct ma_state *mas) unsigned char end, slot = 0; unsigned long *pivots; - if (mas->node == MAS_START) { + if (mas->status == ma_start) { mas_start(mas); return; } @@ -35374,7 +35378,7 @@ walk_up: return; done: - mas->node = MAS_NONE; + mas->status = ma_none; } @@ -35833,7 +35837,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms, fails. */ MT_BUG_ON(mt, ms.node != MA_ERROR(-ENOMEM)); mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ - MT_BUG_ON(mt, ms.node != MAS_START); + MT_BUG_ON(mt, ms.status != ma_start); mtree_unlock(mt); MT_BUG_ON(mt, mtree_insert(mt, 2, mt, GFP_KERNEL) != 0); mtree_lock(mt); @@ -35952,7 +35956,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { - pr_err("One is MAS_ROOT and the other is not.\n"); + pr_err("One is ma_root and the other is not.\n"); return -1; } return 0; @@ -35961,7 +35965,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { - pr_err("One is MAS_NONE and the other is not.\n"); + pr_err("One is ma_none and the other is not.\n"); return -1; } From 9a40d45c1f2c49273c04938ec3d7849f685eb3c1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:26 -0400 Subject: [PATCH 0690/1562] maple_tree: remove mas_searchable() Now that the status of the maple state is outside of the node, the mas_searchable() function can be dropped for easier open-coding of what is going on. Link: https://lkml.kernel.org/r/20231101171629.3612299-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 66 ++++++++------------------------ tools/testing/radix-tree/maple.c | 4 +- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 187a9796188e..c7016066f12b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -285,17 +285,6 @@ static inline bool mas_is_underflow(struct ma_state *mas) return mas->status == ma_underflow; } -static inline bool mas_searchable(struct ma_state *mas) -{ - if (mas_is_none(mas)) - return false; - - if (mas_is_ptr(mas)) - return false; - - return true; -} - static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { @@ -6030,12 +6019,11 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m } - if (unlikely(!mas_searchable(mas))) { - if (unlikely(mas_is_ptr(mas))) - goto ptr_out_of_range; + if (unlikely(mas_is_ptr(mas))) + goto ptr_out_of_range; + if (unlikely(mas_is_none(mas))) return true; - } if (mas->index == max) return true; @@ -6162,20 +6150,18 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, return true; } - if (unlikely(!mas_searchable(mas))) { - if (mas_is_ptr(mas)) - goto none; + if (unlikely(mas_is_ptr(mas))) + goto none; - if (mas_is_none(mas)) { - /* - * Walked to the location, and there was nothing so the - * previous location is 0. - */ - mas->last = mas->index = 0; - mas->status = ma_root; - *entry = mas_root(mas); - return true; - } + if (unlikely(mas_is_none(mas))) { + /* + * Walked to the location, and there was nothing so the previous + * location is 0. + */ + mas->last = mas->index = 0; + mas->status = ma_root; + *entry = mas_root(mas); + return true; } active: @@ -6905,7 +6891,7 @@ retry: if (entry) goto unlock; - while (mas_searchable(&mas) && (mas.last < max)) { + while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_entry(&mas, max); if (likely(entry && !xa_is_zero(entry))) break; @@ -6987,26 +6973,6 @@ unsigned int mt_nr_allocated(void) return kmem_cache_nr_allocated(maple_node_cache); } -/* - * mas_dead_node() - Check if the maple state is pointing to a dead node. - * @mas: The maple state - * @index: The index to restore in @mas. - * - * Used in test code. - * Return: 1 if @mas has been reset to MAS_START, 0 otherwise. - */ -static inline int mas_dead_node(struct ma_state *mas, unsigned long index) -{ - if (unlikely(!mas_searchable(mas) || mas_is_start(mas))) - return 0; - - if (likely(!mte_dead_node(mas->node))) - return 0; - - mas_rewalk(mas, index); - return 1; -} - void mt_cache_shrink(void) { } @@ -7558,7 +7524,7 @@ void mt_validate(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); rcu_read_lock(); mas_start(&mas); - if (!mas_searchable(&mas)) + if (!mas_is_active(&mas)) goto done; while (!mte_is_leaf(mas.node)) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 857c439e6bbc..56ae47291ee0 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -974,8 +974,10 @@ retry: if (likely(mas->offset != MAPLE_NODE_SLOTS)) entry = mas_get_slot(mas, mas->offset); - if (mas_dead_node(mas, index)) + if (mas_is_active(mas) && mte_dead_node(mas->node)) { + mas_set(mas, index); goto retry; + } return entry; } From 0de56e38b307b0cb2ac825e8e7cb371a28daf844 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:27 -0400 Subject: [PATCH 0691/1562] maple_tree: use maple state end for write operations ma_wr_state was previously tracking the end of the node for writing. Since the implementation of the ma_state end tracking, this is duplicated work. This patch removes the maple write state tracking of the end of the node and uses the maple state end instead. Link: https://lkml.kernel.org/r/20231101171629.3612299-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 - lib/maple_tree.c | 46 ++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4dd668f7b111..b3d63123b945 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -441,7 +441,6 @@ struct ma_wr_state { unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ - unsigned char node_end; /* mas->node end */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c7016066f12b..59500fe6988b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2158,11 +2158,11 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, } slot = offset_end + 1; - if (slot > wr_mas->node_end) + if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ - mas_mab_cp(mas, slot, wr_mas->node_end + 1, b_node, ++b_end); + mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; @@ -2253,8 +2253,8 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); - count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, - wr_mas->pivots, mas->max); + count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, + wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) @@ -3904,10 +3904,10 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ - mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end); + mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node. */ - if (r_mas.offset <= r_wr_mas.node_end) - mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end, + if (r_mas.offset <= r_mas.end) + mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; @@ -3949,7 +3949,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) - mas_bulk_rebalance(mas, wr_mas->node_end, wr_mas->type); + mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { @@ -3985,12 +3985,12 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, * this range wrote to the end of the node or it overwrote the rest of * the data */ - if (offset_end > wr_mas->node_end) + if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ - copy_size = wr_mas->node_end - offset_end + 1; + copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, @@ -4077,10 +4077,10 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && - (wr_mas->node_end != wr_mas->offset_end) && + (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; - if (wr_mas->offset_end == wr_mas->node_end) + if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; @@ -4105,11 +4105,11 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { - while ((wr_mas->offset_end < wr_mas->node_end) && + while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; - if (wr_mas->offset_end < wr_mas->node_end) + if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; @@ -4121,7 +4121,7 @@ static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; - unsigned char new_end = wr_mas->node_end + 2; + unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) @@ -4155,10 +4155,10 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (mt_in_rcu(mas->tree)) return false; - if (mas->offset != wr_mas->node_end) + if (mas->offset != mas->end) return false; - end = wr_mas->node_end; + end = mas->end; if (mas->offset != end) return false; @@ -4210,7 +4210,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - mas_commit_b_node(wr_mas, &b_node, wr_mas->node_end); + mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end); } static inline void mas_wr_modify(struct ma_wr_state *wr_mas) @@ -4238,7 +4238,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas) if (mas_wr_append(wr_mas, new_end)) return; - if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas)) + if (new_end == mas->end && mas_wr_slot_store(wr_mas)) return; if (mas_wr_node_store(wr_mas, new_end)) @@ -5052,6 +5052,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned char offset; unsigned long *pivots; enum maple_type mt; + struct maple_node *node; if (min > max) return -EINVAL; @@ -5082,13 +5083,14 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, if (unlikely(offset == MAPLE_NODE_SLOTS)) return -EBUSY; + node = mas_mn(mas); mt = mte_node_type(mas->node); - pivots = ma_pivots(mas_mn(mas), mt); + pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; - mas->end = mas_data_end(mas); + mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); @@ -7596,7 +7598,7 @@ void mas_wr_dump(const struct ma_wr_state *wr_mas) pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", - wr_mas->type, wr_mas->offset_end, wr_mas->node_end, + wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); From 24662decdd44645e8f027d7912be962dd461d1aa Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:28 -0400 Subject: [PATCH 0692/1562] maple_tree: don't find node end in mtree_lookup_walk() Since the pivot being set is now reliable, the optimized loop no longer needs to find the node end. The redundant check for a dead node can also be avoided as there is no danger of using the wrong pivot since the results will be thrown out in the case of a dead node by the later check. This patch also adds a benchmark test for the function to the maple tree test framework. The benchmark shows an average increase performance of 5.98% over 3 runs with this commit. Link: https://lkml.kernel.org/r/20231101171629.3612299-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 +++--------- lib/test_maple_tree.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 59500fe6988b..bc62122a4b15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3742,23 +3742,17 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) enum maple_type type; void __rcu **slots; unsigned char end; - unsigned long max; next = mas->node; - max = ULONG_MAX; do { - offset = 0; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); - end = ma_data_end(node, type, pivots, max); - if (unlikely(ma_dead_node(node))) - goto dead_node; + end = mt_pivots[type]; + offset = 0; do { - if (pivots[offset] >= mas->index) { - max = pivots[offset]; + if (pivots[offset] >= mas->index) break; - } } while (++offset < end); slots = ma_slots(node, type); diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index e7a5d688c9e0..29185ac5c727 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -43,6 +43,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_NODE_STORE */ /* #define BENCH_AWALK */ /* #define BENCH_WALK */ +/* #define BENCH_LOAD */ /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ /* #define BENCH_MAS_FOR_EACH */ @@ -1754,6 +1755,19 @@ static noinline void __init bench_walk(struct maple_tree *mt) } #endif +#if defined(BENCH_LOAD) +static noinline void __init bench_load(struct maple_tree *mt) +{ + int i, max = 2500, count = 550000000; + + for (i = 0; i < max; i += 10) + mtree_store_range(mt, i, i + 5, xa_mk_value(i), GFP_KERNEL); + + for (i = 0; i < count; i++) + mtree_load(mt, 1470); +} +#endif + #if defined(BENCH_MT_FOR_EACH) static noinline void __init bench_mt_for_each(struct maple_tree *mt) { @@ -3623,6 +3637,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_LOAD) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_load(&tree); + mtree_destroy(&tree); + goto skip; +#endif #if defined(BENCH_FORK) #define BENCH bench_forking(); From a3c63c8c5df6406e79490456a1fc41a287676070 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:29 -0400 Subject: [PATCH 0693/1562] maple_tree: mtree_range_walk() clean up mtree_range_walk() needed to be updated to avoid checking if there was a pivot value. On closer examination, the code could avoid setting min or max in certain scenarios. The commit removes the extra check for pivot[offset] before setting max and only sets max when necessary. It also only sets min if it is necessary by checking offset 0 prior to the loop (as it has always done). The commit also drops a dead node check since the end of the node will return the array size when the last slot is occupied (by a potential reuse in a dead node). The data will be discarded later if the node is marked dead. Benchmarking these changes results in an increase in performance of 5.45% using the BENCH_WALK in the maple tree test code. Link: https://lkml.kernel.org/r/20231101171629.3612299-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bc62122a4b15..925c5742a57d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2806,32 +2806,29 @@ static inline void *mtree_range_walk(struct ma_state *mas) min = mas->min; max = mas->max; do { - offset = 0; last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); - if (unlikely(ma_dead_node(node))) - goto dead_node; - - if (pivots[offset] >= mas->index) { - prev_max = max; - prev_min = min; - max = pivots[offset]; + prev_min = min; + prev_max = max; + if (pivots[0] >= mas->index) { + offset = 0; + max = pivots[0]; goto next; } - do { + offset = 1; + while (offset < end) { + if (pivots[offset] >= mas->index) { + max = pivots[offset]; + break; + } offset++; - } while ((offset < end) && (pivots[offset] < mas->index)); + } - prev_min = min; min = pivots[offset - 1] + 1; - prev_max = max; - if (likely(offset < end && pivots[offset])) - max = pivots[offset]; - next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); From d1fefa3d22447923e75ab2cd7abe302e43b77d0c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 27 Oct 2023 16:49:44 +0800 Subject: [PATCH 0694/1562] maple_tree: remove unused function The function are defined in the maple_tree.c file, but not called elsewhere, so delete the unused function. lib/maple_tree.c:689:29: warning: unused function 'mas_pivot'. Link: https://lkml.kernel.org/r/20231027084944.24888-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7064 Acked-by: David Hildenbrand Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 925c5742a57d..3aa69c6920e1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -684,35 +684,6 @@ static inline unsigned long *ma_gaps(struct maple_node *node, return NULL; } -/* - * mas_pivot() - Get the pivot at @piv of the maple encoded node. - * @mas: The maple state. - * @piv: The pivot. - * - * Return: the pivot at @piv of @mn. - */ -static inline unsigned long mas_pivot(struct ma_state *mas, unsigned char piv) -{ - struct maple_node *node = mas_mn(mas); - enum maple_type type = mte_node_type(mas->node); - - if (MAS_WARN_ON(mas, piv >= mt_pivots[type])) { - mas_set_err(mas, -EIO); - return 0; - } - - switch (type) { - case maple_arange_64: - return node->ma64.pivot[piv]; - case maple_range_64: - case maple_leaf_64: - return node->mr64.pivot[piv]; - case maple_dense: - return 0; - } - return 0; -} - /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state From 2e783f0c1a0d9017209f2ed243960924ebb602cb Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Mon, 20 Nov 2023 15:09:33 +0800 Subject: [PATCH 0695/1562] maple_tree: move the check forward to avoid static check warning Patch series "Some cleanups of maple tree", v2. These are some small cleanups of maple tree. This patch (of 5): Put the check for gap before its reference to avoid Smatch static check warnings. This is not a bug, it's just a validation program. Even with this change, Smatch may still generate warnings because MT_BUG_ON() doesn't necessarily stop the program. It may require fixing Smatch itself to avoid these warnings. Link: https://lkml.kernel.org/r/20231120070937.35481-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20231120070937.35481-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reported-by: Dan Carpenter Closes: http://lists.infradead.org/pipermail/maple-tree/2023-November/003046.html Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3aa69c6920e1..df3ae393f12a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7257,6 +7257,7 @@ static void mas_validate_gaps(struct ma_state *mas) counted: if (mt == maple_arange_64) { + MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node, mt); if (offset > i) { pr_err("gap offset %p[%u] is invalid\n", node, offset); @@ -7269,7 +7270,6 @@ counted: MT_BUG_ON(mas->tree, 1); } - MT_BUG_ON(mas->tree, !gaps); for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap %p[%u] beyond node limit != 0\n", From 3f05fcdebf2979569802e1ee94cf4c7d887546e2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Mon, 20 Nov 2023 15:09:34 +0800 Subject: [PATCH 0696/1562] maple_tree: avoid ascending when mas->min is also the parent's minimum When the child node is the first child of its parent node, mas->min does not need to be updated. This can reduce the number of ascending times in some cases. Link: https://lkml.kernel.org/r/20231120070937.35481-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Dan Carpenter Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index df3ae393f12a..e26bc4473eb7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1088,14 +1088,16 @@ static int mas_ascend(struct ma_state *mas) return 0; } - if (!mas->min) + min = 0; + max = ULONG_MAX; + if (!mas->offset) { + min = mas->min; set_min = true; + } if (mas->max == ULONG_MAX) set_max = true; - min = 0; - max = ULONG_MAX; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); From c5e941213826d68b0d938dae540d9d6c143560ec Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Mon, 20 Nov 2023 15:09:35 +0800 Subject: [PATCH 0697/1562] maple_tree: remove an unused parameter for ma_meta_end() The parameter maple_type is not used, so remove it. Link: https://lkml.kernel.org/r/20231120070937.35481-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Dan Carpenter Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e26bc4473eb7..65c25e88c47e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -930,10 +930,8 @@ static inline unsigned char ma_meta_end(struct maple_node *mn, /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node - * @mt: The maple node type */ -static inline unsigned char ma_meta_gap(struct maple_node *mn, - enum maple_type mt) +static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } @@ -1587,7 +1585,7 @@ static inline unsigned long mas_max_gap(struct ma_state *mas) node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); - offset = ma_meta_gap(node, mt); + offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } @@ -1618,7 +1616,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); - meta_offset = ma_meta_gap(pnode, pmt); + meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; @@ -7260,7 +7258,7 @@ static void mas_validate_gaps(struct ma_state *mas) counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); - offset = ma_meta_gap(node, mt); + offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset %p[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); From 026b935cd929c18d496fbf9432e8174ec40cdbc8 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Mon, 20 Nov 2023 15:09:36 +0800 Subject: [PATCH 0698/1562] maple_tree: delete one of the two identical checks There are two identical checks, delete one of them. Link: https://lkml.kernel.org/r/20231120070937.35481-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Dan Carpenter Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 65c25e88c47e..ca37cdf4b82e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4117,9 +4117,6 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (mt_in_rcu(mas->tree)) return false; - if (mas->offset != mas->end) - return false; - end = mas->end; if (mas->offset != end) return false; From 330018fe69c66333cb2115e54f1844e471668fc3 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Mon, 20 Nov 2023 15:09:37 +0800 Subject: [PATCH 0699/1562] maple_tree: simplify mas_leaf_set_meta() Now it seems that the incoming 'end' is already pointing to the last item, so we can simplify this function, considering only whether the last slot is being used. This has passed the maple tree test suite. Link: https://lkml.kernel.org/r/20231120070937.35481-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Dan Carpenter Signed-off-by: Andrew Morton --- lib/maple_tree.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ca37cdf4b82e..47f2a7a97385 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1964,27 +1964,13 @@ complete: /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. - * @mas: The maple state * @node: The maple node - * @pivots: pointer to the maple node pivots * @mt: The maple type - * @end: The assumed end - * - * Note, end may be incremented within this function but not modified at the - * source. This is fine since the metadata is the last thing to be stored in a - * node during a write. + * @end: The node end */ -static inline void mas_leaf_set_meta(struct ma_state *mas, - struct maple_node *node, unsigned long *pivots, +static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { - /* There is no room for metadata already */ - if (mt_pivots[mt] <= end) - return; - - if (pivots[end] && pivots[end] < mas->max) - end++; - if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } @@ -2041,7 +2027,7 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, ma_set_meta(node, mt, offset, end); } else { - mas_leaf_set_meta(mas, node, pivots, mt, end); + mas_leaf_set_meta(node, mt, end); } } @@ -3962,7 +3948,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, dst_pivots[new_end] = mas->max; done: - mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); + mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; From 0a97c01cd20bb96359d8c9dedad92a061ed34e0b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:18 -0800 Subject: [PATCH 0700/1562] list_lru: allow explicit memcg and NUMA node selection Patch series "workload-specific and memory pressure-driven zswap writeback", v8. There are currently several issues with zswap writeback: 1. There is only a single global LRU for zswap, making it impossible to perform worload-specific shrinking - an memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u But this solution leaves a lot to be desired, as we still do not have an avenue for an memcg to free up its own memory locked up in the zswap pool. 2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improved the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. This patch (of 6): The interface of list_lru is based on the assumption that the list node and the data it represents belong to the same allocated on the correct node/memcg. While this assumption is valid for existing slab objects LRU such as dentries and inodes, it is undocumented, and rather inflexible for certain potential list_lru users (such as the upcoming zswap shrinker and the THP shrinker). It has caused us a lot of issues during our development. This patch changes list_lru interface so that the caller must explicitly specify numa node and memcg when adding and removing objects. The old list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and list_lru_del_obj(), respectively. It also extends the list_lru API with a new function, list_lru_putback, which undoes a previous list_lru_isolate call. Unlike list_lru_add, it does not increment the LRU node count (as list_lru_isolate does not decrement the node count). list_lru_putback also allows for explicit memcg and NUMA node selection. Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 7 ++--- fs/dcache.c | 8 +++-- fs/gfs2/quota.c | 6 ++-- fs/inode.c | 4 +-- fs/nfs/nfs42xattr.c | 8 ++--- fs/nfsd/filecache.c | 4 +-- fs/xfs/xfs_buf.c | 6 ++-- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_qm.c | 2 +- include/linux/list_lru.h | 54 ++++++++++++++++++++++++++++++++-- mm/list_lru.c | 48 +++++++++++++++++++++++++----- mm/workingset.c | 4 +-- 12 files changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 138f6d43d13b..f69d30c9f50f 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -234,7 +234,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (page->page_ptr) { trace_binder_alloc_lru_start(alloc, index); - on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + on_lru = list_lru_del_obj(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); trace_binder_alloc_lru_end(alloc, index); @@ -285,7 +285,7 @@ free_range: trace_binder_free_lru_start(alloc, index); - ret = list_lru_add(&binder_alloc_lru, &page->lru); + ret = list_lru_add_obj(&binder_alloc_lru, &page->lru); WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); @@ -848,7 +848,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) if (!alloc->pages[i].page_ptr) continue; - on_lru = list_lru_del(&binder_alloc_lru, + on_lru = list_lru_del_obj(&binder_alloc_lru, &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -1287,4 +1287,3 @@ int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, dest, bytes); } - diff --git a/fs/dcache.c b/fs/dcache.c index c82ae731df9a..2ba37643b9c5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -428,7 +428,8 @@ static void d_lru_add(struct dentry *dentry) this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_add_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) @@ -438,7 +439,8 @@ static void d_lru_del(struct dentry *dentry) this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_del_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) @@ -1240,7 +1242,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item, * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like - * list_lru_add and list_lru_del. List movement in this file + * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 95dae7838b4e..b57f8c7b35be 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } @@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd) } qd->qd_lockref.count = 0; - list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } @@ -1517,7 +1517,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); diff --git a/fs/inode.c b/fs/inode.c index f238d987dec9..ef2034a985e0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -464,7 +464,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; @@ -482,7 +482,7 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 2ad66a8922f4..49aaf28a6950 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_add(lru, &entry->lru); + return list_lru_add_obj(lru, &entry->lru); } static bool @@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_del(lru, &entry->lru); + return list_lru_del_obj(lru, &entry->lru); } /* @@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode) oldcache = nfsi->xattr_cache; if (oldcache != NULL) { - list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; @@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add) kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; - list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef063f93fde9..6c2decfdeb4b 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf) static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_add(nf); return true; } @@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf) static bool nfsd_file_lru_remove(struct nfsd_file *nf) { - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_del(nf); return true; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..669332849680 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -169,7 +169,7 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) atomic_dec(&bp->b_hold); ASSERT(atomic_read(&bp->b_hold) >= 1); @@ -1047,7 +1047,7 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } @@ -1060,7 +1060,7 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..61a45a86ffe8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1065,7 +1065,7 @@ xfs_qm_dqput( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 94a7932ac570..67d0a8564ff3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -171,7 +171,7 @@ xfs_qm_dqpurge( * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); - list_lru_del(&qi->qi_lru, &dqp->q_lru); + list_lru_del_obj(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index db86ad78d428..7675a48a0701 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. + * @nid: the node id of the sublist to add the item to. + * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or @@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * * Return: true if the list was updated, false otherwise */ -bool list_lru_add(struct list_lru *lru, struct list_head *item); +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); /** - * list_lru_del: delete an element to the lru list + * list_lru_add_obj: add an element to the lru list's tail + * @lru: the lru pointer + * @item: the item to be added. + * + * This function is similar to list_lru_add(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_del: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. + * @nid: the node id of the sublist to delete the item from. + * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to @@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); * * Return: true if the list was updated, false otherwise */ -bool list_lru_del(struct list_lru *lru, struct list_head *item); +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_del_obj: delete an element from the lru list + * @lru: the lru pointer + * @item: the item to be deleted. + * + * This function is similar to list_lru_del(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise. + */ +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru @@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); +/** + * list_lru_putback: undo list_lru_isolate + * @lru: the lru pointer. + * @item: the item to put back. + * @nid: the node id of the sublist to put the item back to. + * @memcg: the cgroup of the sublist to put the item back to. + * + * Put back an isolated item into its original LRU. Note that unlike + * list_lru_add, this does not increment the node LRU count (as + * list_lru_isolate does not originally decrement this count). + * + * Since we might have dropped the LRU lock in between, recompute list_lru_one + * from the node's id and memcg. + */ +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index a05e5bef3b40..fcca67ac26ec 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -116,21 +116,19 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, } #endif /* CONFIG_MEMCG_KMEM */ -bool list_lru_add(struct list_lru *lru, struct list_head *item) +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) { - int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; - struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, &memcg); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -140,15 +138,25 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_add); -bool list_lru_del(struct list_lru *lru, struct list_head *item) +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? + mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_add(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_add_obj); + +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, NULL); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -160,6 +168,16 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? + mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_del(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_del_obj); + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); @@ -175,6 +193,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ + struct list_lru_one *list = + list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + + if (list_empty(item)) { + list_add_tail(item, &list->list); + if (!list->nr_items++) + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); + } +} +EXPORT_SYMBOL_GPL(list_lru_putback); + unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/workingset.c b/mm/workingset.c index b192e44a0e7c..c17d45c6f29b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -615,12 +615,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { - list_lru_add(&shadow_nodes, &node->private_list); + list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { - list_lru_del(&shadow_nodes, &node->private_list); + list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } From fdc4161ff6a5e96222e159c1f1b28d31a985130d Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:19 -0800 Subject: [PATCH 0701/1562] memcontrol: implement mem_cgroup_tryget_online() This patch implements a helper function that try to get a reference to an memcg's css, as well as checking if it is online. This new function is almost exactly the same as the existing mem_cgroup_tryget(), except for the onlineness check. In the !CONFIG_MEMCG case, it always returns true, analogous to mem_cgroup_tryget(). This is useful for e.g to the new zswap writeback scheme, where we need to select the next online memcg as a candidate for the global limit reclaim. Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bdcf3020d7a..2bd7d14ace78 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -821,6 +821,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return !memcg || css_tryget(&memcg->css); } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget_online(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1349,6 +1354,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return true; } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:20 -0800 Subject: [PATCH 0702/1562] zswap: make shrinking memcg-aware Currently, we only have a single global LRU for zswap. This makes it impossible to perform worload-specific shrinking - an memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modify the reclaim logic: a) When a store attempt hits an memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which an memcg is selected for reclaim in a round-robin-like fashion. [nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()] Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com [nphamcs@gmail.com: drop the pool's reference at the end of the writeback step] Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 + include/linux/zswap.h | 2 + mm/memcontrol.c | 2 + mm/swap.h | 3 +- mm/swap_state.c | 24 +++- mm/zswap.c | 270 +++++++++++++++++++++++++++++-------- 6 files changed, 246 insertions(+), 60 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bd7d14ace78..a308c8eacf20 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde..e571e393669b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); #else @@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} +static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 592572d4842e..ce75e504fe8b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5614,6 +5614,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + zswap_memcg_offline_cleanup(memcg); + memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/swap.h b/mm/swap.h index 73c332ee4d91..c0dc73e10e91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated); + bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 85d9e5806a6a..6c84236382f3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; @@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* + * Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path. + */ + if (skip_if_exists) + goto fail_put_swap; + /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE @@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol = get_vma_policy(vma, addr, 0, &ilx); page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) @@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated); + gfp_mask, mpol, ilx, &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; @@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_unmap(pte); pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; diff --git a/mm/zswap.c b/mm/zswap.c index 699c6ee11222..213626e0f659 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" @@ -174,8 +175,8 @@ struct zswap_pool { struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_head lru; - spinlock_t lru_lock; + struct list_lru list_lru; + struct mem_cgroup *next_shrink; }; /* @@ -291,15 +292,46 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; - entry = kmem_cache_alloc(zswap_entry_cache, gfp); + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; @@ -312,6 +344,61 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* lru functions +**********************************/ +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. + * + * Similar reasoning holds for list_lru_del() and list_lru_putback(). + */ + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_putback(struct list_lru *list_lru, + struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + spinlock_t *lock = &list_lru->node[nid].lock; + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + spin_lock(lock); + /* we cannot use list_lru_add here, because it increments node's lru count */ + list_lru_putback(list_lru, &entry->lru, nid, memcg); + spin_unlock(lock); + rcu_read_unlock(); +} + /********************************* * rbtree functions **********************************/ @@ -396,9 +483,7 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - spin_lock(&entry->pool->lru_lock); - list_del(&entry->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } @@ -632,21 +717,15 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(tree, entry); } -static int zswap_reclaim_entry(struct zswap_pool *pool) +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) { - struct zswap_entry *entry; + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); struct zswap_tree *tree; pgoff_t swpoffset; - int ret; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; - /* Get an entry off the LRU */ - spin_lock(&pool->lru_lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lru_lock); - return -EINVAL; - } - entry = list_last_entry(&pool->lru, struct zswap_entry, lru); - list_del_init(&entry->lru); /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again @@ -654,28 +733,32 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; - spin_unlock(&pool->lru_lock); + list_lru_isolate(l, item); + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { - ret = -EAGAIN; + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; - } + /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); - ret = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); - if (ret) { - /* Writeback failed, put entry back on LRU */ - spin_lock(&pool->lru_lock); - list_move(&entry->lru, &pool->lru); - spin_unlock(&pool->lru_lock); + if (writeback_result) { + zswap_reject_reclaim_fail++; + zswap_lru_putback(&entry->pool->list_lru, entry); + ret = LRU_RETRY; goto put_unlock; } + zswap_written_back_pages++; /* * Writeback started successfully, the page now belongs to the @@ -689,24 +772,91 @@ put_unlock: zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); - return ret ? -EAGAIN : 0; + spin_lock(lock); + return ret; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + int nid, shrunk = 0; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. + */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + pool = zswap_pool_current_get(); + if (!pool) + return -EINVAL; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + zswap_pool_put(pool); + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); + struct mem_cgroup *memcg; int ret, failures = 0; + /* global reclaim will select cgroup in a round-robin fashion. */ do { - ret = zswap_reclaim_entry(pool); - if (ret) { - zswap_reject_reclaim_fail++; - if (ret != -EAGAIN) - break; + spin_lock(&zswap_pools_lock); + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + memcg = pool->next_shrink; + + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. + */ + if (!memcg) { + spin_unlock(&zswap_pools_lock); if (++failures == MAX_RECLAIM_RETRIES) break; + + goto resched; } + + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + spin_unlock(&zswap_pools_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + +resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); @@ -767,8 +917,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - INIT_LIST_HEAD(&pool->lru); - spin_lock_init(&pool->lru_lock); + list_lru_init_memcg(&pool->list_lru, NULL); INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -834,6 +983,13 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); @@ -1081,7 +1237,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* try to allocate swap cache page */ mpol = get_task_policy(current); page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated); + NO_INTERLEAVE_INDEX, &page_was_allocated, true); if (!page) { ret = -ENOMEM; goto fail; @@ -1147,7 +1303,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(page, &wbc); put_page(page); - zswap_written_back_pages++; return ret; @@ -1204,6 +1359,7 @@ bool zswap_store(struct folio *folio) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; @@ -1235,15 +1391,15 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); - - /* - * XXX: zswap reclaim does not work with cgroups yet. Without a - * cgroup-aware entry LRU, we will push out entries system-wide based on - * local cgroup limits. - */ objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) - goto reject; + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto reject; + } + mem_cgroup_put(memcg); + } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1260,7 +1416,7 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; @@ -1287,6 +1443,15 @@ bool zswap_store(struct folio *folio) if (!entry->pool) goto freepage; + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + mem_cgroup_put(memcg); + goto put_pool; + } + mem_cgroup_put(memcg); + } + /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1365,9 +1530,8 @@ insert_entry: zswap_invalidate_entry(tree, dupentry); } if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_add(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + INIT_LIST_HEAD(&entry->lru); + zswap_lru_add(&entry->pool->list_lru, entry); } spin_unlock(&tree->lock); @@ -1380,6 +1544,7 @@ insert_entry: put_dstmem: mutex_unlock(acomp_ctx->mutex); +put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1474,9 +1639,8 @@ freeentry: zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_move(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); From 7108cc3f765cafd48a6a35f8add140beaecfa75b Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:21 -0800 Subject: [PATCH 0703/1562] mm: memcg: add per-memcg zswap writeback stat Since zswap now writes back pages from memcg-specific LRUs, we now need a new stat to show writebacks count for each memcg. [nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB] Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com Suggested-by: Nhat Pham Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 1 + mm/memcontrol.c | 1 + mm/vmstat.c | 1 + mm/zswap.c | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d1b847502f09..747943bc8cc2 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce75e504fe8b..69b0ad455242 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -703,6 +703,7 @@ static const unsigned int memcg_vm_event_stat[] = { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, diff --git a/mm/vmstat.c b/mm/vmstat.c index afa5a38fcc9c..cfd8d8256f8e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1401,6 +1401,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_ZSWAP "zswpin", "zswpout", + "zswpwb", #endif #ifdef CONFIG_X86 "direct_map_level2_splits", diff --git a/mm/zswap.c b/mm/zswap.c index 213626e0f659..c329fca810c8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -760,6 +760,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o } zswap_written_back_pages++; + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already From a697dc2be925d4814f26d7588347ccdd2c5525ed Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:22 -0800 Subject: [PATCH 0704/1562] selftests: cgroup: update per-memcg zswap writeback selftest The memcg-zswap self test is updated to adjust to the behavior change implemented by commit 87730b165089 ("zswap: make shrinking memcg-aware"), where zswap performs writeback for specific memcg. Link: https://lkml.kernel.org/r/20231130194023.4102148-6-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Acked-by: Chris Li (Google) Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 76 ++++++++++++++------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index c99d2adaca3f..47fdaa146443 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,9 +50,9 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_zswap_written_back_pages(size_t *value) +static int get_cg_wb_count(const char *cg) { - return read_int("/sys/kernel/debug/zswap/written_back_pages", value); + return cg_read_key_long(cg, "memory.stat", "zswp_wb"); } static long get_zswpout(const char *cgroup) @@ -73,6 +73,24 @@ static int allocate_bytes(const char *cgroup, void *arg) return 0; } +static char *setup_test_group_1M(const char *root, const char *name) +{ + char *group_name = cg_name(root, name); + + if (!group_name) + return NULL; + if (cg_create(group_name)) + goto fail; + if (cg_write(group_name, "memory.max", "1M")) { + cg_destroy(group_name); + goto fail; + } + return group_name; +fail: + free(group_name); + return NULL; +} + /* * Sanity test to check that pages are written into zswap. */ @@ -117,43 +135,51 @@ out: /* * When trying to store a memcg page in zswap, if the memcg hits its memory - * limit in zswap, writeback should not be triggered. - * - * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may - * not zswap"). Needs to be revised when a per memcg writeback mechanism is - * implemented. + * limit in zswap, writeback should affect only the zswapped pages of that + * memcg. */ static int test_no_invasive_cgroup_shrink(const char *root) { - size_t written_back_before, written_back_after; int ret = KSFT_FAIL; - char *test_group; + size_t control_allocation_size = MB(10); + char *control_allocation, *wb_group = NULL, *control_group = NULL; /* Set up */ - test_group = cg_name(root, "no_shrink_test"); - if (!test_group) + wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); + if (!wb_group) + return KSFT_FAIL; + if (cg_write(wb_group, "memory.zswap.max", "10K")) goto out; - if (cg_create(test_group)) - goto out; - if (cg_write(test_group, "memory.max", "1M")) - goto out; - if (cg_write(test_group, "memory.zswap.max", "10K")) - goto out; - if (get_zswap_written_back_pages(&written_back_before)) + control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!control_group) goto out; - /* Allocate 10x memory.max to push memory into zswap */ - if (cg_run(test_group, allocate_bytes, (void *)MB(10))) + /* Push some test_group2 memory into zswap */ + if (cg_enter_current(control_group)) + goto out; + control_allocation = malloc(control_allocation_size); + for (int i = 0; i < control_allocation_size; i += 4095) + control_allocation[i] = 'a'; + if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) goto out; - /* Verify that no writeback happened because of the memcg allocation */ - if (get_zswap_written_back_pages(&written_back_after)) + /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ + if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) goto out; - if (written_back_after == written_back_before) + + /* Verify that only zswapped memory from gwb_group has been written back */ + if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) ret = KSFT_PASS; out: - cg_destroy(test_group); - free(test_group); + cg_enter_current(root); + if (control_group) { + cg_destroy(control_group); + free(control_group); + } + cg_destroy(wb_group); + free(wb_group); + if (control_allocation) + free(control_allocation); return ret; } From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:23 -0800 Subject: [PATCH 0705/1562] zswap: shrink zswap pool based on memory pressure Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap, that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improved the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. [nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing] Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 10 ++ include/linux/mmzone.h | 2 + include/linux/zswap.h | 25 +++- mm/Kconfig | 14 ++ mm/mmzone.c | 1 + mm/swap_state.c | 2 + mm/zswap.c | 192 ++++++++++++++++++++++++- 7 files changed, 240 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..62fc244ec702 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + +This can be enabled at the boot time if ``CONFIG_ZSWAP_SHRINKER_DEFAULT_ON`` is +selected. + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 14faffa4354f..9ef9d010bff0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..08c240e16a01 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if the are written + * to swap. + */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_page_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} - +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} +static inline void zswap_page_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 57cd378c73d6..ca87cdb72f11 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -61,6 +61,20 @@ config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON The cost is that if the page was never dirtied and needs to be swapped out again, it will be re-compressed. +config ZSWAP_SHRINKER_DEFAULT_ON + bool "Shrink the zswap pool on memory pressure" + depends on ZSWAP + default n + help + If selected, the zswap shrinker will be enabled, and the pages + stored in the zswap pool will become available for reclaim (i.e + written back to the backing swap device) on memory pressure. + + This means that zswap writeback could happen even if the pool is + not yet full, or the cgroup zswap limit has not been reached, + reducing the chance that cold pages will reside in the zswap pool + and consume memory indefinitely. + choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/mmzone.c b/mm/mmzone.c index b594d3f268fe..c01896eca736 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec) memset(lruvec, 0, sizeof(struct lruvec)); spin_lock_init(&lruvec->lru_lock); + zswap_lruvec_state_init(lruvec); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/swap_state.c b/mm/swap_state.c index 6c84236382f3..c597cec606e4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -687,6 +687,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } @@ -862,6 +863,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } diff --git a/mm/zswap.c b/mm/zswap.c index c329fca810c8..015425ed9003 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -148,6 +148,11 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 +/* Enable/disable memory pressure-based shrinker. */ +static bool zswap_shrinker_enabled = IS_ENABLED( + CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); +module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -177,6 +182,8 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; + struct shrinker *shrinker; + atomic_t nr_stored; }; /* @@ -275,17 +282,26 @@ static bool zswap_can_accept(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static u64 get_zswap_pool_size(struct zswap_pool *pool) +{ + u64 pool_size = 0; + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + pool_size += zpool_get_total_size(pool->zpools[i]); + + return pool_size; +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; - int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += get_zswap_pool_size(pool); rcu_read_unlock(); @@ -344,13 +360,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* zswap lruvec functions +**********************************/ +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_page_swapin(struct page *page) +{ + struct lruvec *lruvec; + + if (page) { + lruvec = folio_lruvec(page_folio(page)); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; + struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -368,6 +405,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -389,6 +439,7 @@ static void zswap_lru_putback(struct list_lru *list_lru, int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); @@ -396,6 +447,10 @@ static void zswap_lru_putback(struct list_lru *list_lru, /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); + /* increment the protection area to account for the LRU rotation. */ + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } @@ -485,6 +540,7 @@ static void zswap_free_entry(struct zswap_entry *entry) else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -526,6 +582,109 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, return entry; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg); + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + struct zswap_pool *pool = shrinker->private_data; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&pool->list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). + */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct zswap_pool *pool = shrinker->private_data; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled) + return 0; + +#ifdef CONFIG_MEMCG_KMEM + mem_cgroup_flush_stats(); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +#else + /* use pool stats instead of memcg stats */ + nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; + nr_stored = atomic_read(&pool->nr_stored); +#endif + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static void zswap_alloc_shrinker(struct zswap_pool *pool) +{ + pool->shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!pool->shrinker) + return; + + pool->shrinker->private_data = pool; + pool->shrinker->scan_objects = zswap_shrinker_scan; + pool->shrinker->count_objects = zswap_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->seeks = DEFAULT_SEEKS; +} + /********************************* * per-cpu code **********************************/ @@ -721,6 +880,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; @@ -756,6 +916,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_SKIP; + *encountered_page_in_swapcache = true; + } + goto put_unlock; } zswap_written_back_pages++; @@ -914,6 +1085,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) &pool->node); if (ret) goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the @@ -921,13 +1097,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - list_lru_init_memcg(&pool->list_lru, NULL); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -985,6 +1167,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); + shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); @@ -1536,6 +1719,7 @@ insert_entry: if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); + atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); From 4b86316ef18231109e4ebb3661ffc69d816fb56f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 29 Nov 2023 15:11:40 -0700 Subject: [PATCH 0706/1562] selftests/mm: dont run ksm_functional_tests twice ksm functional test is already being run. Remove the duplicate call to ./ksm_functional_tests. Link: https://lkml.kernel.org/r/20231129221140.614713-1-npache@redhat.com Fixes: 93fb70aa5904 ("selftests/vm: add KSM unmerge tests") Signed-off-by: Nico Pache Acked-by: Joel Savitz Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 00757445278e..c0212258b852 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -334,8 +334,6 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 CATEGORY="ksm" run_test ./ksm_functional_tests -run_test ./ksm_functional_tests - # protection_keys tests if [ -x ./protection_keys_32 ] then From 9294a037c01564786abb15436529fae3863268a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:44 +0000 Subject: [PATCH 0707/1562] mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning Patch series "mm/damon: let users feed and tame/auto-tune DAMOS". Introduce Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning. It makes DAMOS self-tuned with periodic simple user feedback. Background: DAMOS Control Difficulty ==================================== DAMOS helps users easily implement access pattern aware system operations. However, controlling DAMOS in the wild is not that easy. The basic way for DAMOS control is specifying the target access pattern. In this approach, the user is assumed to well understand the access pattern and the characteristics of the system and the workloads. Though there are useful tools for that, it takes time and effort depending on the complexity and the dynamicity of the system and the workloads. After all, the access pattern consists of three ranges, namely the size, the access rate, and the age of the regions. It means users need to tune six parameters, which is anyway not a simple task. One of the worst cases would be DAMOS being too aggressive like a berserker, and therefore consuming too much system resource and making unwanted radical system operations. To let users avoid such cases, DAMOS allows users to set the upper-limit of the schemes' aggressiveness, namely DAMOS quota. DAMOS further provides its best-effort under the limit by prioritizing regions based on the access pattern of the regions. For example, users can ask DAMOS to page out up to 100 MiB of memory regions per second. Then DAMOS pages out regions that are not accessed for a longer time (colder) first under the limit. This allows users to set the target access pattern a bit naive with wider ranges, and focus on tuning only one parameter, the quota. In other words, the number of parameters to tune can be reduced from six to one. Still, however, the optimum value for the quota depends on the system and the workloads' characteristics, so not that simple. The number of parameters to tune can also increase again if the user needs to run multiple schemes. Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning ============================================================= Users would use DAMOS since they want to achieve something with it. They will likely have measurable metrics representing the achievement and the target number of the metric like SLO, and continuously measure that anyway. While the additional cost of getting the information is nearly zero, it could be useful for DAMOS to understand how appropriate its current aggressiveness is set, and adjust it on its own to make the metric value more close to the target. Based on this idea, we introduce a new way of tuning DAMOS with nearly zero additional effort, namely Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning. It asks users to provide feedback representing how well DAMOS is doing relative to the users' aim. Then DAMOS adjusts its aggressiveness, specifically the quota that provides the best effort result under the limit, based on the current level of the aggressiveness and the users' feedback. Implementation ============== The implementation asks users to represent the feedback with score numbers. The scores could be anything including user-space specific metrics including latency and throughput of special user-space workloads, and system metrics including free memory ratio, memory pressure stall time (PSI), and active to inactive LRU lists size ratio. The feedback scores and the aggressiveness of the given DAMOS scheme are assumed to be positively proportional, though. Selecting metrics of the assumption is the users' responsibility. The core logic uses the below simple feedback loop algorithm to calculate the next aggressiveness level of the scheme from the current aggressiveness level and the current feedback (target_score and current_score). It calculates the compensation for next aggressiveness as a proportion of current aggressiveness and distance to the target score. As a result, it arrives at the near-goal state in a short time using big steps when it's far from the goal, but avoids making unnecessarily radical changes that could turn out to be a bad decision using small steps when its near to the goal. f(n) = max(1, f(n - 1) * ((target_score - current_score) / target_score + 1)) Note that the compensation value becomes negative when it's over achieving the goal. That's why the feedback metric and the aggressiveness of the scheme should be positively proportional. The distance-adaptive speed manipulation is simply applied. Example Use Cases ================= If users want to reduce the memory footprint of the system as much as possible as long as the time spent for handling the resulting memory pressure is within a threshold, they could use DAMOS scheme that reclaims cold memory regions aiming for a little level of memory pressure stall time. If users want the active/inactive LRU lists well balanced to reduce the performance impact due to possible future memory pressure, they could use two schemes. The first one would be set to locate hot pages in the active LRU list, aiming for a specific active-to-inactive LRU list size ratio, say, 70%. The second one would be to locate cold pages in the inactive LRU list, aiming for a specific inactive-to-active LRU list size ratio, say, 30%. Then, DAMOS will balance the two schemes based on the goal and feedback. This aim-oriented auto tuning could also be useful for general balancing-required access aware system operations such as system memory auto scaling[3] and tiered memory management[4]. These two example usages are not what current DAMOS implementation is already supporting, but require additional DAMOS action developments, though. Evaluation: subtle memory pressure aiming proactive reclamation =============================================================== To show if the implementation works as expected, we prepare four different system configurations on AWS i3.metal instances. The first setup (original) runs the workload without any DAMOS scheme. The second setup (not-tuned) runs the workload with a virtual address space-based proactive reclamation scheme that pages out memory regions that are not accessed for five seconds or more. The third setup (offline-tuned) runs the same proactive reclamation DAMOS scheme, but after making it tuned for each workload offline, using our previous user-space driven automatic tuning approach, namely DAMOOS[1]. The fourth and final setup (AFDAA) runs the scheme that is the same as that of 'not-tuned' setup, but aims to keep 0.5% of 'some' memory pressure stall time (PSI) for the last 10 seconds using the aiming-oriented auto tuning. For each setup, we run realistic workloads from PARSEC3 and SPLASH-2X benchmark suites. For each run, we measure RSS and runtime of the workload, and 'some' memory pressure stall time (PSI) of the system. We repeat the runs five times and use averaged measurements. For simple comparison of the results, we normalize the measurements to those of 'original'. In the case of the PSI, though, the measurement for 'original' was zero, so we normalize the value to that of 'not-tuned' scheme's result. The normalized results are shown below. Not-tuned Offline-tuned AFDAA RSS 0.622688178226118 0.787950678944904 0.740093483278979 runtime 1.11767826657912 1.0564674983585 1.0910833880499 PSI 1 0.727521443794069 0.308498846350299 The 'not-tuned' scheme achieves about 38.7% memory saving but incur about 11.7% runtime slowdown. The 'offline-tuned' scheme achieves about 22.2% memory saving with about 5.5% runtime slowdown. It also achieves about 28.2% memory pressure stall time saving. AFDAA achieves about 26% memory saving with about 9.1% runtime slowdown. It also achieves about 69.1% memory pressure stall time saving. We repeat this test multiple times, and get consistent results. AFDAA is now integrated in our daily DAMON performance test setup. Apparently the aggressiveness of 'AFDAA' setup is somewhere between those of 'not-tuned' and 'offline-tuned' setup, since its memory saving and runtime overhead are between those of the other two setups. Actually we set the memory pressure stall time goal aiming for this middle aggressiveness. The difference in the two metrics are not significant, though. However, it shows significant saving of the memory pressure stall time, which was the goal of the auto-tuning, over the two variants. Hence, we conclude the automatic tuning is working as expected. Please note that the AFDAA setup is only for the evaluation, and therefore intentionally set a bit aggressive. It might not be appropriate for production environments. The test code is also available[2], so you could reproduce it on your system and workloads. Patches Sequence ================ The first four patches implement the core logic and user interfaces for the auto tuning. The first patch implements the core logic for the auto tuning, and the API for DAMOS users in the kernel space. The second patch implements basic file operations of DAMON sysfs directories and files that will be used for setting the goals and providing the feedback. The third patch connects the quota goals files inputs to the DAMOS core logic. Finally the fourth patch implements a dedicated DAMOS sysfs command for efficiently committing the quota goals feedback. Two patches for simple tests of the logic and interfaces follow. The fifth patch implements the core logic unit test. The sixth patch implements a selftest for the DAMON Sysfs interface for the goals. Finally, three patches for documentation follows. The seventh patch documents the design of the feature. The eighth patch updates the API doc for the new sysfs files. The final eighth patch updates the usage document for the features. References ========== [1] DAOS paper: https://www.amazon.science/publications/daos-data-access-aware-operating-system [2] Evaluation code: https://github.com/damonitor/damon-tests/commit/3f884e61193f0166b8724554b6d06b0c449a712d [3] Memory auto scaling RFC idea: https://lore.kernel.org/damon/20231112195114.61474-1-sj@kernel.org/ [4] DAMON-based tiered memory management RFC idea: https://lore.kernel.org/damon/20231112195602.61525-1-sj@kernel.org/ This patch (of 9) Users can effectively control the upper-limit aggressiveness of DAMOS schemes using the quota feature. The quota provides best result under the limit by prioritizing regions based on the access pattern. That said, finding the best value, which could depend on dynamic characteristics of the system and the workloads, is still challenging. Implement a simple feedback-driven tuning mechanism and use it for automatic tuning of DAMOS quota. The implementation allows users to provide the feedback by setting a feedback score returning callback function. Then DAMOS periodically calls the function back and adjusts the quota based on the return value of the callback and current quota value. Note that the absolute-value based time/size quotas still work as the maximum hard limits of the scheme's aggressiveness. The feedback-driven auto-tuned quota is applied only if it is not exceeding the manually set maximum limits. Same for the scheme-target access pattern and filters like other features. [sj@kernel.org: document get_score_arg field of struct damos_quota] Link: https://lkml.kernel.org/r/20231204170106.60992-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 20 +++++++++++++ mm/damon/core.c | 68 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ab2f17d9926b..aa34ab433bc5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -136,6 +136,9 @@ enum damos_action { * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -153,6 +156,17 @@ enum damos_action { * You could customize the prioritization logic by setting &weight_sz, * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. + * + * If @get_score function pointer is set, DAMON calls it back with + * @get_score_arg and get the return value of it for every @reset_interval. + * Then, DAMON adjusts the effective quota using the return value as a feedback + * score to the current quota, using its internal feedback loop algorithm. + * + * The feedback loop algorithem assumes the quota input and the feedback score + * output are in a positive proportional relationship, and the goal of the + * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are + * set together, those work as a hard limit quota. If neither @ms nor @sz are + * set, the mechanism starts from the quota of one byte. */ struct damos_quota { unsigned long ms; @@ -163,6 +177,9 @@ struct damos_quota { unsigned int weight_nr_accesses; unsigned int weight_age; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -179,6 +196,9 @@ struct damos_quota { /* For prioritization */ unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; + + /* For feedback loop */ + unsigned long esz_bp; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index ce1562783e7e..f91715a58dc7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1038,26 +1038,76 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } } -/* Shouldn't be called if quota->ms and quota->sz are zero */ +/* + * damon_feed_loop_next_input() - get next input to achieve a target score. + * @last_input The last input. + * @score Current score that made with @last_input. + * + * Calculate next input to achieve the target score, based on the last input + * and current score. Assuming the input and the score are positively + * proportional, calculate how much compensation should be added to or + * subtracted from the last input as a proportion of the last input. Avoid + * next input always being zero by setting it non-zero always. In short form + * (assuming support of float and signed calculations), the algorithm is as + * below. + * + * next_input = max(last_input * ((goal - current) / goal + 1), 1) + * + * For simple implementation, we assume the target score is always 10,000. The + * caller should adjust @score for this. + * + * Returns next input that assumed to achieve the target score. + */ +static unsigned long damon_feed_loop_next_input(unsigned long last_input, + unsigned long score) +{ + const unsigned long goal = 10000; + unsigned long score_goal_diff = max(goal, score) - min(goal, score); + unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal; + unsigned long compensation = last_input * score_goal_diff_bp / 10000; + /* Set minimum input as 10000 to avoid compensation be zero */ + const unsigned long min_input = 10000; + + if (goal > score) + return last_input + compensation; + if (last_input > compensation + min_input) + return last_input - compensation; + return min_input; +} + +/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms) { + if (!quota->ms && !quota->get_score) { quota->esz = quota->sz; return; } - if (quota->total_charged_ns) - throughput = quota->total_charged_sz * 1000000 / - quota->total_charged_ns; - else - throughput = PAGE_SIZE * 1024; - esz = throughput * quota->ms; + if (quota->get_score) { + quota->esz_bp = damon_feed_loop_next_input( + max(quota->esz_bp, 10000UL), + quota->get_score(quota->get_score_arg)); + esz = quota->esz_bp / 10000; + } + + if (quota->ms) { + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + if (quota->get_score) + esz = min(throughput * quota->ms, esz); + else + esz = throughput * quota->ms; + } if (quota->sz && quota->sz < esz) esz = quota->sz; + quota->esz = esz; } @@ -1069,7 +1119,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz) + if (!quota->ms && !quota->sz && !quota->get_score) return; /* New charge window starts */ From 7f262da0a30df32a960f90ab6c11b08a7233ea34 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:45 +0000 Subject: [PATCH 0708/1562] mm/damon/sysfs-schemes: implement files for scheme quota goals setup Implement DAMON sysfs directories and files for the goals of DAMOS quota. Those allow users set multiple goals for their aim, with target values. Users can further enter the current score value for each goal as feedback for DAMOS. Note that this commit is implementing only the basic file operations, and not connecting the files with the DAMOS core logic. Hence writing something to the files makes no real effect. The following commit will connect the file operations and the core logic. Link: https://lkml.kernel.org/r/20231130023652.50284-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 224 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 221 insertions(+), 3 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index fe0fe2562000..e5531dbd4cf1 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -820,6 +820,203 @@ static const struct kobj_type damon_sysfs_watermarks_ktype = { .default_groups = damon_sysfs_watermarks_groups, }; +/* + * quota goal directory + */ + +struct damos_sysfs_quota_goal { + struct kobject kobj; + unsigned long target_value; + unsigned long current_value; +}; + +static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL); +} + +static ssize_t target_value_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->target_value); +} + +static ssize_t target_value_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + int err = kstrtoul(buf, 0, &goal->target_value); + + return err ? err : count; +} + +static ssize_t current_value_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->current_value); +} + +static ssize_t current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + int err = kstrtoul(buf, 0, &goal->current_value); + + /* feed callback should check existence of this file and read value */ + return err ? err : count; +} + +static void damos_sysfs_quota_goal_release(struct kobject *kobj) +{ + /* or, notify this release to the feed callback */ + kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj)); +} + +static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = + __ATTR_RW_MODE(target_value, 0600); + +static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = + __ATTR_RW_MODE(current_value, 0600); + +static struct attribute *damos_sysfs_quota_goal_attrs[] = { + &damos_sysfs_quota_goal_target_value_attr.attr, + &damos_sysfs_quota_goal_current_value_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); + +static const struct kobj_type damos_sysfs_quota_goal_ktype = { + .release = damos_sysfs_quota_goal_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_quota_goal_groups, +}; + +/* + * quota goals directory + */ + +struct damos_sysfs_quota_goals { + struct kobject kobj; + struct damos_sysfs_quota_goal **goals_arr; /* counted by nr */ + int nr; +}; + +static struct damos_sysfs_quota_goals *damos_sysfs_quota_goals_alloc(void) +{ + return kzalloc(sizeof(struct damos_sysfs_quota_goals), GFP_KERNEL); +} + +static void damos_sysfs_quota_goals_rm_dirs( + struct damos_sysfs_quota_goals *goals) +{ + struct damos_sysfs_quota_goal **goals_arr = goals->goals_arr; + int i; + + for (i = 0; i < goals->nr; i++) + kobject_put(&goals_arr[i]->kobj); + goals->nr = 0; + kfree(goals_arr); + goals->goals_arr = NULL; +} + +static int damos_sysfs_quota_goals_add_dirs( + struct damos_sysfs_quota_goals *goals, int nr_goals) +{ + struct damos_sysfs_quota_goal **goals_arr, *goal; + int err, i; + + damos_sysfs_quota_goals_rm_dirs(goals); + if (!nr_goals) + return 0; + + goals_arr = kmalloc_array(nr_goals, sizeof(*goals_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!goals_arr) + return -ENOMEM; + goals->goals_arr = goals_arr; + + for (i = 0; i < nr_goals; i++) { + goal = damos_sysfs_quota_goal_alloc(); + if (!goal) { + damos_sysfs_quota_goals_rm_dirs(goals); + return -ENOMEM; + } + + err = kobject_init_and_add(&goal->kobj, + &damos_sysfs_quota_goal_ktype, &goals->kobj, + "%d", i); + if (err) { + kobject_put(&goal->kobj); + damos_sysfs_quota_goals_rm_dirs(goals); + return err; + } + + goals_arr[i] = goal; + goals->nr++; + } + return 0; +} + +static ssize_t nr_goals_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goals *goals = container_of(kobj, + struct damos_sysfs_quota_goals, kobj); + + return sysfs_emit(buf, "%d\n", goals->nr); +} + +static ssize_t nr_goals_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goals *goals; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + goals = container_of(kobj, struct damos_sysfs_quota_goals, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damos_sysfs_quota_goals_add_dirs(goals, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damos_sysfs_quota_goals_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damos_sysfs_quota_goals, kobj)); +} + +static struct kobj_attribute damos_sysfs_quota_goals_nr_attr = + __ATTR_RW_MODE(nr_goals, 0600); + +static struct attribute *damos_sysfs_quota_goals_attrs[] = { + &damos_sysfs_quota_goals_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_quota_goals); + +static const struct kobj_type damos_sysfs_quota_goals_ktype = { + .release = damos_sysfs_quota_goals_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_quota_goals_groups, +}; + /* * scheme/weights directory */ @@ -938,6 +1135,7 @@ static const struct kobj_type damon_sysfs_weights_ktype = { struct damon_sysfs_quotas { struct kobject kobj; struct damon_sysfs_weights *weights; + struct damos_sysfs_quota_goals *goals; unsigned long ms; unsigned long sz; unsigned long reset_interval_ms; @@ -951,6 +1149,7 @@ static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) { struct damon_sysfs_weights *weights; + struct damos_sysfs_quota_goals *goals; int err; weights = damon_sysfs_weights_alloc(0, 0, 0); @@ -959,16 +1158,35 @@ static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, "as->kobj, "weights"); - if (err) + if (err) { kobject_put(&weights->kobj); - else - quotas->weights = weights; + return err; + } + quotas->weights = weights; + + goals = damos_sysfs_quota_goals_alloc(); + if (!goals) { + kobject_put(&weights->kobj); + return -ENOMEM; + } + err = kobject_init_and_add(&goals->kobj, + &damos_sysfs_quota_goals_ktype, "as->kobj, + "goals"); + if (err) { + kobject_put(&weights->kobj); + kobject_put(&goals->kobj); + } else { + quotas->goals = goals; + } + return err; } static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) { kobject_put("as->weights->kobj); + damos_sysfs_quota_goals_rm_dirs(quotas->goals); + kobject_put("as->goals->kobj); } static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, From 8b549a4fd3c5fd721e268564596095398b0469bc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:46 +0000 Subject: [PATCH 0709/1562] mm/damon/sysfs-schemes: commit damos quota goals user input to DAMOS Make DAMON sysfs interface to read the user inputs for DAMOS quota goals and pass those to DAMOS, so that the users can use the quota auto-tuning feature. It uses the DAMON sysfs interface's user input commit mechanism, which applies all user inputs for initial starting of DAMON and online input updates, which can be done by writing 'on' and 'commit' to the kdamond's 'state' file, respectively. In other words, the user should periodically write appropriate value to 'current_value' files and 'commit' command to the 'state' file. 'target_value' files could also be similarly updated at any time. Note that the interface is supporting multiple goals while the core logic supports only one goal. DAMON sysfs interface passes only best feedback among the given inputs, to avoid making DAMOS too aggressive. Link: https://lkml.kernel.org/r/20231130023652.50284-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index e5531dbd4cf1..a7917534ca19 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1868,6 +1868,34 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return 0; } +static unsigned long damos_sysfs_get_quota_score(void *arg) +{ + return (unsigned long)arg; +} + +static void damos_sysfs_set_quota_score( + struct damos_sysfs_quota_goals *sysfs_goals, + struct damos_quota *quota) +{ + struct damos_sysfs_quota_goal *sysfs_goal; + int i; + + quota->get_score = NULL; + quota->get_score_arg = (void *)0; + for (i = 0; i < sysfs_goals->nr; i++) { + sysfs_goal = sysfs_goals->goals_arr[i]; + if (!sysfs_goal->target_value) + continue; + + /* Higher score makes scheme less aggressive */ + quota->get_score_arg = (void *)max( + (unsigned long)quota->get_score_arg, + sysfs_goal->current_value * 10000 / + sysfs_goal->target_value); + quota->get_score = damos_sysfs_get_quota_score; + } +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -1905,6 +1933,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; + damos_sysfs_set_quota_score(sysfs_quotas->goals, "a); + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, sysfs_scheme->apply_interval_us, "a, &wmarks); if (!scheme) @@ -1945,6 +1975,8 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; scheme->quota.weight_age = sysfs_weights->age; + damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + scheme->wmarks.metric = sysfs_wmarks->metric; scheme->wmarks.interval = sysfs_wmarks->interval_us; scheme->wmarks.high = sysfs_wmarks->high; From d91beaa505a0789c78adcf3ab021f5ed79794697 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:47 +0000 Subject: [PATCH 0710/1562] mm/damon/sysfs-schemes: implement a command for scheme quota goals only commit To update DAMOS quota goals, users need to enter 'commit' command to the 'state' file of the kdamond, which applies not only the goals but entire inputs. It is inefficient. Implement yet another 'state' file input command for reading and committing only the scheme quota goals, namely 'commit_schemes_quota_goals'. Link: https://lkml.kernel.org/r/20231130023652.50284-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 3 +++ mm/damon/sysfs-schemes.c | 16 ++++++++++++++++ mm/damon/sysfs.c | 27 +++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 5ff081226e28..4c37a166eb81 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -56,3 +56,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a7917534ca19..8dbaac6e5c2d 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1896,6 +1896,22 @@ static void damos_sysfs_set_quota_score( } } +void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int i = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + sysfs_scheme = sysfs_schemes->schemes_arr[i]; + damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, + &scheme->quota); + i++; + } +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7472404456aa..1f891e18b4ee 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -994,6 +994,11 @@ enum damon_sysfs_cmd { DAMON_SYSFS_CMD_OFF, /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */ DAMON_SYSFS_CMD_COMMIT, + /* + * @DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: Commit the quota goals + * to DAMON. + */ + DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs * files. @@ -1025,6 +1030,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "on", "off", "commit", + "commit_schemes_quota_goals", "update_schemes_stats", "update_schemes_tried_bytes", "update_schemes_tried_regions", @@ -1351,6 +1357,24 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) kdamond->contexts->contexts_arr[0]); } +static int damon_sysfs_commit_schemes_quota_goals( + struct damon_sysfs_kdamond *sysfs_kdamond) +{ + struct damon_ctx *ctx; + struct damon_sysfs_context *sysfs_ctx; + + if (!damon_sysfs_kdamond_running(sysfs_kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (sysfs_kdamond->contexts->nr != 1) + return -EINVAL; + + ctx = sysfs_kdamond->damon_ctx; + sysfs_ctx = sysfs_kdamond->contexts->contexts_arr[0]; + damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx); + return 0; +} + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. @@ -1379,6 +1403,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: + err = damon_sysfs_commit_schemes_quota_goals(kdamond); + break; case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: total_bytes_only = true; fallthrough; From f1762cb3eaea34add3655ecd0be9a77aca4e884c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:48 +0000 Subject: [PATCH 0711/1562] mm/damon/core-test: add a unit test for the feedback loop algorithm Implement a simple kunit test for testing the behavior of the feedback loop algorithm for the aim-oriented feedback-friven DAMOS aggressiveness auto tuning. Link: https://lkml.kernel.org/r/20231130023652.50284-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index e6a01ea2ec54..6e5e9502d648 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -446,6 +446,37 @@ static void damos_test_filter_out(struct kunit *test) damos_free_filter(f); } +static void damon_test_feed_loop_next_input(struct kunit *test) +{ + unsigned long last_input = 900000, current_score = 200; + + /* + * If current score is lower than the goal, which is always 10,000 + * (read the comment on damon_feed_loop_next_input()'s comment), next + * input should be higher than the last input. + */ + KUNIT_EXPECT_GT(test, + damon_feed_loop_next_input(last_input, current_score), + last_input); + + /* + * If current score is higher than the goal, next input should be lower + * than the last input. + */ + current_score = 250000000; + KUNIT_EXPECT_LT(test, + damon_feed_loop_next_input(last_input, current_score), + last_input); + + /* + * The next input depends on the distance between the current score and + * the goal + */ + KUNIT_EXPECT_GT(test, + damon_feed_loop_next_input(last_input, 200), + damon_feed_loop_next_input(last_input, 2000)); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -461,6 +492,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), + KUNIT_CASE(damon_test_feed_loop_next_input), {}, }; From 3649caed1c9b7aa57049620c498596c17fc7af9e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:49 +0000 Subject: [PATCH 0712/1562] selftests/damon: test quota goals directory Add DAMON selftests for testing creation/existence of quota goals directories and files, and simple valid input writes. Link: https://lkml.kernel.org/r/20231130023652.50284-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 56f0230a8b92..e9a976d296e2 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -150,6 +150,32 @@ test_weights() ensure_file "$weights_dir/age_permil" "exist" "600" } +test_goal() +{ + goal_dir=$1 + ensure_dir "$goal_dir" "exist" + ensure_file "$goal_dir/target_value" "exist" "600" + ensure_file "$goal_dir/current_value" "exist" "600" +} + +test_goals() +{ + goals_dir=$1 + ensure_dir "$goals_dir" "exist" + ensure_file "$goals_dir/nr_goals" "exist" "600" + + ensure_write_succ "$goals_dir/nr_goals" "1" "valid input" + test_goal "$goals_dir/0" + + ensure_write_succ "$goals_dir/nr_goals" "2" "valid input" + test_goal "$goals_dir/0" + test_goal "$goals_dir/1" + + ensure_write_succ "$goals_dir/nr_goals" "0" "valid input" + ensure_dir "$goals_dir/0" "not_exist" + ensure_dir "$goals_dir/1" "not_exist" +} + test_quotas() { quotas_dir=$1 @@ -158,6 +184,7 @@ test_quotas() ensure_file "$quotas_dir/bytes" "exist" 600 ensure_file "$quotas_dir/reset_interval_ms" "exist" 600 test_weights "$quotas_dir/weights" + test_goals "$quotas_dir/goals" } test_access_pattern() From 3143a7bfd2a9852d63a2d88109cf5af49fcf4950 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:50 +0000 Subject: [PATCH 0713/1562] Docs/mm/damon/design: document DAMOS quota auto tuning Document the DAMOS quota auto tuning feature on the design document. Link: https://lkml.kernel.org/r/20231130023652.50284-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 1f7e0586b5fa..947c9df6cd33 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -346,6 +346,17 @@ the weight will be respected are up to the underlying prioritization mechanism implementation. +Aim-oriented Feedback-driven Auto-tuning +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Automatic feedback-driven quota tuning. Instead of setting the absolute quota +value, users can repeatedly provide numbers representing how much of their goal +for the scheme is achieved as feedback. DAMOS then automatically tunes the +aggressiveness (the quota) of the corresponding scheme. For example, if DAMOS +is under achieving the goal, DAMOS automatically increases the quota. If DAMOS +is over achieving the goal, it decreases the quota. + + .. _damon_design_damos_watermarks: Watermarks From 0972913f9673b13f18c9d72199e25ae07cc1876b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:51 +0000 Subject: [PATCH 0714/1562] Docs/ABI/damon: document DAMOS quota goals Update DAMON ABI document for the newly added DAMON sysfs files and inputs for DAMOS quota goals. Link: https://lkml.kernel.org/r/20231130023652.50284-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index b35649a46a2f..bfa5b8288d8d 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -25,12 +25,14 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or stops, respectively. Reading the file returns the keywords based on the current status. Writing 'commit' to this file makes the kdamond reads the user inputs in the sysfs files - except 'state' again. Writing 'update_schemes_stats' to the - file updates contents of schemes stats files of the kdamond. - Writing 'update_schemes_tried_regions' to the file updates - contents of 'tried_regions' directory of every scheme directory - of this kdamond. Writing 'update_schemes_tried_bytes' to the - file updates only '.../tried_regions/total_bytes' files of this + except 'state' again. Writing 'commit_schemes_quota_goals' to + this file makes the kdamond reads the quota goal files again. + Writing 'update_schemes_stats' to the file updates contents of + schemes stats files of the kdamond. Writing + 'update_schemes_tried_regions' to the file updates contents of + 'tried_regions' directory of every scheme directory of this + kdamond. Writing 'update_schemes_tried_bytes' to the file + updates only '.../tried_regions/total_bytes' files of this kdamond. Writing 'clear_schemes_tried_regions' to the file removes contents of the 'tried_regions' directory. @@ -212,6 +214,25 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the quotas charge reset interval of the scheme in milliseconds. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals/nr_goals +Date: Nov 2023 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting automatic tuning of the scheme's + aggressiveness named '0' to 'N-1' under the goals/ directory. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//target_value +Date: Nov 2023 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the target + value of the goal metric. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//current_value +Date: Nov 2023 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the current + value of the goal metric. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil Date: Mar 2022 Contact: SeongJae Park From 6140edeea8bf30bf94c23b18c39448b43f528f46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:52 +0000 Subject: [PATCH 0715/1562] Docs/admin-guide/mm/damon/usage: document for quota goals Update DAMON sysfs usage for newly added DAMOS quota goals interface. Link: https://lkml.kernel.org/r/20231130023652.50284-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 48 +++++++++++++++++--- Documentation/mm/damon/design.rst | 2 + 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index da94feb97ed1..ff9f62e65722 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,6 +83,8 @@ comma (","). :: │ │ │ │ │ │ │ │ age/min,max │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil + │ │ │ │ │ │ │ │ goals/nr_goals + │ │ │ │ │ │ │ │ │ 0/target_value,current_value │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low │ │ │ │ │ │ │ filters/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,memcg_id @@ -113,6 +115,8 @@ details) exists. In the beginning, this directory has only one file, child directories named ``0`` to ``N-1``. Each directory represents each kdamond. +.. _sysfs_kdamond: + kdamonds// ------------- @@ -121,11 +125,18 @@ In each kdamond directory, two files (``state`` and ``pid``) and one directory Reading ``state`` returns ``on`` if the kdamond is currently running, or ``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be -in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the +in the state. + +Writing ``commit`` to the ``state`` file makes kdamond reads the user inputs in the sysfs files except ``state`` file again. Writing -``update_schemes_stats`` to ``state`` file updates the contents of stats files -for each DAMON-based operation scheme of the kdamond. For details of the -stats, please refer to :ref:`stats section `. +``commit_schemes_quota_goals`` to the ``state`` file makes kdamond reads the +DAMON-based operation schemes' :ref:`quota goals ` +of the kdamond. + +Writing ``update_schemes_stats`` to ``state`` file updates the +contents of stats files for each DAMON-based operation scheme of the kdamond. +For details of the stats, please refer to :ref:`stats section +`. Writing ``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based operation scheme action tried regions directory for each @@ -319,8 +330,7 @@ The directory for the :ref:`quotas ` of the given DAMON-based operation scheme. Under ``quotas`` directory, three files (``ms``, ``bytes``, -``reset_interval_ms``) and one directory (``weights``) having three files -(``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) in it exist. +``reset_interval_ms``) and two directores (``weights`` and ``goals``) exist. You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and ``reset interval`` in milliseconds by writing the values to the three files, @@ -330,11 +340,35 @@ apply the action to only up to ``bytes`` bytes of memory regions within the ``reset_interval_ms``. Setting both ``ms`` and ``bytes`` zero disables the quota limits. -You can also set the :ref:`prioritization weights +Under ``weights`` directory, three files (``sz_permil``, +``nr_accesses_permil``, and ``age_permil``) exist. +You can set the :ref:`prioritization weights ` for size, access frequency, and age in per-thousand unit by writing the values to the three files under the ``weights`` directory. +.. _sysfs_schemes_quota_goals: + +schemes//quotas/goals/ +------------------------- + +The directory for the :ref:`automatic quota tuning goals +` of the given DAMON-based operation +scheme. + +In the beginning, this directory has only one file, ``nr_goals``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each goal and current achievement. +Among the multiple feedback, the best one is used. + +Each goal directory contains two files, namely ``target_value`` and +``current_value``. Users can set and get any number to those files to set the +feedback. User space main workload's latency or throughput, system metrics +like free memory ratio or memory pressure stall time (PSI) could be example +metrics for the values. Note that users should write +``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond +directory ` to pass the feedback to DAMON. + schemes//watermarks/ ----------------------- diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 947c9df6cd33..8b4a49ac057d 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -346,6 +346,8 @@ the weight will be respected are up to the underlying prioritization mechanism implementation. +.. _damon_design_damos_quotas_auto_tuning: + Aim-oriented Feedback-driven Auto-tuning ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 1486fb50136f4799946f5ecfe050094574647153 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:28 +0800 Subject: [PATCH 0716/1562] mm: ksm: use more folio api in ksm_might_need_to_copy() Patch series "mm: cleanup and use more folio in page fault", v3. Rename page_copy_prealloc() to folio_prealloc(), which is used by more functions, also do more folio conversion in page fault. This patch (of 5): Since ksm only support normal page, no swapout/in for ksm large folio too, add large folio check in ksm_might_need_to_copy(), also convert page->index to folio->index as page->index is going away. Then convert ksm_might_need_to_copy() to use more folio api to save nine compound_head() calls, short 'address' to reduce max-line-length. Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c2dd786a30e1..4643d5244e77 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm) * but what if the vma was unmerged while the page was swapped out? */ struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); + struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); @@ -130,7 +130,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, } static inline struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { return page; } diff --git a/mm/ksm.c b/mm/ksm.c index 5d60d5385de6..b93389a3780e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2876,48 +2876,51 @@ void __ksm_exit(struct mm_struct *mm) } struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = page_folio(page); struct anon_vma *anon_vma = folio_anon_vma(folio); - struct page *new_page; + struct folio *new_folio; - if (PageKsm(page)) { - if (page_stable_node(page) && + if (folio_test_large(folio)) + return page; + + if (folio_test_ksm(folio)) { + if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (page->index == linear_page_index(vma, address) && + } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) return page; /* let do_swap_page report the error */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page && - mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { - put_page(new_page); - new_page = NULL; + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (new_folio && + mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(new_folio); + new_folio = NULL; } - if (new_page) { - if (copy_mc_user_highpage(new_page, page, address, vma)) { - put_page(new_page); + if (new_folio) { + if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + folio_put(new_folio); memory_failure_queue(page_to_pfn(page), 0); return ERR_PTR(-EHWPOISON); } - SetPageDirty(new_page); - __SetPageUptodate(new_page); - __SetPageLocked(new_page); + folio_set_dirty(new_folio); + __folio_mark_uptodate(new_folio); + __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } - return new_page; + return new_folio ? &new_folio->page : NULL; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) From f8b6187d8dd98fd32fe393071f362a7b6beaad0a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:29 +0800 Subject: [PATCH 0717/1562] mm: memory: use a folio in validate_page_before_insert() Use a folio in validate_page_before_insert() to save two compound_head() calls. Link: https://lkml.kernel.org/r/20231118023232.1409103-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sidhartha Kumar Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a8ff3489211b..5a917b21a122 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1842,9 +1842,12 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, static int validate_page_before_insert(struct page *page) { - if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + struct folio *folio = page_folio(page); + + if (folio_test_anon(folio) || folio_test_slab(folio) || + page_has_type(page)) return -EINVAL; - flush_dcache_page(page); + flush_dcache_folio(folio); return 0; } From 294de6d8f14a69f1251b94223ba9d90d64b28cec Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:30 +0800 Subject: [PATCH 0718/1562] mm: memory: rename page_copy_prealloc() to folio_prealloc() Let's rename page_copy_prealloc() to folio_prealloc(), which could be reused in more functons, as it maybe zero the new page, pass a new need_zero to it, and call the vma_alloc_zeroed_movable_folio() if need_zero is true. Link: https://lkml.kernel.org/r/20231118023232.1409103-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sidhartha Kumar Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5a917b21a122..32fb7e066197 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -992,12 +992,17 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } -static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *folio_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (need_zero) + new_folio = vma_alloc_zeroed_movable_folio(vma, addr); + else + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr, false); + if (!new_folio) return NULL; @@ -1129,7 +1134,7 @@ again: } else if (ret == -EBUSY) { goto out; } else if (ret == -EAGAIN) { - prealloc = page_copy_prealloc(src_mm, src_vma, addr); + prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret) { From e4621e70469c3ac6e1b6914f1c42941a8a6e44d2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:31 +0800 Subject: [PATCH 0719/1562] mm: memory: use a folio in do_cow_fault() Use folio_prealloc() helper and convert to use a folio in do_cow_fault(), which save five compound_head() calls. Link: https://lkml.kernel.org/r/20231118023232.1409103-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 32fb7e066197..3d92b4a7b6e4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4656,6 +4656,7 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); @@ -4664,16 +4665,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret) return ret; - vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); - if (!vmf->cow_page) + folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); + if (!folio) return VM_FAULT_OOM; - if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, - GFP_KERNEL)) { - put_page(vmf->cow_page); - return VM_FAULT_OOM; - } - folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL); + vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -4682,7 +4678,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) return ret; copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); - __SetPageUptodate(vmf->cow_page); + __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock_page(vmf->page); @@ -4691,7 +4687,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - put_page(vmf->cow_page); + folio_put(folio); return ret; } From cf503cc665c442ce9893cb12561c57a328465e29 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:32 +0800 Subject: [PATCH 0720/1562] mm: memory: use folio_prealloc() in wp_page_copy() Use folio_prealloc() helper to simplify code a bit. Link: https://lkml.kernel.org/r/20231118023232.1409103-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3d92b4a7b6e4..99582b188ed2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3117,6 +3117,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; + bool pfn_is_zero; delayacct_wpcopy_start(); @@ -3126,16 +3127,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (unlikely(ret)) goto out; - if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); - if (!new_folio) - goto oom; - } else { + pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); + new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); + if (!new_folio) + goto oom; + + if (!pfn_is_zero) { int err; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); - if (!new_folio) - goto oom; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { @@ -3156,10 +3154,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) - goto oom_free_new; - folio_throttle_swaprate(new_folio, GFP_KERNEL); - __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, @@ -3258,8 +3252,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; -oom_free_new: - folio_put(new_folio); oom: ret = VM_FAULT_OOM; out: From ec056cef76a525706601b32048f174f9bea72c7c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 1 Dec 2023 16:10:45 +0000 Subject: [PATCH 0721/1562] mm/readahead: do not allow order-1 folio The THP machinery does not support order-1 folios because it requires meta data spanning the first 3 `struct page`s. So order-2 is the smallest large folio that we can safely create. There was a theoretical bug whereby if ra->size was 2 or 3 pages (due to the device-specific bdi->ra_pages being set that way), we could end up with order = 1. Fix this by unconditionally checking if the preferred order is 1 and if so, set it to 0. Previously this was done in a few specific places, but with this refactoring it is done just once, unconditionally, at the end of the calculation. This is a theoretical bug found during review of the code; I have no evidence to suggest this manifests in the real world (I expect all device-specific ra_pages values are much bigger than 3). Link: https://lkml.kernel.org/r/20231201161045.3962614-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 6925e6959fd3..23620c57c122 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -511,16 +511,14 @@ void page_cache_ra_order(struct readahead_control *ractl, unsigned int order = new_order; /* Align with smaller pages if needed */ - if (index & ((1UL << order) - 1)) { + if (index & ((1UL << order) - 1)) order = __ffs(index); - if (order == 1) - order = 0; - } /* Don't allocate pages past EOF */ - while (index + (1UL << order) - 1 > limit) { - if (--order == 1) - order = 0; - } + while (index + (1UL << order) - 1 > limit) + order--; + /* THP machinery does not support order-1 */ + if (order == 1) + order = 0; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; From f67f8d4a8c1e1ebc85a6cbdb9a7266f14863461c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 1 Dec 2023 14:59:36 -0500 Subject: [PATCH 0722/1562] mm/rmap: fix misplaced parenthesis of a likely() Running my yearly branch profiler to see where likely/unlikely annotation may be added or removed, I discovered this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 457918 100 page_try_dup_anon_rmap rmap.h 264 [..] 458021 0 0 page_try_dup_anon_rmap rmap.h 265 I thought it was interesting that line 264 of rmap.h had a 100% incorrect annotation, but the line directly below it was 100% correct. Looking at the code: if (likely(!is_device_private_page(page) && unlikely(page_needs_cow_for_dma(vma, page)))) It didn't make sense. The "likely()" was around the entire if statement (not just the "!is_device_private_page(page)"), which also included the "unlikely()" portion of that if condition. If the unlikely portion is unlikely to be true, that would make the entire if condition unlikely to be true, so it made no sense at all to say the entire if condition is true. What is more likely to be likely is just the first part of the if statement before the && operation. It's likely to be a misplaced parenthesis. And after making the if condition broken into a likely() && unlikely(), both now appear to be correct! Link: https://lkml.kernel.org/r/20231201145936.5ddfdb50@gandalf.local.home Fixes:fb3d824d1a46c ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()") Signed-off-by: Steven Rostedt (Google) Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b26fe858fd44..3c2fc291b071 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,8 +261,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * guarantee the pinned page won't be randomly replaced in the * future on write faults. */ - if (likely(!is_device_private_page(page) && - unlikely(page_needs_cow_for_dma(vma, page)))) + if (likely(!is_device_private_page(page)) && + unlikely(page_needs_cow_for_dma(vma, page))) return -EBUSY; ClearPageAnonExclusive(page); From 39042079a0c241d09fa6fc3bb67c2ddf60011d0f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Dec 2023 19:08:29 +0000 Subject: [PATCH 0723/1562] kmemleak: avoid RCU stalls when freeing metadata for per-CPU pointers On systems with large number of CPUs, the following soft lockup splat might sometimes happen: [ 2656.001617] watchdog: BUG: soft lockup - CPU#364 stuck for 21s! [ksoftirqd/364:2206] : [ 2656.141194] RIP: 0010:_raw_spin_unlock_irqrestore+0x3d/0x70 : 2656.241214] Call Trace: [ 2656.243971] [ 2656.246237] ? show_trace_log_lvl+0x1c4/0x2df [ 2656.251152] ? show_trace_log_lvl+0x1c4/0x2df [ 2656.256066] ? kmemleak_free_percpu+0x11f/0x1f0 [ 2656.261173] ? watchdog_timer_fn+0x379/0x470 [ 2656.265984] ? __pfx_watchdog_timer_fn+0x10/0x10 [ 2656.271179] ? __hrtimer_run_queues+0x5f3/0xd00 [ 2656.276283] ? __pfx___hrtimer_run_queues+0x10/0x10 [ 2656.281783] ? ktime_get_update_offsets_now+0x95/0x2c0 [ 2656.287573] ? ktime_get_update_offsets_now+0xdd/0x2c0 [ 2656.293380] ? hrtimer_interrupt+0x2e9/0x780 [ 2656.298221] ? __sysvec_apic_timer_interrupt+0x184/0x640 [ 2656.304211] ? sysvec_apic_timer_interrupt+0x8e/0xc0 [ 2656.309807] [ 2656.312169] [ 2656.326110] kmemleak_free_percpu+0x11f/0x1f0 [ 2656.331015] free_percpu.part.0+0x1b/0xe70 [ 2656.335635] free_vfsmnt+0xb9/0x100 [ 2656.339567] rcu_do_batch+0x3c8/0xe30 [ 2656.363693] rcu_core+0x3de/0x5a0 [ 2656.367433] __do_softirq+0x2d0/0x9a8 [ 2656.381119] run_ksoftirqd+0x36/0x60 [ 2656.385145] smpboot_thread_fn+0x556/0x910 [ 2656.394971] kthread+0x2a4/0x350 [ 2656.402826] ret_from_fork+0x29/0x50 [ 2656.406861] The issue is caused by kmemleak registering each per_cpu_ptr() corresponding to the __percpu pointer. This is unnecessary since such individual per-CPU pointers are not tracked anyway. Create a new object_percpu_tree_root rbtree that stores a single __percpu pointer together with an OBJECT_PERCPU flag for the kmemleak metadata. Scanning needs to be done for all per_cpu_ptr() pointers with a cond_resched() between each CPU iteration to avoid RCU stalls. [catalin.marinas@arm.com: update comment] Link: https://lkml.kernel.org/r/20231206114414.2085824-1-catalin.marinas@arm.com Link: https://lore.kernel.org/r/20231127194153.289626-1-longman@redhat.comLink: https://lkml.kernel.org/r/20231201190829.825856-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reported-by: Waiman Long Closes: https://lore.kernel.org/r/20231127194153.289626-1-longman@redhat.com Reviewed-by: Waiman Long Signed-off-by: Andrew Morton --- mm/kmemleak.c | 178 +++++++++++++++++++++++++++----------------------- 1 file changed, 97 insertions(+), 81 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0fb4dcc3b06a..6a540c2b27c5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -14,17 +14,15 @@ * The following locks and mutexes are used by kmemleak: * * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as - * del_state modifications and accesses to the object_tree_root (or - * object_phys_tree_root). The object_list is the main list holding the - * metadata (struct kmemleak_object) for the allocated memory blocks. - * The object_tree_root and object_phys_tree_root are red - * black trees used to look-up metadata based on a pointer to the - * corresponding memory block. The object_phys_tree_root is for objects - * allocated with physical address. The kmemleak_object structures are - * added to the object_list and object_tree_root (or object_phys_tree_root) - * in the create_object() function called from the kmemleak_alloc() (or - * kmemleak_alloc_phys()) callback and removed in delete_object() called from - * the kmemleak_free() callback + * del_state modifications and accesses to the object trees + * (object_tree_root, object_phys_tree_root, object_percpu_tree_root). The + * object_list is the main list holding the metadata (struct + * kmemleak_object) for the allocated memory blocks. The object trees are + * red black trees used to look-up metadata based on a pointer to the + * corresponding memory block. The kmemleak_object structures are added to + * the object_list and the object tree root in the create_object() function + * called from the kmemleak_alloc{,_phys,_percpu}() callback and removed in + * delete_object() called from the kmemleak_free{,_phys,_percpu}() callback * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object. * Accesses to the metadata (e.g. count) are protected by this lock. Note * that some members of this structure may be protected by other means @@ -178,6 +176,8 @@ struct kmemleak_object { #define OBJECT_FULL_SCAN (1 << 3) /* flag set for object allocated with physical address */ #define OBJECT_PHYS (1 << 4) +/* flag set for per-CPU pointers */ +#define OBJECT_PERCPU (1 << 5) /* set when __remove_object() called */ #define DELSTATE_REMOVED (1 << 0) @@ -206,6 +206,8 @@ static LIST_HEAD(mem_pool_free_list); static struct rb_root object_tree_root = RB_ROOT; /* search tree for object (with OBJECT_PHYS flag) boundaries */ static struct rb_root object_phys_tree_root = RB_ROOT; +/* search tree for object (with OBJECT_PERCPU flag) boundaries */ +static struct rb_root object_percpu_tree_root = RB_ROOT; /* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */ static DEFINE_RAW_SPINLOCK(kmemleak_lock); @@ -298,7 +300,7 @@ static void hex_dump_object(struct seq_file *seq, const u8 *ptr = (const u8 *)object->pointer; size_t len; - if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) + if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) return; /* limit the number of lines to HEX_MAX_LINES */ @@ -390,6 +392,15 @@ static void dump_object_info(struct kmemleak_object *object) stack_depot_print(object->trace_handle); } +static struct rb_root *object_tree(unsigned long objflags) +{ + if (objflags & OBJECT_PHYS) + return &object_phys_tree_root; + if (objflags & OBJECT_PERCPU) + return &object_percpu_tree_root; + return &object_tree_root; +} + /* * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the @@ -397,10 +408,9 @@ static void dump_object_info(struct kmemleak_object *object) * when calling this function. */ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias, - bool is_phys) + unsigned int objflags) { - struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node : - object_tree_root.rb_node; + struct rb_node *rb = object_tree(objflags)->rb_node; unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { @@ -429,7 +439,7 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias, /* Look-up a kmemleak object which allocated with virtual address. */ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - return __lookup_object(ptr, alias, false); + return __lookup_object(ptr, alias, 0); } /* @@ -542,14 +552,14 @@ static void put_object(struct kmemleak_object *object) * Look up an object in the object search tree and increase its use_count. */ static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias, - bool is_phys) + unsigned int objflags) { unsigned long flags; struct kmemleak_object *object; rcu_read_lock(); raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = __lookup_object(ptr, alias, is_phys); + object = __lookup_object(ptr, alias, objflags); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ @@ -563,19 +573,16 @@ static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alia /* Look up and get an object which allocated with virtual address. */ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { - return __find_and_get_object(ptr, alias, false); + return __find_and_get_object(ptr, alias, 0); } /* - * Remove an object from the object_tree_root (or object_phys_tree_root) - * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak - * is still enabled. + * Remove an object from its object tree and object_list. Must be called with + * the kmemleak_lock held _if_ kmemleak is still enabled. */ static void __remove_object(struct kmemleak_object *object) { - rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? - &object_phys_tree_root : - &object_tree_root); + rb_erase(&object->rb_node, object_tree(object->flags)); if (!(object->del_state & DELSTATE_NO_DELETE)) list_del_rcu(&object->object_list); object->del_state |= DELSTATE_REMOVED; @@ -583,11 +590,11 @@ static void __remove_object(struct kmemleak_object *object) static struct kmemleak_object *__find_and_remove_object(unsigned long ptr, int alias, - bool is_phys) + unsigned int objflags) { struct kmemleak_object *object; - object = __lookup_object(ptr, alias, is_phys); + object = __lookup_object(ptr, alias, objflags); if (object) __remove_object(object); @@ -595,19 +602,18 @@ static struct kmemleak_object *__find_and_remove_object(unsigned long ptr, } /* - * Look up an object in the object search tree and remove it from both - * object_tree_root (or object_phys_tree_root) and object_list. The - * returned object's use_count should be at least 1, as initially set - * by create_object(). + * Look up an object in the object search tree and remove it from both object + * tree root and object_list. The returned object's use_count should be at + * least 1, as initially set by create_object(). */ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias, - bool is_phys) + unsigned int objflags) { unsigned long flags; struct kmemleak_object *object; raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = __find_and_remove_object(ptr, alias, is_phys); + object = __find_and_remove_object(ptr, alias, objflags); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; @@ -678,7 +684,7 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) } static int __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) + size_t size, int min_count, unsigned int objflags) { struct kmemleak_object *parent; @@ -686,7 +692,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, unsigned long untagged_ptr; unsigned long untagged_objp; - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); + object->flags = OBJECT_ALLOCATED | objflags; object->pointer = ptr; object->size = kfence_ksize((void *)ptr) ?: size; object->min_count = min_count; @@ -697,12 +703,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, * Only update min_addr and max_addr with object * storing virtual address. */ - if (!is_phys) { + if (!(objflags & (OBJECT_PHYS | OBJECT_PERCPU))) { min_addr = min(min_addr, untagged_ptr); max_addr = max(max_addr, untagged_ptr + size); } - link = is_phys ? &object_phys_tree_root.rb_node : - &object_tree_root.rb_node; + link = &object_tree(objflags)->rb_node; rb_parent = NULL; while (*link) { rb_parent = *link; @@ -724,8 +729,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, } } rb_link_node(&object->rb_node, rb_parent, link); - rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : - &object_tree_root); + rb_insert_color(&object->rb_node, object_tree(objflags)); list_add_tail_rcu(&object->object_list, &object_list); return 0; @@ -733,11 +737,10 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, /* * Create the metadata (struct kmemleak_object) corresponding to an allocated - * memory block and add it to the object_list and object_tree_root (or - * object_phys_tree_root). + * memory block and add it to the object_list and object tree. */ static void __create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp, bool is_phys) + int min_count, gfp_t gfp, unsigned int objflags) { struct kmemleak_object *object; unsigned long flags; @@ -748,7 +751,7 @@ static void __create_object(unsigned long ptr, size_t size, return; raw_spin_lock_irqsave(&kmemleak_lock, flags); - ret = __link_object(object, ptr, size, min_count, is_phys); + ret = __link_object(object, ptr, size, min_count, objflags); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); if (ret) mem_pool_free(object); @@ -758,14 +761,21 @@ static void __create_object(unsigned long ptr, size_t size, static void create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { - __create_object(ptr, size, min_count, gfp, false); + __create_object(ptr, size, min_count, gfp, 0); } /* Create kmemleak object which allocated with physical address. */ static void create_object_phys(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { - __create_object(ptr, size, min_count, gfp, true); + __create_object(ptr, size, min_count, gfp, OBJECT_PHYS); +} + +/* Create kmemleak object corresponding to a per-CPU allocation. */ +static void create_object_percpu(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + __create_object(ptr, size, min_count, gfp, OBJECT_PERCPU); } /* @@ -792,11 +802,11 @@ static void __delete_object(struct kmemleak_object *object) * Look up the metadata (struct kmemleak_object) corresponding to ptr and * delete it. */ -static void delete_object_full(unsigned long ptr) +static void delete_object_full(unsigned long ptr, unsigned int objflags) { struct kmemleak_object *object; - object = find_and_remove_object(ptr, 0, false); + object = find_and_remove_object(ptr, 0, objflags); if (!object) { #ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", @@ -812,7 +822,8 @@ static void delete_object_full(unsigned long ptr) * delete it. If the memory block is partially freed, the function may create * additional metadata for the remaining parts of the block. */ -static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) +static void delete_object_part(unsigned long ptr, size_t size, + unsigned int objflags) { struct kmemleak_object *object, *object_l, *object_r; unsigned long start, end, flags; @@ -826,7 +837,7 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) goto out; raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = __find_and_remove_object(ptr, 1, is_phys); + object = __find_and_remove_object(ptr, 1, objflags); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", @@ -844,11 +855,11 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) end = object->pointer + object->size; if ((ptr > start) && !__link_object(object_l, start, ptr - start, - object->min_count, is_phys)) + object->min_count, objflags)) object_l = NULL; if ((ptr + size < end) && !__link_object(object_r, ptr + size, end - ptr - size, - object->min_count, is_phys)) + object->min_count, objflags)) object_r = NULL; unlock: @@ -879,11 +890,11 @@ static void paint_it(struct kmemleak_object *object, int color) raw_spin_unlock_irqrestore(&object->lock, flags); } -static void paint_ptr(unsigned long ptr, int color, bool is_phys) +static void paint_ptr(unsigned long ptr, int color, unsigned int objflags) { struct kmemleak_object *object; - object = __find_and_get_object(ptr, 0, is_phys); + object = __find_and_get_object(ptr, 0, objflags); if (!object) { kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", ptr, @@ -901,16 +912,16 @@ static void paint_ptr(unsigned long ptr, int color, bool is_phys) */ static void make_gray_object(unsigned long ptr) { - paint_ptr(ptr, KMEMLEAK_GREY, false); + paint_ptr(ptr, KMEMLEAK_GREY, 0); } /* * Mark the object as black-colored so that it is ignored from scans and * reporting. */ -static void make_black_object(unsigned long ptr, bool is_phys) +static void make_black_object(unsigned long ptr, unsigned int objflags) { - paint_ptr(ptr, KMEMLEAK_BLACK, is_phys); + paint_ptr(ptr, KMEMLEAK_BLACK, objflags); } /* @@ -1046,8 +1057,6 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { - unsigned int cpu; - pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); /* @@ -1055,9 +1064,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, * (min_count is set to 0). */ if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - for_each_possible_cpu(cpu) - create_object((unsigned long)per_cpu_ptr(ptr, cpu), - size, 0, gfp); + create_object_percpu((unsigned long)ptr, size, 0, gfp); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1098,7 +1105,7 @@ void __ref kmemleak_free(const void *ptr) pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) - delete_object_full((unsigned long)ptr); + delete_object_full((unsigned long)ptr, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -1116,7 +1123,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - delete_object_part((unsigned long)ptr, size, false); + delete_object_part((unsigned long)ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -1129,14 +1136,10 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part); */ void __ref kmemleak_free_percpu(const void __percpu *ptr) { - unsigned int cpu; - pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) - for_each_possible_cpu(cpu) - delete_object_full((unsigned long)per_cpu_ptr(ptr, - cpu)); + delete_object_full((unsigned long)ptr, OBJECT_PERCPU); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1206,7 +1209,7 @@ void __ref kmemleak_ignore(const void *ptr) pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - make_black_object((unsigned long)ptr, false); + make_black_object((unsigned long)ptr, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1280,7 +1283,7 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) - delete_object_part((unsigned long)phys, size, true); + delete_object_part((unsigned long)phys, size, OBJECT_PHYS); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1294,7 +1297,7 @@ void __ref kmemleak_ignore_phys(phys_addr_t phys) pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) - make_black_object((unsigned long)phys, true); + make_black_object((unsigned long)phys, OBJECT_PHYS); } EXPORT_SYMBOL(kmemleak_ignore_phys); @@ -1305,7 +1308,7 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) + if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) return false; kasan_disable_current(); @@ -1461,7 +1464,6 @@ static void scan_object(struct kmemleak_object *object) { struct kmemleak_scan_area *area; unsigned long flags; - void *obj_ptr; /* * Once the object->lock is acquired, the corresponding memory block @@ -1474,14 +1476,27 @@ static void scan_object(struct kmemleak_object *object) /* already freed object */ goto out; - obj_ptr = object->flags & OBJECT_PHYS ? - __va((phys_addr_t)object->pointer) : - (void *)object->pointer; + if (object->flags & OBJECT_PERCPU) { + unsigned int cpu; - if (hlist_empty(&object->area_list) || + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr((void __percpu *)object->pointer, cpu); + void *end = start + object->size; + + scan_block(start, end, object); + + raw_spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + raw_spin_lock_irqsave(&object->lock, flags); + if (!(object->flags & OBJECT_ALLOCATED)) + break; + } + } else if (hlist_empty(&object->area_list) || object->flags & OBJECT_FULL_SCAN) { - void *start = obj_ptr; - void *end = obj_ptr + object->size; + void *start = object->flags & OBJECT_PHYS ? + __va((phys_addr_t)object->pointer) : + (void *)object->pointer; + void *end = start + object->size; void *next; do { @@ -1496,11 +1511,12 @@ static void scan_object(struct kmemleak_object *object) cond_resched(); raw_spin_lock_irqsave(&object->lock, flags); } while (object->flags & OBJECT_ALLOCATED); - } else + } else { hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), object); + } out: raw_spin_unlock_irqrestore(&object->lock, flags); } From b75427691f4a1fd3625d1f4c016832b8c75c2326 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 10 Nov 2023 11:33:21 +0800 Subject: [PATCH 0724/1562] mm: huge_memory: use more folio api in __split_huge_page_tail() Use more folio APIs to save six compound_head() calls in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231110033324.2455523-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6eb55f97a3d2..c848ea97ab02 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2509,13 +2509,13 @@ static void __split_huge_page_tail(struct folio *folio, int tail, clear_compound_head(page_tail); /* Finally unfreeze refcount. Additional reference from page cache. */ - page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || - PageSwapCache(head))); + page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) || + folio_test_swapcache(folio))); - if (page_is_young(head)) - set_page_young(page_tail); - if (page_is_idle(head)) - set_page_idle(page_tail); + if (folio_test_young(folio)) + folio_set_young(new_folio); + if (folio_test_idle(folio)) + folio_set_idle(new_folio); folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); From 683ec99f12f4c386c23bed7f6a8ef44db5a4999a Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Tue, 5 Dec 2023 18:02:44 +0100 Subject: [PATCH 0725/1562] mm/thp: add CONFIG_TRANSPARENT_HUGEPAGE_NEVER option Currently enabling THP support (CONFIG_TRANSPARENT_HUGEPAGE) requires enabling either CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS or CONFIG_TRANSPARENT_HUGEPAGE_MADVISE, which both cause khugepaged starting by default at kernel bootup. Add the third choice CONFIG_TRANSPARENT_HUGEPAGE_NEVER, in line with the existing kernel command line setting transparent_hugepage=never, to disable THP by default (in particular, to prevent starting khugepaged by default) but still allow enabling it at runtime via sysfs. Rationale: khugepaged has its own non-negligible memory cost even if it is not used by any applications, since it bumps up vm.min_free_kbytes to its own required minimum in set_recommended_min_free_kbytes(). For example, on a machine with 4GB RAM, with 3 mm zones and pageblock_order == MAX_ORDER, starting khugepaged causes vm.min_free_kbytes increase from 8MB to 132MB. So if we use THP on machines with e.g. >=8GB of memory for better performance, but avoid using it on lower-memory machines to avoid its memory overhead, then for the same reason we also want to avoid even starting khugepaged on those <8GB machines. So with CONFIG_TRANSPARENT_HUGEPAGE_NEVER we can use the same kernel image on both >=8GB and <8GB machines, with THP support enabled but khugepaged not started by default. The userspace can then decide to enable THP via sysfs if needed, based on the total amount of memory. This could also be achieved with the existing transparent_hugepage=never setting in the kernel command line instead. But it seems cleaner to avoid tweaking the command line for such a basic setting. P.S. I see that CONFIG_TRANSPARENT_HUGEPAGE_NEVER was already proposed in the past [1] but without an explanation of the purpose. [1] https://lore.kernel.org/all/202211301651462590168@zte.com.cn/ Link: https://lkml.kernel.org/r/20231205170244.2746210-1-dmaluka@chromium.org Link: https://lore.kernel.org/all/20231204163254.2636289-1-dmaluka@chromium.org/ Signed-off-by: Dmytro Maluka Signed-off-by: Andrew Morton --- mm/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index ca87cdb72f11..8f8b02e9c136 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -873,6 +873,12 @@ choice madvise(MADV_HUGEPAGE) but it won't risk to increase the memory footprint of applications without a guaranteed benefit. + + config TRANSPARENT_HUGEPAGE_NEVER + bool "never" + help + Disable Transparent Hugepage by default. It can still be + enabled at runtime via sysfs. endchoice config THP_SWAP From a1748f85bec936d87cac8a9785fb2a38147fc998 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 5 Dec 2023 10:29:54 +0800 Subject: [PATCH 0726/1562] mm: filemap: remove unnecessary iitialization of ret The ret variable can be defined without assigning a value, as it is assigned before use. Link: https://lkml.kernel.org/r/20231205022954.101045-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 71f00539ac00..c0d7e1d7eea2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1623,7 +1623,7 @@ EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); - int ret = 0; + int ret; wait->folio = folio; wait->bit_nr = PG_locked; From 47e61d8874cca8070d4f9295819876c18b5207b2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 5 Dec 2023 11:05:30 +0800 Subject: [PATCH 0727/1562] mm: hugetlb_vmemmap: add check of CONFIG_MEMORY_HOTPLUG back The compiler will optimize the code as much as possible if we add the check of CONFIG_MEMORY_HOTPLUG back. Link: https://lkml.kernel.org/r/20231205030530.3802-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 54f388aa361f..2646a2798a0e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -119,7 +119,7 @@ static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, * +--+ | | * +------------------------+ */ - if (unlikely(!vmemmap_walk->nr_walked)) { + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); From 49b960de6b323c34a79e6e92219fac31b12268a3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 5 Dec 2023 11:08:53 +0800 Subject: [PATCH 0728/1562] mm: hugetlb_vmemmap: move mmap lock to vmemmap_remap_range() All the users of vmemmap_remap_range() will hold the mmap lock and release it once it returns, it is naturally to move the lock to vmemmap_remap_range() to simplify the code and the users. Link: https://lkml.kernel.org/r/20231205030853.3921-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2646a2798a0e..da177e49d956 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -163,8 +163,10 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, VM_BUG_ON(!PAGE_ALIGNED(start | end)); + mmap_read_lock(&init_mm); ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, NULL, walk); + mmap_read_unlock(&init_mm); if (ret) return ret; @@ -282,7 +284,6 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { - int ret; struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, @@ -291,11 +292,7 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_read_lock(&init_mm); - ret = vmemmap_remap_range(reuse, end, &walk); - mmap_read_unlock(&init_mm); - - return ret; + return vmemmap_remap_range(reuse, end, &walk); } /** @@ -358,7 +355,6 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; @@ -377,7 +373,6 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, vmemmap_remap_range(reuse, end, &walk); } - mmap_read_unlock(&init_mm); return ret; } @@ -434,11 +429,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; - mmap_read_lock(&init_mm); - vmemmap_remap_range(reuse, end, &walk); - mmap_read_unlock(&init_mm); - - return 0; + return vmemmap_remap_range(reuse, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); From 4196810a2542562c9a2ae4146f2908e7858d6af4 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 5 Dec 2023 10:17:51 +0800 Subject: [PATCH 0729/1562] mm: cma: remove unnecessary initialization of ret The ret variable can be defined without assigning a value, as it is assigned before use. Link: https://lkml.kernel.org/r/20231205021751.100459-1-zeming@nfschina.com Signed-off-by: Li zeming Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index 2b2494fd6b59..7c09c47e530b 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -244,7 +244,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; - int ret = 0; + int ret; /* * We can't use __pa(high_memory) directly, since high_memory From dd05f5ec1e468b7dd723e1571dbd41d6f30a201d Mon Sep 17 00:00:00 2001 From: Chen Haonan Date: Wed, 6 Dec 2023 18:36:27 +0800 Subject: [PATCH 0730/1562] mm: use vma_pages() for vma objects vma_pages() is more readable and also better at avoiding error codes, so use vma_pages() instead of direct operations on vma Link: https://lkml.kernel.org/r/tencent_151850CF327EB055BBC83298A929BD06CD0A@qq.com Signed-off-by: Chen Haonan Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 8450562744cf..222e63b2dea4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -616,7 +616,7 @@ folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; - unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) From d19b1a1797d8e73eebce7eced289e0c7c1b5de80 Mon Sep 17 00:00:00 2001 From: Barry Song <21cnbao@gmail.com> Date: Thu, 7 Dec 2023 00:00:54 +1300 Subject: [PATCH 0731/1562] mm: compaction: avoid fast_isolate_freepages blindly choose improper pageblock Testing shows fast_isolate_freepages can blindly choose an unsuitable pageblock from time to time particularly while the min mark is used from XXX path: if (!page) { cc->fast_search_fail++; if (scan_start) { /* * Use the highest PFN found above min. If one was * not found, be pessimistic for direct compaction * and use the min mark. */ if (highest >= min_pfn) { page = pfn_to_page(highest); cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { /* XXX */ page = pageblock_pfn_to_page(min_pfn, min(pageblock_end_pfn(min_pfn), zone_end_pfn(cc->zone)), cc->zone); cc->free_pfn = min_pfn; } } } } The reason is that no code is doing any check on the min_pfn min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); In contrast, slow path of isolate_freepages() is always skipping unsuitable pageblocks in a decent way. This issue doesn't happen quite often. When running 25 machines with 16GiB memory for one night, most of them can hit this unexpected code path. However the frequency isn't like many times per second. It might be one time in a couple of hours. Thus, it is very hard to measure the visible performance impact in my machines though the affection of choosing the unsuitable migration_target should be negative in theory. I feel it's still worth fixing this to at least make the code theoretically self-explanatory as it is quite odd an unsuitable migration_target can be still migration_target. Link: https://lkml.kernel.org/r/20231206110054.61617-1-v-songbaohua@oppo.com Signed-off-by: Barry Song Reported-by: Zhanyuan Hu Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 01ba298739dd..de15a2ef0af5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1611,6 +1611,9 @@ static void fast_isolate_freepages(struct compact_control *cc) min(pageblock_end_pfn(min_pfn), zone_end_pfn(cc->zone)), cc->zone); + if (page && !suitable_migration_target(cc, page)) + page = NULL; + cc->free_pfn = min_pfn; } } From d9d9bd979cced7d4a51b65224b1d7f396c8b4eea Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Thu, 9 Nov 2023 16:08:22 +0000 Subject: [PATCH 0732/1562] maple_tree: change return type of mas_split_final_node as void. mas_split_final_node() always returns true and its return value is never checked. Change return type to void. Link: https://lkml.kernel.org/r/20231109160821.16248-2-ppbuk5246@gmail.com Signed-off-by: Levi Yun Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 47f2a7a97385..dc1f45b1628d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3132,7 +3132,7 @@ done: * @mas: The maple state * @height: The height of the tree in case it's a new root. */ -static inline bool mas_split_final_node(struct maple_subtree_state *mast, +static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; @@ -3156,7 +3156,6 @@ static inline bool mas_split_final_node(struct maple_subtree_state *mast, mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; - return true; } /* From 8a3134a02538096e35dddc3c80676505c2edf57c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Dec 2023 14:50:45 -0800 Subject: [PATCH 0733/1562] ACPI: watchdog: fix kernel-doc warnings Fix kernel-doc warnings found when using "W=1". acpi_watchdog.c:85: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst acpi_watchdog.c:85: warning: missing initial short description on line: * Returns true if this system should prefer ACPI based watchdog instead of Signed-off-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_watchdog.c b/drivers/acpi/acpi_watchdog.c index ca28183f4d13..8e9e001da38f 100644 --- a/drivers/acpi/acpi_watchdog.c +++ b/drivers/acpi/acpi_watchdog.c @@ -81,7 +81,7 @@ static const struct acpi_table_wdat *acpi_watchdog_get_wdat(void) return wdat; } -/** +/* * Returns true if this system should prefer ACPI based watchdog instead of * the native one (which are typically the same hardware). */ From 9ecc3b38abebd18998af40f42029ba84f3f4311c Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 6 Dec 2023 18:43:16 +0800 Subject: [PATCH 0734/1562] ACPI: NUMA: Remove unnecessary check in acpi_parse_gi_affinity() The acpi_map_pxm_to_node() function will never return a node value that is greater than or equal to MAX_NUMNODES. Remove the unnecessary `node >= MAX_NUMNODES` check to keep the code consistent with other users of the acpi_map_pxm_to_node() function. Signed-off-by: Yuntao Wang Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 12f330b0eac0..9d2d0deb256e 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -430,7 +430,7 @@ acpi_parse_gi_affinity(union acpi_subtable_headers *header, return -EINVAL; node = acpi_map_pxm_to_node(gi_affinity->proximity_domain); - if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + if (node == NUMA_NO_NODE) { pr_err("SRAT: Too many proximity domains.\n"); return -EINVAL; } From ec0f96260737ae23dcad57463fdc32eba025b11d Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 6 Dec 2023 18:43:17 +0800 Subject: [PATCH 0735/1562] ACPI: NUMA: Optimize the check for the availability of node values The first_unset_node() function returns the first unused node in nodes_found_map. If all nodes are in use, the function returns MAX_NUMNODES. Use this return value to determine whether there are any available node values in nodes_found_map, eliminating the need to use nodes_weight() for this purpose. Signed-off-by: Yuntao Wang [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 9d2d0deb256e..d58e5ef424f2 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -67,9 +67,9 @@ int acpi_map_pxm_to_node(int pxm) node = pxm_to_node_map[pxm]; if (node == NUMA_NO_NODE) { - if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) - return NUMA_NO_NODE; node = first_unset_node(nodes_found_map); + if (node >= MAX_NUMNODES) + return NUMA_NO_NODE; __acpi_map_pxm_to_node(pxm, node); node_set(node, nodes_found_map); } From e3f577830ce216b0ca21d4750cbbd64cfc21efff Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 6 Dec 2023 18:43:18 +0800 Subject: [PATCH 0736/1562] ACPI: NUMA: Fix the logic of getting the fake_pxm value The for loop does not iterate over the last element of the node_to_pxm_map array. This could lead to a conflict between the final fake_pxm value and the existing pxm values. That is, the final fake_pxm value can not be guaranteed to be an unused pxm value. While at it, fix up white space in slit_valid(). Signed-off-by: Yuntao Wang [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index d58e5ef424f2..0214518fc582 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -183,7 +183,7 @@ static int __init slit_valid(struct acpi_table_slit *slit) int i, j; int d = slit->locality_count; for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { + for (j = 0; j < d; j++) { u8 val = slit->entry[d*i + j]; if (i == j) { if (val != LOCAL_DISTANCE) @@ -532,7 +532,7 @@ int __init acpi_numa_init(void) */ /* fake_pxm is the next unused PXM value after SRAT parsing */ - for (i = 0, fake_pxm = -1; i < MAX_NUMNODES - 1; i++) { + for (i = 0, fake_pxm = -1; i < MAX_NUMNODES; i++) { if (node_to_pxm_map[i] > fake_pxm) fake_pxm = node_to_pxm_map[i]; } From 3ebccf1d1ca74bbb78e6f8c38d1d172e468d91f8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 11 Dec 2023 13:14:29 +0200 Subject: [PATCH 0737/1562] ACPI: LPSS: Fix the fractional clock divider flags The conversion to CLK_FRAC_DIVIDER_POWER_OF_TWO_PS uses wrong flags in the parameters and hence miscalculates the values in the clock divider. Fix this by applying the flag to the proper parameter. Fixes: 82f53f9ee577 ("clk: fractional-divider: Introduce POWER_OF_TWO_PS flag") Reported-by: Alex Vinarskis Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 875de44961bf..d48407472dfb 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -461,8 +461,9 @@ static int register_device_clock(struct acpi_device *adev, if (!clk_name) return -ENOMEM; clk = clk_register_fractional_divider(NULL, clk_name, parent, + 0, prv_base, 1, 15, 16, 15, CLK_FRAC_DIVIDER_POWER_OF_TWO_PS, - prv_base, 1, 15, 16, 15, 0, NULL); + NULL); parent = clk_name; clk_name = kasprintf(GFP_KERNEL, "%s-update", devname); From b6515a88baf4628e93fcc39c2b81fc1740eb3c3f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Dec 2023 20:36:14 +0100 Subject: [PATCH 0738/1562] thermal: trip: Drop redundant __thermal_zone_get_trip() header The __thermal_zone_get_trip() header in drivers/thermal/thermal_core.h is redundant, because there is one already in thermal.h, so drop it. No functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano --- drivers/thermal/thermal_core.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 7dfe6c8deb8e..fe2917a74054 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -120,8 +120,6 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, for (__trip = __tz->trips; __trip - __tz->trips < __tz->num_trips; __trip++) void __thermal_zone_set_trips(struct thermal_zone_device *tz); -int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, - struct thermal_trip *trip); int thermal_zone_trip_id(struct thermal_zone_device *tz, const struct thermal_trip *trip); void thermal_zone_trip_updated(struct thermal_zone_device *tz, From 0c0c4740c9d2668a234f7743ba50d54acab0821c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Dec 2023 20:41:30 +0100 Subject: [PATCH 0739/1562] thermal: trip: Use for_each_trip() in __thermal_zone_set_trips() Make __thermal_zone_set_trips() use for_each_trip() instead of an open- coded loop over trip indices. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Reviewed-by: Lukasz Luba --- drivers/thermal/thermal_trip.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index 90861dec7eb0..581f1cd883e5 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -63,25 +63,21 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_num_trips); */ void __thermal_zone_set_trips(struct thermal_zone_device *tz) { - struct thermal_trip trip; + const struct thermal_trip *trip; int low = -INT_MAX, high = INT_MAX; bool same_trip = false; - int i, ret; + int ret; lockdep_assert_held(&tz->lock); if (!tz->ops->set_trips) return; - for (i = 0; i < tz->num_trips; i++) { + for_each_trip(tz, trip) { bool low_set = false; int trip_low; - ret = __thermal_zone_get_trip(tz, i , &trip); - if (ret) - return; - - trip_low = trip.temperature - trip.hysteresis; + trip_low = trip->temperature - trip->hysteresis; if (trip_low < tz->temperature && trip_low > low) { low = trip_low; @@ -89,9 +85,9 @@ void __thermal_zone_set_trips(struct thermal_zone_device *tz) same_trip = false; } - if (trip.temperature > tz->temperature && - trip.temperature < high) { - high = trip.temperature; + if (trip->temperature > tz->temperature && + trip->temperature < high) { + high = trip->temperature; same_trip = low_set; } } From 2e3e7dad4bf5ad869e8dc9fa09151913154318b0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Dec 2023 20:46:35 +0100 Subject: [PATCH 0740/1562] thermal: helpers: Use for_each_trip() in __thermal_zone_get_temp() Make __thermal_zone_get_temp() use for_each_trip() instead of an open- coded loop over trip indices. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Reviewed-by: Lukasz Luba --- drivers/thermal/thermal_helpers.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index d0afb623e475..c3982e0f0075 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -82,20 +82,18 @@ EXPORT_SYMBOL(get_thermal_instance); */ int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp) { - int ret = -EINVAL; - int count; + const struct thermal_trip *trip; int crit_temp = INT_MAX; - struct thermal_trip trip; + int ret = -EINVAL; lockdep_assert_held(&tz->lock); ret = tz->ops->get_temp(tz, temp); if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) { - for (count = 0; count < tz->num_trips; count++) { - ret = __thermal_zone_get_trip(tz, count, &trip); - if (!ret && trip.type == THERMAL_TRIP_CRITICAL) { - crit_temp = trip.temperature; + for_each_trip(tz, trip) { + if (trip->type == THERMAL_TRIP_CRITICAL) { + crit_temp = trip->temperature; break; } } From 183b64132f9692b15545c1aad755e71a53bcfb94 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 4 Dec 2023 20:49:03 +0100 Subject: [PATCH 0741/1562] thermal: netlink: Use for_each_trip() in thermal_genl_cmd_tz_get_trip() Make thermal_genl_cmd_tz_get_trip() use for_each_trip() instead of an open- coded loop over trip indices. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Reviewed-by: Lukasz Luba --- drivers/thermal/thermal_netlink.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index 08bc46c3ec7b..21f00d73acb7 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -450,10 +450,10 @@ out_cancel_nest: static int thermal_genl_cmd_tz_get_trip(struct param *p) { struct sk_buff *msg = p->msg; + const struct thermal_trip *trip; struct thermal_zone_device *tz; struct nlattr *start_trip; - struct thermal_trip trip; - int ret, i, id; + int id; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; @@ -470,16 +470,12 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p) mutex_lock(&tz->lock); - for (i = 0; i < tz->num_trips; i++) { - - ret = __thermal_zone_get_trip(tz, i, &trip); - if (ret) - goto out_cancel_nest; - - if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, i) || - nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, trip.type) || - nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, trip.temperature) || - nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, trip.hysteresis)) + for_each_trip(tz, trip) { + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, + thermal_zone_trip_id(tz, trip)) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, trip->type) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, trip->temperature) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, trip->hysteresis)) goto out_cancel_nest; } From bdc22c8d52d70fc5655ab4dbf72fa79b034bb7b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Dec 2023 20:18:39 +0100 Subject: [PATCH 0742/1562] thermal: trip: Send trip change notifications on all trip updates The _store callbacks of the trip point temperature and hysteresis sysfs attributes invoke thermal_notify_tz_trip_change() to send a notification regarding the trip point change, but when trip points are updated by the platform firmware, trip point change notifications are not sent. To make the behavior after a trip point change more consistent, modify all of the 3 places where trip point temperature is updated to use a new function called thermal_zone_set_trip_temp() for this purpose and make that function call thermal_notify_tz_trip_change(). Note that trip point hysteresis can only be updated via sysfs and trip_point_hyst_store() calls thermal_notify_tz_trip_change() already, so this code path need not be changed. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano --- drivers/acpi/thermal.c | 7 +++++-- .../intel/int340x_thermal/int340x_thermal_zone.c | 8 +++++--- drivers/thermal/thermal_sysfs.c | 4 ++-- drivers/thermal/thermal_trip.c | 14 +++++++++++++- include/linux/thermal.h | 2 ++ 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index f74d81abdbfc..3e679e9feec5 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -297,6 +297,7 @@ static int acpi_thermal_adjust_trip(struct thermal_trip *trip, void *data) struct acpi_thermal_trip *acpi_trip = trip->priv; struct adjust_trip_data *atd = data; struct acpi_thermal *tz = atd->tz; + int temp; if (!acpi_trip || !acpi_thermal_trip_valid(acpi_trip)) return 0; @@ -307,9 +308,11 @@ static int acpi_thermal_adjust_trip(struct thermal_trip *trip, void *data) acpi_thermal_update_trip_devices(tz, trip); if (acpi_thermal_trip_valid(acpi_trip)) - trip->temperature = acpi_thermal_temp(tz, acpi_trip->temp_dk); + temp = acpi_thermal_temp(tz, acpi_trip->temp_dk); else - trip->temperature = THERMAL_TEMP_INVALID; + temp = THERMAL_TEMP_INVALID; + + thermal_zone_set_trip_temp(tz->thermal_zone, trip, temp); return 0; } diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c index a03b67579dd9..3e4bfe817fac 100644 --- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c +++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c @@ -225,7 +225,8 @@ EXPORT_SYMBOL_GPL(int340x_thermal_zone_remove); static int int340x_update_one_trip(struct thermal_trip *trip, void *arg) { - struct acpi_device *zone_adev = arg; + struct int34x_thermal_zone *int34x_zone = arg; + struct acpi_device *zone_adev = int34x_zone->adev; int temp, err; switch (trip->type) { @@ -249,14 +250,15 @@ static int int340x_update_one_trip(struct thermal_trip *trip, void *arg) if (err) temp = THERMAL_TEMP_INVALID; - trip->temperature = temp; + thermal_zone_set_trip_temp(int34x_zone->zone, trip, temp); + return 0; } void int340x_thermal_update_trips(struct int34x_thermal_zone *int34x_zone) { thermal_zone_for_each_trip(int34x_zone->zone, int340x_update_one_trip, - int34x_zone->adev); + int34x_zone); } EXPORT_SYMBOL_GPL(int340x_thermal_update_trips); diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index f52af8a3b4b5..d8ff74a4338a 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -129,9 +129,9 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, goto unlock; } - trip->temperature = temp; + thermal_zone_set_trip_temp(tz, trip, temp); - thermal_zone_trip_updated(tz, trip); + __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); } unlock: diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c index 581f1cd883e5..a1ad345c0741 100644 --- a/drivers/thermal/thermal_trip.c +++ b/drivers/thermal/thermal_trip.c @@ -152,7 +152,6 @@ int thermal_zone_trip_id(struct thermal_zone_device *tz, */ return trip - tz->trips; } - void thermal_zone_trip_updated(struct thermal_zone_device *tz, const struct thermal_trip *trip) { @@ -161,3 +160,16 @@ void thermal_zone_trip_updated(struct thermal_zone_device *tz, trip->hysteresis); __thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED); } + +void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, + struct thermal_trip *trip, int temp) +{ + if (trip->temperature == temp) + return; + + trip->temperature = temp; + thermal_notify_tz_trip_change(tz->id, thermal_zone_trip_id(tz, trip), + trip->type, trip->temperature, + trip->hysteresis); +} +EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bedbaec9a42e..09f6eb82c191 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -291,6 +291,8 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); int thermal_zone_get_num_trips(struct thermal_zone_device *tz); +void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, + struct thermal_trip *trip, int temp); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); From 38c872a9e96f72f2947affc0526cc05659367d3d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 12 Dec 2023 13:22:39 -0800 Subject: [PATCH 0743/1562] ACPI: extlog: Clear Extended Error Log status when RAS_CEC handled the error When both CONFIG_RAS_CEC and CONFIG_ACPI_EXTLOG are enabled, Linux does not clear the status word of the BIOS supplied error record for corrected errors. This may prevent logging of subsequent uncorrected errors. Fix by clearing the status. Fixes: 23ba710a0864 ("x86/mce: Fix all mce notifiers to update the mce->kflags bitmask") Reported-by: Erwin Tsaur Signed-off-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_extlog.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 193147769146..ca87a0939135 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -145,9 +145,14 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, static u32 err_seq; estatus = extlog_elog_entry_check(cpu, bank); - if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC)) + if (!estatus) return NOTIFY_DONE; + if (mce->kflags & MCE_HANDLED_CEC) { + estatus->block_status = 0; + return NOTIFY_DONE; + } + memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN); /* clear record status to enable BIOS to update it again */ estatus->block_status = 0; From ccb45b34d44016b91fa75646741d317d6d6fdeea Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Dec 2023 22:48:38 +0100 Subject: [PATCH 0744/1562] ACPI: arm64: export acpi_arch_thermal_cpufreq_pctg() The cpufreq code can be in a loadable module, so the architecture support for it has to be exported: ERROR: modpost: "acpi_arch_thermal_cpufreq_pctg" [drivers/acpi/processor.ko] undefined! Fixes: 310293a2b941 ("ACPI: processor: reduce CPUFREQ thermal reduction pctg for Tegra241") Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- drivers/acpi/arm64/thermal_cpufreq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/arm64/thermal_cpufreq.c b/drivers/acpi/arm64/thermal_cpufreq.c index d524f2cd6044..582854914c5c 100644 --- a/drivers/acpi/arm64/thermal_cpufreq.c +++ b/drivers/acpi/arm64/thermal_cpufreq.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include "../internal.h" @@ -18,3 +19,4 @@ int acpi_arch_thermal_cpufreq_pctg(void) return 0; } +EXPORT_SYMBOL_GPL(acpi_arch_thermal_cpufreq_pctg); From 404f62cd6407f163e03cfaca97e27c1c4c62eb3c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 13 Dec 2023 13:13:22 +0100 Subject: [PATCH 0745/1562] thermal/core: Check get_temp ops is present when registering a tz Initially the check against the get_temp ops in the thermal_zone_device_update() was put in there in order to catch drivers not providing this method. Instead of checking again and again the function if the ops exists in the update function, let's do the check at registration time, so it is checked one time and for all. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index e5434cdbf23b..2415dc50c31d 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -434,11 +434,6 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, if (atomic_read(&in_suspend)) return; - if (WARN_ONCE(!tz->ops->get_temp, - "'%s' must not be called without 'get_temp' ops set\n", - __func__)) - return; - if (!thermal_zone_device_is_enabled(tz)) return; @@ -1285,7 +1280,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t return ERR_PTR(-EINVAL); } - if (!ops) { + if (!ops || !ops->get_temp) { pr_err("Thermal zone device ops not defined\n"); return ERR_PTR(-EINVAL); } From 36f7050b29f3d736b5faf1059bd43baec04db9fb Mon Sep 17 00:00:00 2001 From: Nandhini Srikandan Date: Wed, 13 Dec 2023 14:08:35 +0800 Subject: [PATCH 0746/1562] spi: dw: Remove Intel Thunder Bay SOC support Remove Intel Thunder Bay specific code as the product got cancelled and there are no end customers or users. Signed-off-by: Nandhini Srikandan Reviewed-by: Andy Shevchenko Link: https://msgid.link/r/20231213060836.29203-2-nandhini.srikandan@intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-dw-mmio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-dw-mmio.c b/drivers/spi/spi-dw-mmio.c index 46801189a651..cc74cbe03431 100644 --- a/drivers/spi/spi-dw-mmio.c +++ b/drivers/spi/spi-dw-mmio.c @@ -411,7 +411,6 @@ static const struct of_device_id dw_spi_mmio_of_match[] = { { .compatible = "renesas,rzn1-spi", .data = dw_spi_pssi_init}, { .compatible = "snps,dwc-ssi-1.01a", .data = dw_spi_hssi_init}, { .compatible = "intel,keembay-ssi", .data = dw_spi_intel_init}, - { .compatible = "intel,thunderbay-ssi", .data = dw_spi_intel_init}, { .compatible = "intel,mountevans-imc-ssi", .data = dw_spi_mountevans_imc_init, From e1fca6957f1966cb6e75cdc354f4bcaed230a454 Mon Sep 17 00:00:00 2001 From: Nandhini Srikandan Date: Wed, 13 Dec 2023 14:08:36 +0800 Subject: [PATCH 0747/1562] spi: dw: Remove Intel Thunder Bay SOC support Remove Intel Thunder Bay specific code as the product got cancelled and there are no end customers or users. Signed-off-by: Nandhini Srikandan Acked-by: Krzysztof Kozlowski Link: https://msgid.link/r/20231213060836.29203-3-nandhini.srikandan@intel.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml index 6348a387a21c..fde3776a558b 100644 --- a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml +++ b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml @@ -72,8 +72,6 @@ properties: - const: snps,dw-apb-ssi - description: Intel Keem Bay SPI Controller const: intel,keembay-ssi - - description: Intel Thunder Bay SPI Controller - const: intel,thunderbay-ssi - description: Intel Mount Evans Integrated Management Complex SPI Controller const: intel,mountevans-imc-ssi - description: AMD Pensando Elba SoC SPI Controller From 578bd4ce7100ae34f98c6b0147fe75cfa0dadbac Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 11 Dec 2023 10:41:51 -0800 Subject: [PATCH 0748/1562] xfs: recompute growfsrtfree transaction reservation while growing rt volume While playing with growfs to create a 20TB realtime section on a filesystem that didn't previously have an rt section, I noticed that growfs would occasionally shut down the log due to a transaction reservation overflow. xfs_calc_growrtfree_reservation uses the current size of the realtime summary file (m_rsumsize) to compute the transaction reservation for a growrtfree transaction. The reservations are computed at mount time, which means that m_rsumsize is zero when growfs starts "freeing" the new realtime extents into the rt volume. As a result, the transaction is undersized and fails. Fix this by recomputing the transaction reservations every time we change m_rsumsize. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8feb58c6241c..0c9893b9f2a9 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1038,6 +1038,9 @@ xfs_growfs_rt( nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1124,6 +1127,8 @@ error_cancel: */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) From c00eebd09e95757c9c1d08f0a6bbc32c543daf90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:29 +0100 Subject: [PATCH 0749/1562] xfs: consolidate the xfs_attr_defer_* helpers Consolidate the xfs_attr_defer_* helpers into a single xfs_attr_defer_add one that picks the right dela_state based on the passed in operation. Also move to a single trace point as the actual operation is visible through the flags in the delta_state passed to the trace point. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 90 ++++++++++------------------------------ fs/xfs/xfs_trace.h | 2 - 2 files changed, 21 insertions(+), 71 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..4fed0c87a968 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -880,11 +880,10 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; @@ -893,66 +892,22 @@ xfs_attr_intent_init( new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_add_state(args); xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +993,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation */ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1012,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. */ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 514095b6ba2b..516529c151ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4408,8 +4408,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, From 2e8f7b6f4a15ea92cb2186ad300ae4191d0edcef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:30 +0100 Subject: [PATCH 0750/1562] xfs: move xfs_attr_defer_type up in xfs_attr_item.c We'll reference it directly in xlog_recover_attri_commit_pass2, so move it up a bit. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_attr_item.c | 66 +++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 39f2c5a46179..4e0eaa2640e0 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -654,6 +654,39 @@ xfs_attr_relog_intent( return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -730,39 +763,6 @@ xlog_recover_attri_commit_pass2( return 0; } -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - struct xfs_attri_log_item *attrip; - struct xfs_attrd_log_item *attrdp; - - attrip = ATTRI_ITEM(intent); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - return &attrdp->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, - .recover_work = xfs_attr_recover_work, - .relog_intent = xfs_attr_relog_intent, -}; - /* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if From 7f2f7531e0d455f1abb9f48fbbe17c37e8742590 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:31 +0100 Subject: [PATCH 0751/1562] xfs: store an ops pointer in struct xfs_defer_pending The dfp_type field in struct xfs_defer_pending is only used to either look up the operations associated with the pending word or in trace points. Replace it with a direct pointer to the operations vector, and store a pretty name in the vector for tracing. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 43 +++++++++++++++----------------------- fs/xfs/libxfs/xfs_defer.h | 5 +++-- fs/xfs/xfs_attr_item.c | 1 + fs/xfs/xfs_bmap_item.c | 1 + fs/xfs/xfs_extfree_item.c | 2 ++ fs/xfs/xfs_refcount_item.c | 1 + fs/xfs/xfs_rmap_item.c | 1 + fs/xfs/xfs_trace.h | 16 +++++++------- 8 files changed, 34 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ecc2f7ec6991..e70881ae5cc5 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -251,7 +251,6 @@ xfs_defer_create_done( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; /* If there is no log intent item, there can be no log done item. */ @@ -266,7 +265,7 @@ xfs_defer_create_done( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - lip = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); if (!lip) return; @@ -287,13 +286,13 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) @@ -338,12 +337,10 @@ xfs_defer_pending_abort( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); + dfp->dfp_ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } @@ -353,7 +350,6 @@ xfs_defer_pending_cancel_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct list_head *pwi; struct list_head *n; @@ -364,7 +360,7 @@ xfs_defer_pending_cancel_work( list_del(pwi); dfp->dfp_count--; trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); + dfp->dfp_ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_cache_free(xfs_defer_pending_cache, dfp); @@ -522,11 +518,10 @@ xfs_defer_relog_intent( struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; xfs_defer_create_done(tp, dfp); - lip = ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); if (lip) { xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); @@ -593,7 +588,7 @@ xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; @@ -790,7 +785,6 @@ xfs_defer_cancel( static inline struct xfs_defer_pending * xfs_defer_find_last( struct xfs_trans *tp, - enum xfs_defer_ops_type type, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; @@ -803,7 +797,7 @@ xfs_defer_find_last( dfp_list); /* Wrong type? */ - if (dfp->dfp_type != type) + if (dfp->dfp_ops != ops) return NULL; return dfp; } @@ -836,13 +830,13 @@ xfs_defer_can_append( static inline struct xfs_defer_pending * xfs_defer_alloc( struct xfs_trans *tp, - enum xfs_defer_ops_type type) + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; + dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, &tp->t_dfops); @@ -862,9 +856,9 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - dfp = xfs_defer_find_last(tp, type, ops); + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) - dfp = xfs_defer_alloc(tp, type); + dfp = xfs_defer_alloc(tp, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); @@ -880,17 +874,15 @@ xfs_defer_add_barrier( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; - const enum xfs_defer_ops_type type = XFS_DEFER_OPS_TYPE_BARRIER; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); /* If the last defer op added was a barrier, we're done. */ - dfp = xfs_defer_find_last(tp, type, ops); + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); if (dfp) return; - xfs_defer_alloc(tp, type); + xfs_defer_alloc(tp, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } @@ -909,7 +901,7 @@ xfs_defer_start_recovery( dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = dfp_type; + dfp->dfp_ops = defer_op_types[dfp_type]; dfp->dfp_intent = lip; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, r_dfops); @@ -935,13 +927,12 @@ xfs_defer_finish_recovery( struct xfs_defer_pending *dfp, struct list_head *capture_list) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; int error; - error = ops->recover_work(dfp, capture_list); + error = dfp->dfp_ops->recover_work(dfp, capture_list); if (error) trace_xlog_intent_recovery_failed(mp, error, - ops->recover_work); + dfp->dfp_ops->recover_work); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 5b1990ef3e5d..957a06278e88 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -34,9 +34,9 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ unsigned int dfp_flags; - enum xfs_defer_ops_type dfp_type; }; /* @@ -61,6 +61,8 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -76,7 +78,6 @@ struct xfs_defer_op_type { struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, struct xfs_log_item *intent, struct xfs_log_item *done_item); - unsigned int max_items; }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4e0eaa2640e0..beae2de82450 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -677,6 +677,7 @@ xfs_attr_create_done( } const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", .max_items = 1, .create_intent = xfs_attr_create_intent, .abort_intent = xfs_attr_abort_intent, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index bc48d733634a..f43abf0b6486 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -563,6 +563,7 @@ xfs_bmap_relog_intent( } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", .max_items = XFS_BUI_MAX_FAST_EXTENTS, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3e3469504271..e67907a379c8 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -670,6 +670,7 @@ xfs_extent_free_relog_intent( } const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .name = "extent_free", .max_items = XFS_EFI_MAX_FAST_EXTENTS, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, @@ -682,6 +683,7 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = { /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", .max_items = XFS_EFI_MAX_FAST_EXTENTS, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 9974be81cb2b..b08839550f34 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,6 +523,7 @@ xfs_refcount_relog_intent( } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", .max_items = XFS_CUI_MAX_FAST_EXTENTS, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 488c4a2a80a3..65b432eb5d02 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -576,6 +576,7 @@ xfs_rmap_relog_intent( } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", .max_items = XFS_RUI_MAX_FAST_EXTENTS, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 516529c151ae..0efcdb79d10e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2549,7 +2549,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(unsigned int, flags) __field(char, committed) @@ -2557,15 +2557,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p flags %s committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, @@ -2694,7 +2694,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) @@ -2703,16 +2703,16 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p flags %s committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), From dc22af64368291a86fb6b7eb2adab21c815836b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Dec 2023 06:16:32 +0100 Subject: [PATCH 0752/1562] xfs: pass the defer ops instead of type to xfs_defer_start_recovery xfs_defer_start_recovery is only called from xlog_recover_intent_item, and the callers of that all have the actual xfs_defer_ops_type operation vector at hand. Pass that directly instead of looking it up from the defer_op_types table. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 6 +++--- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/libxfs/xfs_log_recover.h | 3 ++- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_log_recover.c | 4 ++-- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 9 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index e70881ae5cc5..dd964bf825eb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -894,14 +894,14 @@ xfs_defer_add_barrier( void xfs_defer_start_recovery( struct xfs_log_item *lip, - enum xfs_defer_ops_type dfp_type, - struct list_head *r_dfops) + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_ops = defer_op_types[dfp_type]; + dfp->dfp_ops = ops; dfp->dfp_intent = lip; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, r_dfops); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 957a06278e88..60de91b66392 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -147,7 +147,7 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, void xfs_defer_resources_rele(struct xfs_defer_resources *dres); void xfs_defer_start_recovery(struct xfs_log_item *lip, - enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); + struct list_head *r_dfops, const struct xfs_defer_op_type *ops); void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); int xfs_defer_finish_recovery(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index c8e5d912895b..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. */ enum xlog_recover_reorder { @@ -156,7 +157,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) struct xfs_defer_pending; void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, - xfs_lsn_t lsn, unsigned int dfp_type); + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); int xlog_recover_finish_intent(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index beae2de82450..9e02111bd890 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -759,7 +759,7 @@ xlog_recover_attri_commit_pass2( memcpy(&attrip->attri_format, attri_formatp, len); xlog_recover_intent_item(log, &attrip->attri_item, lsn, - XFS_DEFER_OPS_TYPE_ATTR); + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f43abf0b6486..52fb8a148b7d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -650,7 +650,7 @@ xlog_recover_bui_commit_pass2( atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); xlog_recover_intent_item(log, &buip->bui_item, lsn, - XFS_DEFER_OPS_TYPE_BMAP); + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e67907a379c8..1d1185fca6a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -747,7 +747,7 @@ xlog_recover_efi_commit_pass2( atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); xlog_recover_intent_item(log, &efip->efi_item, lsn, - XFS_DEFER_OPS_TYPE_FREE); + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c18692af2c65..1251c81e55f9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1942,11 +1942,11 @@ xlog_recover_intent_item( struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, - unsigned int dfp_type) + const struct xfs_defer_op_type *ops) { ASSERT(xlog_item_is_intent(lip)); - xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops); + xfs_defer_start_recovery(lip, &log->r_dfops, ops); /* * Insert the intent into the AIL directly and drop one reference so diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index b08839550f34..20ad8086da60 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -605,7 +605,7 @@ xlog_recover_cui_commit_pass2( atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); xlog_recover_intent_item(log, &cuip->cui_item, lsn, - XFS_DEFER_OPS_TYPE_REFCOUNT); + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 65b432eb5d02..79ad0087aeca 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -658,7 +658,7 @@ xlog_recover_rui_commit_pass2( atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); xlog_recover_intent_item(log, &ruip->rui_item, lsn, - XFS_DEFER_OPS_TYPE_RMAP); + &xfs_rmap_update_defer_type); return 0; } From 603ce8ab12094a2d9483c79a7541335e258a5328 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:33 +0100 Subject: [PATCH 0753/1562] xfs: pass the defer ops directly to xfs_defer_add Pass a pointer to the xfs_defer_op_type structure to xfs_defer_add and remove the indirection through the xfs_defer_ops_type enum and a global table of all possible operations. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 4 ++-- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_defer.c | 16 ++-------------- fs/xfs/libxfs/xfs_defer.h | 18 ++---------------- fs/xfs/libxfs/xfs_refcount.c | 2 +- fs/xfs/libxfs/xfs_rmap.c | 2 +- 7 files changed, 10 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4940f9377f21..60c2c18e8e54 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2514,7 +2514,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2578,7 +2578,7 @@ xfs_defer_extent_free( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - *dfpp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4fed0c87a968..fa49c795f407 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -906,7 +906,7 @@ xfs_attr_defer_add( ASSERT(0); } - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ca6614f4eac5..e308d2f44a3c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6091,7 +6091,7 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index dd964bf825eb..ca7f0ac04896 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -235,16 +235,6 @@ static const struct xfs_defer_op_type xfs_barrier_defer_type = { .cancel_item = xfs_defer_barrier_cancel_item, }; -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, - [XFS_DEFER_OPS_TYPE_BARRIER] = &xfs_barrier_defer_type, -}; - /* Create a log intent done item for a log intent item. */ static inline void xfs_defer_create_done( @@ -847,14 +837,12 @@ xfs_defer_alloc( struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 60de91b66392..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -10,20 +10,6 @@ struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_defer_capture; -/* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_BARRIER, - XFS_DEFER_OPS_TYPE_MAX, -}; - /* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done @@ -51,8 +37,8 @@ struct xfs_defer_pending { void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); -struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, - enum xfs_defer_ops_type type, struct list_head *h); +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3702b4a07110..5b039cd022e0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1458,7 +1458,7 @@ __xfs_refcount_add( ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..76bf7f48cb5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2567,7 +2567,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. */ From 27591ea2f7751223e79fa41f11bf687777a38399 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Thu, 14 Dec 2023 10:59:11 +0800 Subject: [PATCH 0754/1562] regulator: qcom-rpmh: extend to support multiple linear voltage ranges Update rpmh_vreg_hw_data to support multiple linear voltage ranges for potential regulators which have discrete voltage program ranges. Suggested-by: David Collins Reviewed-by: David Collins Signed-off-by: Fenglin Wu Link: https://msgid.link/r/20231214-pm8010-regulator-v2-1-82131df6b97b@quicinc.com Signed-off-by: Mark Brown --- drivers/regulator/qcom-rpmh-regulator.c | 115 ++++++++++++++++++------ 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index cf502eec0915..43b45feb02e6 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018-2021, The Linux Foundation. All rights reserved. +// Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. #define pr_fmt(fmt) "%s: " fmt, __func__ @@ -68,10 +69,11 @@ enum rpmh_regulator_type { * @regulator_type: RPMh accelerator type used to manage this * regulator * @ops: Pointer to regulator ops callback structure - * @voltage_range: The single range of voltages supported by this - * PMIC regulator type + * @voltage_ranges: The possible ranges of voltages supported by this + * PMIC regulator type + * @n_linear_ranges: Number of entries in voltage_ranges * @n_voltages: The number of unique voltage set points defined - * by voltage_range + * by voltage_ranges * @hpm_min_load_uA: Minimum load current in microamps that requires * high power mode (HPM) operation. This is used * for LDO hardware type regulators only. @@ -85,7 +87,8 @@ enum rpmh_regulator_type { struct rpmh_vreg_hw_data { enum rpmh_regulator_type regulator_type; const struct regulator_ops *ops; - const struct linear_range voltage_range; + const struct linear_range *voltage_ranges; + int n_linear_ranges; int n_voltages; int hpm_min_load_uA; const int *pmic_mode_map; @@ -449,8 +452,8 @@ static int rpmh_regulator_init_vreg(struct rpmh_vreg *vreg, struct device *dev, vreg->mode = REGULATOR_MODE_INVALID; if (rpmh_data->hw_data->n_voltages) { - vreg->rdesc.linear_ranges = &rpmh_data->hw_data->voltage_range; - vreg->rdesc.n_linear_ranges = 1; + vreg->rdesc.linear_ranges = rpmh_data->hw_data->voltage_ranges; + vreg->rdesc.n_linear_ranges = rpmh_data->hw_data->n_linear_ranges; vreg->rdesc.n_voltages = rpmh_data->hw_data->n_voltages; } @@ -613,7 +616,10 @@ static unsigned int rpmh_regulator_pmic4_bob_of_map_mode(unsigned int rpmh_mode) static const struct rpmh_vreg_hw_data pmic4_pldo = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1664000, 0, 255, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1664000, 0, 255, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 256, .hpm_min_load_uA = 10000, .pmic_mode_map = pmic_mode_map_pmic4_ldo, @@ -623,7 +629,10 @@ static const struct rpmh_vreg_hw_data pmic4_pldo = { static const struct rpmh_vreg_hw_data pmic4_pldo_lv = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1256000, 0, 127, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1256000, 0, 127, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 128, .hpm_min_load_uA = 10000, .pmic_mode_map = pmic_mode_map_pmic4_ldo, @@ -633,7 +642,10 @@ static const struct rpmh_vreg_hw_data pmic4_pldo_lv = { static const struct rpmh_vreg_hw_data pmic4_nldo = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(312000, 0, 127, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(312000, 0, 127, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 128, .hpm_min_load_uA = 30000, .pmic_mode_map = pmic_mode_map_pmic4_ldo, @@ -643,7 +655,10 @@ static const struct rpmh_vreg_hw_data pmic4_nldo = { static const struct rpmh_vreg_hw_data pmic4_hfsmps3 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 216, .pmic_mode_map = pmic_mode_map_pmic4_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -652,7 +667,10 @@ static const struct rpmh_vreg_hw_data pmic4_hfsmps3 = { static const struct rpmh_vreg_hw_data pmic4_ftsmps426 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 258, 4000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 258, 4000), + }, + .n_linear_ranges = 1, .n_voltages = 259, .pmic_mode_map = pmic_mode_map_pmic4_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -661,7 +679,10 @@ static const struct rpmh_vreg_hw_data pmic4_ftsmps426 = { static const struct rpmh_vreg_hw_data pmic4_bob = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_bypass_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1824000, 0, 83, 32000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1824000, 0, 83, 32000), + }, + .n_linear_ranges = 1, .n_voltages = 84, .pmic_mode_map = pmic_mode_map_pmic4_bob, .of_map_mode = rpmh_regulator_pmic4_bob_of_map_mode, @@ -676,7 +697,10 @@ static const struct rpmh_vreg_hw_data pmic4_lvs = { static const struct rpmh_vreg_hw_data pmic5_pldo = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1504000, 0, 255, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1504000, 0, 255, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 256, .hpm_min_load_uA = 10000, .pmic_mode_map = pmic_mode_map_pmic5_ldo, @@ -686,7 +710,10 @@ static const struct rpmh_vreg_hw_data pmic5_pldo = { static const struct rpmh_vreg_hw_data pmic5_pldo_lv = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1504000, 0, 62, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1504000, 0, 62, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 63, .hpm_min_load_uA = 10000, .pmic_mode_map = pmic_mode_map_pmic5_ldo, @@ -696,7 +723,10 @@ static const struct rpmh_vreg_hw_data pmic5_pldo_lv = { static const struct rpmh_vreg_hw_data pmic5_pldo515_mv = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(1800000, 0, 187, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1800000, 0, 187, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 188, .hpm_min_load_uA = 10000, .pmic_mode_map = pmic_mode_map_pmic5_ldo, @@ -706,7 +736,10 @@ static const struct rpmh_vreg_hw_data pmic5_pldo515_mv = { static const struct rpmh_vreg_hw_data pmic5_nldo = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 123, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 123, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 124, .hpm_min_load_uA = 30000, .pmic_mode_map = pmic_mode_map_pmic5_ldo, @@ -716,7 +749,10 @@ static const struct rpmh_vreg_hw_data pmic5_nldo = { static const struct rpmh_vreg_hw_data pmic5_nldo515 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 210, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 210, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 211, .hpm_min_load_uA = 30000, .pmic_mode_map = pmic_mode_map_pmic5_ldo, @@ -726,7 +762,10 @@ static const struct rpmh_vreg_hw_data pmic5_nldo515 = { static const struct rpmh_vreg_hw_data pmic5_hfsmps510 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 216, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -735,7 +774,10 @@ static const struct rpmh_vreg_hw_data pmic5_hfsmps510 = { static const struct rpmh_vreg_hw_data pmic5_ftsmps510 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(300000, 0, 263, 4000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(300000, 0, 263, 4000), + }, + .n_linear_ranges = 1, .n_voltages = 264, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -744,7 +786,10 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps510 = { static const struct rpmh_vreg_hw_data pmic5_ftsmps520 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(300000, 0, 263, 4000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(300000, 0, 263, 4000), + }, + .n_linear_ranges = 1, .n_voltages = 264, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -753,7 +798,10 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps520 = { static const struct rpmh_vreg_hw_data pmic5_ftsmps525_lv = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(300000, 0, 267, 4000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(300000, 0, 267, 4000), + }, + .n_linear_ranges = 1, .n_voltages = 268, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -762,7 +810,10 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps525_lv = { static const struct rpmh_vreg_hw_data pmic5_ftsmps525_mv = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(600000, 0, 267, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(600000, 0, 267, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 268, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -771,7 +822,10 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps525_mv = { static const struct rpmh_vreg_hw_data pmic5_ftsmps527 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), + }, + .n_linear_ranges = 1, .n_voltages = 215, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -780,7 +834,10 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps527 = { static const struct rpmh_vreg_hw_data pmic5_hfsmps515 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 235, 16000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(320000, 0, 235, 16000), + }, + .n_linear_ranges = 1, .n_voltages = 236, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -789,7 +846,10 @@ static const struct rpmh_vreg_hw_data pmic5_hfsmps515 = { static const struct rpmh_vreg_hw_data pmic5_hfsmps515_1 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(900000, 0, 4, 16000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(900000, 0, 4, 16000), + }, + .n_linear_ranges = 1, .n_voltages = 5, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, @@ -798,7 +858,10 @@ static const struct rpmh_vreg_hw_data pmic5_hfsmps515_1 = { static const struct rpmh_vreg_hw_data pmic5_bob = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_bypass_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(3000000, 0, 31, 32000), + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(3000000, 0, 31, 32000), + }, + .n_linear_ranges = 1, .n_voltages = 32, .pmic_mode_map = pmic_mode_map_pmic5_bob, .of_map_mode = rpmh_regulator_pmic4_bob_of_map_mode, From 638baabe951eb16607b7e4bb197998562afd57a6 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Thu, 14 Dec 2023 10:59:12 +0800 Subject: [PATCH 0755/1562] regulator: dt-bindings: qcom,rpmh: add compatible for pm8010 Add compatible for PM8010 RPMH regulators present on sm8550-qrd and sm8550-mtp boards. Suggested-by: David Collins Reviewed-by: Krzysztof Kozlowski Signed-off-by: Fenglin Wu Link: https://msgid.link/r/20231214-pm8010-regulator-v2-2-82131df6b97b@quicinc.com Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,rpmh-regulator.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml index acd37f28ef53..27c6d5152413 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml @@ -42,6 +42,7 @@ description: | For PM7325, smps1 - smps8, ldo1 - ldo19 For PM8005, smps1 - smps4 For PM8009, smps1 - smps2, ldo1 - ldo7 + For PM8010, ldo1 - ldo7 For PM8150, smps1 - smps10, ldo1 - ldo18 For PM8150L, smps1 - smps8, ldo1 - ldo11, bob, flash, rgb For PM8350, smps1 - smps12, ldo1 - ldo10 @@ -68,6 +69,7 @@ properties: - qcom,pm8005-rpmh-regulators - qcom,pm8009-rpmh-regulators - qcom,pm8009-1-rpmh-regulators + - qcom,pm8010-rpmh-regulators - qcom,pm8150-rpmh-regulators - qcom,pm8150l-rpmh-regulators - qcom,pm8350-rpmh-regulators @@ -238,6 +240,18 @@ allOf: "^vdd-l[1-47]-supply$": true "^vdd-s[1-2]-supply$": true + - if: + properties: + compatible: + enum: + - qcom,pm8010-rpmh-regulators + then: + properties: + vdd-l1-l2-supply: true + vdd-l3-l4-supply: true + patternProperties: + "^vdd-l[5-7]-supply$": true + - if: properties: compatible: From 2544631faa7f3244c9bcb9b511ca4f1a4f5a3ba0 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Thu, 14 Dec 2023 10:59:13 +0800 Subject: [PATCH 0756/1562] regulator: qcom-rpmh: add support for pm8010 regulators Add RPMH regulators exposed by Qualcomm Technologies, Inc. PM8010 PMIC. It has 7 LDOs with 3 different types, LDO1 - LDO2 are L502 NMOS LDOs, LDO5 and LDO7 are L502 PMOS LDOs, LDO3/LDO4/LDO6 are L502 PMOS LDO for low noise applications. Also, LDO3 - LDO7 don't support LPM. Suggested-by: David Collins Reviewed-by: David Collins Signed-off-by: Fenglin Wu Link: https://msgid.link/r/20231214-pm8010-regulator-v2-3-82131df6b97b@quicinc.com Signed-off-by: Mark Brown --- drivers/regulator/qcom-rpmh-regulator.c | 62 +++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index 43b45feb02e6..80e304711345 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -511,6 +511,14 @@ static const int pmic_mode_map_pmic5_ldo[REGULATOR_MODE_STANDBY + 1] = { [REGULATOR_MODE_FAST] = -EINVAL, }; +static const int pmic_mode_map_pmic5_ldo_hpm[REGULATOR_MODE_STANDBY + 1] = { + [REGULATOR_MODE_INVALID] = -EINVAL, + [REGULATOR_MODE_STANDBY] = -EINVAL, + [REGULATOR_MODE_IDLE] = -EINVAL, + [REGULATOR_MODE_NORMAL] = PMIC5_LDO_MODE_HPM, + [REGULATOR_MODE_FAST] = -EINVAL, +}; + static unsigned int rpmh_regulator_pmic4_ldo_of_map_mode(unsigned int rpmh_mode) { unsigned int mode; @@ -733,6 +741,33 @@ static const struct rpmh_vreg_hw_data pmic5_pldo515_mv = { .of_map_mode = rpmh_regulator_pmic4_ldo_of_map_mode, }; +static const struct rpmh_vreg_hw_data pmic5_pldo502 = { + .regulator_type = VRM, + .ops = &rpmh_regulator_vrm_ops, + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1504000, 0, 255, 8000), + }, + .n_linear_ranges = 1, + .n_voltages = 256, + .pmic_mode_map = pmic_mode_map_pmic5_ldo_hpm, + .of_map_mode = rpmh_regulator_pmic4_ldo_of_map_mode, +}; + +static const struct rpmh_vreg_hw_data pmic5_pldo502ln = { + .regulator_type = VRM, + .ops = &rpmh_regulator_vrm_ops, + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(1800000, 0, 2, 200000), + REGULATOR_LINEAR_RANGE(2608000, 3, 28, 16000), + REGULATOR_LINEAR_RANGE(3104000, 29, 30, 96000), + REGULATOR_LINEAR_RANGE(3312000, 31, 31, 0), + }, + .n_linear_ranges = 4, + .n_voltages = 32, + .pmic_mode_map = pmic_mode_map_pmic5_ldo_hpm, + .of_map_mode = rpmh_regulator_pmic4_ldo_of_map_mode, +}; + static const struct rpmh_vreg_hw_data pmic5_nldo = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_drms_ops, @@ -759,6 +794,19 @@ static const struct rpmh_vreg_hw_data pmic5_nldo515 = { .of_map_mode = rpmh_regulator_pmic4_ldo_of_map_mode, }; +static const struct rpmh_vreg_hw_data pmic5_nldo502 = { + .regulator_type = VRM, + .ops = &rpmh_regulator_vrm_drms_ops, + .voltage_ranges = (struct linear_range[]) { + REGULATOR_LINEAR_RANGE(528000, 0, 127, 8000), + }, + .n_linear_ranges = 1, + .n_voltages = 128, + .hpm_min_load_uA = 30000, + .pmic_mode_map = pmic_mode_map_pmic5_ldo, + .of_map_mode = rpmh_regulator_pmic4_ldo_of_map_mode, +}; + static const struct rpmh_vreg_hw_data pmic5_hfsmps510 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, @@ -1210,6 +1258,16 @@ static const struct rpmh_vreg_init_data pm8009_1_vreg_data[] = { {} }; +static const struct rpmh_vreg_init_data pm8010_vreg_data[] = { + RPMH_VREG("ldo1", "ldo%s1", &pmic5_nldo502, "vdd-l1-l2"), + RPMH_VREG("ldo2", "ldo%s2", &pmic5_nldo502, "vdd-l1-l2"), + RPMH_VREG("ldo3", "ldo%s3", &pmic5_pldo502ln, "vdd-l3-l4"), + RPMH_VREG("ldo4", "ldo%s4", &pmic5_pldo502ln, "vdd-l3-l4"), + RPMH_VREG("ldo5", "ldo%s5", &pmic5_pldo502, "vdd-l5"), + RPMH_VREG("ldo6", "ldo%s6", &pmic5_pldo502ln, "vdd-l6"), + RPMH_VREG("ldo7", "ldo%s7", &pmic5_pldo502, "vdd-l7"), +}; + static const struct rpmh_vreg_init_data pm6150_vreg_data[] = { RPMH_VREG("smps1", "smp%s1", &pmic5_ftsmps510, "vdd-s1"), RPMH_VREG("smps2", "smp%s2", &pmic5_ftsmps510, "vdd-s2"), @@ -1525,6 +1583,10 @@ static const struct of_device_id __maybe_unused rpmh_regulator_match_table[] = { .compatible = "qcom,pm8009-1-rpmh-regulators", .data = pm8009_1_vreg_data, }, + { + .compatible = "qcom,pm8010-rpmh-regulators", + .data = pm8010_vreg_data, + }, { .compatible = "qcom,pm8150-rpmh-regulators", .data = pm8150_vreg_data, From 199d1402229f26804c81508346b57a0e9c094bb6 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 11 Dec 2023 16:05:24 +0100 Subject: [PATCH 0757/1562] mtd: rawnand: pl353: Fix kernel doc Both the "chip" kernel doc member and description are wrong. This field is called "chips" and describes the list of NAND chips connected to the controller. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312102130.geZ4dqyN-lkp@intel.com/ Fixes: 08d8c62164a3 ("mtd: rawnand: pl353: Add support for the ARM PL353 SMC NAND controller") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231211150524.108803-1-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/pl35x-nand-controller.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/pl35x-nand-controller.c b/drivers/mtd/nand/raw/pl35x-nand-controller.c index c506e92a3e45..1c76ee98efb7 100644 --- a/drivers/mtd/nand/raw/pl35x-nand-controller.c +++ b/drivers/mtd/nand/raw/pl35x-nand-controller.c @@ -128,7 +128,7 @@ struct pl35x_nand { * @conf_regs: SMC configuration registers for command phase * @io_regs: NAND data registers for data phase * @controller: Core NAND controller structure - * @chip: NAND chip information structure + * @chips: List of connected NAND chips * @selected_chip: NAND chip currently selected by the controller * @assigned_cs: List of assigned CS * @ecc_buf: Temporary buffer to extract ECC bytes From 2ca8718be0c469a99435f6330904364f8dc5a094 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 11 Dec 2023 16:07:03 +0100 Subject: [PATCH 0758/1562] mtd: rawnand: rockchip: Rename a structure Robots are unhappy with the ecc_cnt_status structure because the kernel doc says it should be called rk_ecc_cnt_status. In general, it is considered a better practice to prefix all symbols in a file with the same prexif, and thus it seems more relevant to rename the structure rather than changing the kernel doc header. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312102130.geZ4dqyN-lkp@intel.com/ Fixes: 058e0e847d54 ("mtd: rawnand: rockchip: NFC driver for RK3308, RK2928 and others") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231211150704.109138-1-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/rockchip-nand-controller.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/rockchip-nand-controller.c b/drivers/mtd/nand/raw/rockchip-nand-controller.c index 596cf9a78274..ab1a9e8687e0 100644 --- a/drivers/mtd/nand/raw/rockchip-nand-controller.c +++ b/drivers/mtd/nand/raw/rockchip-nand-controller.c @@ -98,7 +98,7 @@ enum nfc_type { * @high: ECC count high bit index at register. * @high_mask: mask bit */ -struct ecc_cnt_status { +struct rk_ecc_cnt_status { u8 err_flag_bit; u8 low; u8 low_mask; @@ -144,8 +144,8 @@ struct nfc_cfg { u32 int_st_off; u32 oob0_off; u32 oob1_off; - struct ecc_cnt_status ecc0; - struct ecc_cnt_status ecc1; + struct rk_ecc_cnt_status ecc0; + struct rk_ecc_cnt_status ecc1; }; struct rk_nfc_nand_chip { From b6c985dd9a2d5902e413c2e9ba5a770fbca12322 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 11 Dec 2023 16:07:04 +0100 Subject: [PATCH 0759/1562] mtd: rawnand: rockchip: Add missing title to a kernel doc comment All fields of the nfc_cfg structure are documented but the name, which leads to a W=1 warning. Add a title. Fixes: 058e0e847d54 ("mtd: rawnand: rockchip: NFC driver for RK3308, RK2928 and others") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231211150704.109138-2-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/rockchip-nand-controller.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/nand/raw/rockchip-nand-controller.c b/drivers/mtd/nand/raw/rockchip-nand-controller.c index ab1a9e8687e0..7baaef69d70a 100644 --- a/drivers/mtd/nand/raw/rockchip-nand-controller.c +++ b/drivers/mtd/nand/raw/rockchip-nand-controller.c @@ -108,6 +108,7 @@ struct rk_ecc_cnt_status { }; /** + * struct nfc_cfg: Rockchip NAND controller configuration * @type: NFC version * @ecc_strengths: ECC strengths * @ecc_cfgs: ECC config values From 2b8aa4c3e6a5d41b10b53da2017852f647d0345b Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 14 Dec 2023 15:29:43 +0800 Subject: [PATCH 0760/1562] mtd: rawnand: diskonchip: fix a potential double free in doc_probe When nand_scan() fails, it has cleaned up related resources in its error paths. Therefore, the following nand_cleanup() may lead to a double-free. One possible trace is: doc_probe |-> nand_scan | |-> nand_scan_with_ids | |-> nand_scan_tail | |-> kfree(chip->data_buf) [First free] | |-> nand_cleanup |-> kfree(chip->data_buf) [Double free here] Fix this by removing nand_cleanup() on failure of nand_scan(). Signed-off-by: Dinghao Liu Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231214072946.10285-1-dinghao.liu@zju.edu.cn --- drivers/mtd/nand/raw/diskonchip.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 5d2ddb037a9a..5243fab9face 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -1491,10 +1491,12 @@ static int __init doc_probe(unsigned long physadr) else numchips = doc2001_init(mtd); - if ((ret = nand_scan(nand, numchips)) || (ret = doc->late_init(mtd))) { - /* DBB note: i believe nand_cleanup is necessary here, as - buffers may have been allocated in nand_base. Check with - Thomas. FIX ME! */ + ret = nand_scan(nand, numchips); + if (ret) + goto fail; + + ret = doc->late_init(mtd); + if (ret) { nand_cleanup(nand); goto fail; } From b511e8e05b32d028d8369af3e369c924f98323ec Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 5 Dec 2023 08:54:31 +0100 Subject: [PATCH 0761/1562] mtd: ssfdc: Remove an unused variable Since its introduction the driver was declaring a "usecount" variable, but nobody ever used it upstream. This was spot while grepping for usecount through mtd/ for other reasons. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231205075431.13401-1-miquel.raynal@bootlin.com --- drivers/mtd/ssfdc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 04da685c36be..211f279a33a9 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -18,7 +18,6 @@ struct ssfdcr_record { struct mtd_blktrans_dev mbd; - int usecount; unsigned char heads; unsigned char sectors; unsigned short cylinders; From a7d84a2e7663bbe12394cc771107e04668ea313a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 5 Dec 2023 08:59:36 +0100 Subject: [PATCH 0762/1562] mtd: maps: vmu-flash: Fix the (mtd core) switch to ref counters While switching to ref counters for track mtd devices use, the vmu-flash driver was forgotten. The reason for reading the ref counter seems debatable, but let's just fix the build for now. Fixes: 19bfa9ebebb5 ("mtd: use refcount to prevent corruption") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312022315.79twVRZw-lkp@intel.com/ Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231205075936.13831-1-miquel.raynal@bootlin.com --- drivers/mtd/maps/vmu-flash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/maps/vmu-flash.c b/drivers/mtd/maps/vmu-flash.c index a7ec947a3ebb..53019d313db7 100644 --- a/drivers/mtd/maps/vmu-flash.c +++ b/drivers/mtd/maps/vmu-flash.c @@ -719,7 +719,7 @@ static int vmu_can_unload(struct maple_device *mdev) card = maple_get_drvdata(mdev); for (x = 0; x < card->partitions; x++) { mtd = &((card->mtd)[x]); - if (mtd->usecount > 0) + if (kref_read(&mtd->refcnt)) return 0; } return 1; From 93d6fda7f926451a0fa1121b9558d75ca47e861e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 29 Nov 2023 02:04:31 +0800 Subject: [PATCH 0763/1562] erofs: fix memory leak on short-lived bounced pages Both MicroLZMA and DEFLATE algorithms can use short-lived pages on demand for the overlapped inplace I/O decompression. However, those short-lived pages are actually added to `be->compressed_pages`. Thus, it should be checked instead of `pcl->compressed_bvecs`. The LZ4 algorithm doesn't work like this, so it won't be impacted. Fixes: 67139e36d970 ("erofs: introduce `z_erofs_parse_in_bvecs'") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231128180431.4116991-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a7e6847f6f8f..a33cd6757f98 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1309,12 +1309,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } From 3c12466b6b7bf1e56f9b32c366a3d83d87afb4de Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 12:55:34 +0800 Subject: [PATCH 0764/1562] erofs: fix lz4 inplace decompression Currently EROFS can map another compressed buffer for inplace decompression, that was used to handle the cases that some pages of compressed data are actually not in-place I/O. However, like most simple LZ77 algorithms, LZ4 expects the compressed data is arranged at the end of the decompressed buffer and it explicitly uses memmove() to handle overlapping: __________________________________________________________ |_ direction of decompression --> ____ |_ compressed data _| Although EROFS arranges compressed data like this, it typically maps two individual virtual buffers so the relative order is uncertain. Previously, it was hardly observed since LZ4 only uses memmove() for short overlapped literals and x86/arm64 memmove implementations seem to completely cover it up and they don't have this issue. Juhyung reported that EROFS data corruption can be found on a new Intel x86 processor. After some analysis, it seems that recent x86 processors with the new FSRM feature expose this issue with "rep movsb". Let's strictly use the decompressed buffer for lz4 inplace decompression for now. Later, as an useful improvement, we could try to tie up these two buffers together in the correct order. Reported-and-tested-by: Juhyung Park Closes: https://lore.kernel.org/r/CAD14+f2AVKf8Fa2OO1aAUdDNTDsVzzR6ctU_oJSmTyd6zSYR2Q@mail.gmail.com Fixes: 0ffd71bcc3a0 ("staging: erofs: introduce LZ4 decompression inplace") Fixes: 598162d05080 ("erofs: support decompress big pcluster for lz4 backend") Cc: stable # 5.4+ Tested-by: Yifan Zhao Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206045534.3920847-1-hsiangkao@linux.alibaba.com --- fs/erofs/decompressor.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 021be5feb1bc..e0d609c3958f 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -121,11 +121,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -135,12 +135,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { @@ -148,7 +149,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, return inpage; } kunmap_local(inpage); - might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -204,12 +204,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); @@ -230,11 +230,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -265,7 +266,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -308,7 +309,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) From 192351616a9dde686492bcb9d1e4895a1411a527 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:53 +0800 Subject: [PATCH 0765/1562] erofs: support I/O submission for sub-page compressed blocks Add a basic I/O submission path first to support sub-page blocks: - Temporary short-lived pages will be used entirely; - In-place I/O pages can be used partially, but compressed pages need to be able to be mapped in contiguous virtual memory. As a start, currently cache decompression is explicitly disabled for sub-page blocks, which will be supported in the future. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-2-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 156 ++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 82 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a33cd6757f98..8809ca62ab2f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1435,86 +1435,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } + mapping = READ_ONCE(page->mapping); - /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. + * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. */ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1523,25 +1522,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1596,7 +1593,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1608,7 +1605,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1631,17 +1627,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. */ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1654,7 +1647,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1673,18 +1667,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + (pcl->pclusterpages << PAGE_SHIFT); do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1695,7 +1685,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1703,23 +1694,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; From 54ed3fdd66055d073cb1cd2c6c65bbc0683c40cf Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:54 +0800 Subject: [PATCH 0766/1562] erofs: record `pclustersize` in bytes instead of pages Currently, compressed sizes are recorded in pages using `pclusterpages`, However, for tailpacking pclusters, `tailpacking_size` is used instead. This approach doesn't work when dealing with sub-page blocks. To address this, let's switch them to the unified `pclustersize` in bytes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-3-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 64 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8809ca62ab2f..d02989466711 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -56,6 +56,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -70,14 +73,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -115,9 +110,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -298,12 +291,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -312,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -559,6 +552,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -572,10 +566,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -585,6 +578,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -592,9 +586,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) @@ -626,6 +619,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -633,7 +627,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -657,6 +651,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool ret; int i; @@ -669,7 +664,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) goto out; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; @@ -778,20 +773,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -816,9 +811,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -1244,8 +1238,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decompressor = &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + int i, err2; struct page *page; bool overlapped; @@ -1282,18 +1275,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1668,7 +1656,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + (pcl->pclusterpages << PAGE_SHIFT); + end = cur + pcl->pclustersize; do { z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); if (!bvec.bv_page) From 8d2517aaeea3ab8651bb517bca8f3c8664d318ea Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:55 +0800 Subject: [PATCH 0767/1562] erofs: fix up compacted indexes for block size < 4096 Previously, the block size always equaled to PAGE_SIZE, therefore `lclusterbits` couldn't be less than 12. Since sub-page compressed blocks are now considered, `lobits` for a lcluster in each pack cannot always be `lclusterbits` as before. Otherwise, there is no enough room for the special value `Z_EROFS_LI_D0_CBLKCNT`. To support smaller block sizes, `lobits` for each compacted lcluster is now calculated as: lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1) Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-4-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7b55111fd533..9753875e41cb 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -82,29 +82,26 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; @@ -123,15 +120,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -140,6 +136,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -147,15 +144,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { @@ -174,8 +170,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_LI_D0_CBLKCNT) @@ -190,8 +186,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; @@ -202,8 +198,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; From 84712492e6dab803bf595fb8494d11098b74a652 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 14 Dec 2023 13:28:08 -0600 Subject: [PATCH 0768/1562] xfs: short circuit xfs_growfs_data_private() if delta is zero Although xfs_growfs_data() doesn't call xfs_growfs_data_private() if in->newblocks == mp->m_sb.sb_dblocks, xfs_growfs_data_private() further massages the new block count so that we don't i.e. try to create a too-small new AG. This may lead to a delta of "0" in xfs_growfs_data_private(), so we end up in the shrink case and emit the EXPERIMENTAL warning even if we're not changing anything at all. Fix this by returning straightaway if the block delta is zero. (nb: in older kernels, the result of entering the shrink case with delta == 0 may actually let an -ENOSPC escape to userspace, which is confusing for users.) Fixes: fb2fc1720185 ("xfs: support shrinking unused space in the last AG") Signed-off-by: Eric Sandeen Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4f5da19142f2..5e7255e6ad3e 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { From c0e37f07d2bd3c1ee3fb5a650da7d8673557ed16 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2023 13:38:45 -0800 Subject: [PATCH 0769/1562] xfs: fix an off-by-one error in xreap_agextent_binval Overall, this function tries to find and invalidate all buffers for a given extent of space on the data device. The inner for loop in this function tries to find all xfs_bufs for a given daddr. The lengths of all possible cached buffers range from 1 fsblock to the largest needed to contain a 64k xattr value (~17fsb). The scan is capped to avoid looking at anything buffer going past the given extent. Unfortunately, the loop continuation test is wrong -- max_fsbs is the largest size we want to scan, not one past that. Put another way, this loop is actually 1-indexed, not 0-indexed. Therefore, the continuation test should use <=, not <. As a result, online repairs of btree blocks fails to stale any buffers for btrees that are being torn down, which causes later assertions in the buffer cache when another thread creates a different-sized buffer. This happens in xfs/709 when allocating an inode cluster buffer: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3346128 at fs/xfs/xfs_message.c:104 assfail+0x3a/0x40 [xfs] CPU: 0 PID: 3346128 Comm: fsstress Not tainted 6.7.0-rc4-djwx #rc4 RIP: 0010:assfail+0x3a/0x40 [xfs] Call Trace: _xfs_buf_obj_cmp+0x4a/0x50 xfs_buf_get_map+0x191/0xba0 xfs_trans_get_buf_map+0x136/0x280 xfs_ialloc_inode_init+0x186/0x340 xfs_ialloc_ag_alloc+0x254/0x720 xfs_dialloc+0x21f/0x870 xfs_create_tmpfile+0x1a9/0x2f0 xfs_rename+0x369/0xfd0 xfs_vn_rename+0xfa/0x170 vfs_rename+0x5fb/0xc30 do_renameat2+0x52d/0x6e0 __x64_sys_renameat2+0x4b/0x60 do_syscall_64+0x3b/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e A later refactoring patch in the online repair series fixed this by accident, which is why I didn't notice this until I started testing only the patches that are likely to end up in 6.8. Fixes: 1c7ce115e521 ("xfs: reap large AG metadata extents when possible") Signed-off-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/scrub/reap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index ee26fcb500b7..300f49e8e14a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -248,7 +248,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; From 0573676fdde7ce3829ee6a42a8e5a56355234712 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 15 Dec 2023 08:40:35 +1100 Subject: [PATCH 0770/1562] xfs: initialise di_crc in xfs_log_dinode Alexander Potapenko report that KMSAN was issuing these warnings: kmalloc-ed xlog buffer of size 512 : ffff88802fc26200 kmalloc-ed xlog buffer of size 368 : ffff88802fc24a00 kmalloc-ed xlog buffer of size 648 : ffff88802b631000 kmalloc-ed xlog buffer of size 648 : ffff88802b632800 kmalloc-ed xlog buffer of size 648 : ffff88802b631c00 xlog_write_iovec: copying 12 bytes from ffff888017ddbbd8 to ffff88802c300400 xlog_write_iovec: copying 28 bytes from ffff888017ddbbe4 to ffff88802c30040c xlog_write_iovec: copying 68 bytes from ffff88802fc26274 to ffff88802c300428 xlog_write_iovec: copying 188 bytes from ffff88802fc262bc to ffff88802c30046c ===================================================== BUG: KMSAN: uninit-value in xlog_write_iovec fs/xfs/xfs_log.c:2227 BUG: KMSAN: uninit-value in xlog_write_full fs/xfs/xfs_log.c:2263 BUG: KMSAN: uninit-value in xlog_write+0x1fac/0x2600 fs/xfs/xfs_log.c:2532 xlog_write_iovec fs/xfs/xfs_log.c:2227 xlog_write_full fs/xfs/xfs_log.c:2263 xlog_write+0x1fac/0x2600 fs/xfs/xfs_log.c:2532 xlog_cil_write_chain fs/xfs/xfs_log_cil.c:918 xlog_cil_push_work+0x30f2/0x44e0 fs/xfs/xfs_log_cil.c:1263 process_one_work kernel/workqueue.c:2630 process_scheduled_works+0x1188/0x1e30 kernel/workqueue.c:2703 worker_thread+0xee5/0x14f0 kernel/workqueue.c:2784 kthread+0x391/0x500 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Uninit was created at: slab_post_alloc_hook+0x101/0xac0 mm/slab.h:768 slab_alloc_node mm/slub.c:3482 __kmem_cache_alloc_node+0x612/0xae0 mm/slub.c:3521 __do_kmalloc_node mm/slab_common.c:1006 __kmalloc+0x11a/0x410 mm/slab_common.c:1020 kmalloc ./include/linux/slab.h:604 xlog_kvmalloc fs/xfs/xfs_log_priv.h:704 xlog_cil_alloc_shadow_bufs fs/xfs/xfs_log_cil.c:343 xlog_cil_commit+0x487/0x4dc0 fs/xfs/xfs_log_cil.c:1574 __xfs_trans_commit+0x8df/0x1930 fs/xfs/xfs_trans.c:1017 xfs_trans_commit+0x30/0x40 fs/xfs/xfs_trans.c:1061 xfs_create+0x15af/0x2150 fs/xfs/xfs_inode.c:1076 xfs_generic_create+0x4cd/0x1550 fs/xfs/xfs_iops.c:199 xfs_vn_create+0x4a/0x60 fs/xfs/xfs_iops.c:275 lookup_open fs/namei.c:3477 open_last_lookups fs/namei.c:3546 path_openat+0x29ac/0x6180 fs/namei.c:3776 do_filp_open+0x24d/0x680 fs/namei.c:3809 do_sys_openat2+0x1bc/0x330 fs/open.c:1440 do_sys_open fs/open.c:1455 __do_sys_openat fs/open.c:1471 __se_sys_openat fs/open.c:1466 __x64_sys_openat+0x253/0x330 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 do_syscall_64+0x4f/0x140 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b arch/x86/entry/entry_64.S:120 Bytes 112-115 of 188 are uninitialized Memory access of size 188 starts at ffff88802fc262bc This is caused by the struct xfs_log_dinode not having the di_crc field initialised. Log recovery never uses this field (it is only present these days for on-disk format compatibility reasons) and so it's value is never checked so nothing in XFS has caught this. Further, none of the uninitialised memory access warning tools have caught this (despite catching other uninit memory accesses in the struct xfs_log_dinode back in 2017!) until recently. Alexander annotated the XFS code to get the dump of the actual bytes that were detected as uninitialised, and from that report it took me about 30s to realise what the issue was. The issue was introduced back in 2016 and every inode that is logged fails to initialise this field. This is no actual bad behaviour caused by this issue - I find it hard to even classify it as a bug... Reported-and-tested-by: Alexander Potapenko Fixes: f8d55aa0523a ("xfs: introduce inode log format object") Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode_item.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cd7803fda8b1..b35335e20342 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -557,6 +557,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; From 59a9ccf19ee03179faf047822bbec76cac7467a4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 Mar 2023 19:54:02 -0600 Subject: [PATCH 0771/1562] platform/chrome: cros_ec_vbc: Fix -Warray-bounds warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC-13 (and Clang) does not like having a partially allocated object, since it cannot reason about it for bounds checking. Notice that the compiler is legitimately complaining about accessing an object (params, in this case) for which not enough memory was allocated. The object is of size 20 bytes: struct ec_params_vbnvcontext { uint32_t op; /* 0 4 */ uint8_t block[16]; /* 4 16 */ /* size: 20, cachelines: 1, members: 2 */ /* last cacheline: 20 bytes */ }; but only 16 bytes are allocated: sizeof(struct ec_response_vbnvcontext) == 16 In this case, as only enough space for the op field is allocated, we can use an object of type uint32_t instead of a whole struct ec_params_vbnvcontext (for which not enough memory is allocated). Fix the following warning seen under GCC 13: drivers/platform/chrome/cros_ec_vbc.c: In function ‘vboot_context_read’: drivers/platform/chrome/cros_ec_vbc.c:36:15: warning: array subscript ‘struct ec_params_vbnvcontext[1]’ is partly outside array bounds of ‘unsigned char[36]’ [-Warray-bounds=] 36 | params->op = EC_VBNV_CONTEXT_OP_READ; | ^~ In file included from drivers/platform/chrome/cros_ec_vbc.c:12: In function ‘kmalloc’, inlined from ‘vboot_context_read’ at drivers/platform/chrome/cros_ec_vbc.c:30:8: ./include/linux/slab.h:580:24: note: at offset 20 into object of size 36 allocated by ‘kmalloc_trace’ 580 | return kmalloc_trace( | ^~~~~~~~~~~~~~ 581 | kmalloc_caches[kmalloc_type(flags)][index], | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 582 | flags, size); | ~~~~~~~~~~~~ Link: https://github.com/KSPP/linux/issues/278 Signed-off-by: "Gustavo A. R. Silva" Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/ZCTrutoN+9TiJM8u@work Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_vbc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c index 2e4af10c7679..274ea0c64b33 100644 --- a/drivers/platform/chrome/cros_ec_vbc.c +++ b/drivers/platform/chrome/cros_ec_vbc.c @@ -20,10 +20,14 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct cros_ec_dev *ec = to_cros_ec_dev(dev); struct cros_ec_device *ecdev = ec->ec_dev; - struct ec_params_vbnvcontext *params; struct cros_ec_command *msg; + /* + * This should be a pointer to the same type as op field in + * struct ec_params_vbnvcontext. + */ + uint32_t *params_op; int err; - const size_t para_sz = sizeof(params->op); + const size_t para_sz = sizeof(*params_op); const size_t resp_sz = sizeof(struct ec_response_vbnvcontext); const size_t payload = max(para_sz, resp_sz); @@ -32,8 +36,8 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj, return -ENOMEM; /* NB: we only kmalloc()ated enough space for the op field */ - params = (struct ec_params_vbnvcontext *)msg->data; - params->op = EC_VBNV_CONTEXT_OP_READ; + params_op = (uint32_t *)msg->data; + *params_op = EC_VBNV_CONTEXT_OP_READ; msg->version = EC_VER_VBNV_CONTEXT; msg->command = EC_CMD_VBNV_CONTEXT; From 87824da27b0aee399600d313667c1d812c2749d8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 21:05:19 +0100 Subject: [PATCH 0772/1562] ACPI: utils: Rearrange in acpi_evaluate_reference() The code in acpi_evaluate_reference() can be improved in some ways without changing its observable behavior. Among other things: * None of the local variables in that function except for buffer needs to be initialized. * The element local variable is only used in the for () loop block, so it can be defined there. * Multiple checks can be combined. * Code duplication related to error handling can be eliminated. * Redundant inner parens can be dropped. Modify the function as per the above. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 60 ++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 3dbc5eea0e17..5a7766c3fbbd 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -335,12 +335,10 @@ acpi_evaluate_reference(acpi_handle handle, struct acpi_object_list *arguments, struct acpi_handle_list *list) { - acpi_status status = AE_OK; - union acpi_object *package = NULL; - union acpi_object *element = NULL; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - u32 i = 0; - + union acpi_object *package; + acpi_status status; + u32 i; if (!list) return AE_BAD_PARAMETER; @@ -353,45 +351,32 @@ acpi_evaluate_reference(acpi_handle handle, package = buffer.pointer; - if ((buffer.length == 0) || !package) { + if (buffer.length == 0 || !package || + package->type != ACPI_TYPE_PACKAGE || !package->package.count) { status = AE_BAD_DATA; - acpi_util_eval_error(handle, pathname, status); - goto end; - } - if (package->type != ACPI_TYPE_PACKAGE) { - status = AE_BAD_DATA; - acpi_util_eval_error(handle, pathname, status); - goto end; - } - if (!package->package.count) { - status = AE_BAD_DATA; - acpi_util_eval_error(handle, pathname, status); - goto end; + goto err; } - list->handles = kcalloc(package->package.count, sizeof(*list->handles), GFP_KERNEL); - if (!list->handles) { - kfree(package); - return AE_NO_MEMORY; - } list->count = package->package.count; + list->handles = kcalloc(list->count, sizeof(*list->handles), GFP_KERNEL); + if (!list->handles) { + status = AE_NO_MEMORY; + goto err_clear; + } /* Extract package data. */ for (i = 0; i < list->count; i++) { - - element = &(package->package.elements[i]); + union acpi_object *element = &(package->package.elements[i]); if (element->type != ACPI_TYPE_LOCAL_REFERENCE) { status = AE_BAD_DATA; - acpi_util_eval_error(handle, pathname, status); - break; + goto err_free; } if (!element->reference.handle) { status = AE_NULL_ENTRY; - acpi_util_eval_error(handle, pathname, status); - break; + goto err_free; } /* Get the acpi_handle. */ @@ -399,16 +384,21 @@ acpi_evaluate_reference(acpi_handle handle, acpi_handle_debug(list->handles[i], "Found in reference list\n"); } - if (ACPI_FAILURE(status)) { - list->count = 0; - kfree(list->handles); - list->handles = NULL; - } - end: kfree(buffer.pointer); return status; + +err_free: + kfree(list->handles); + list->handles = NULL; + +err_clear: + list->count = 0; + +err: + acpi_util_eval_error(handle, pathname, status); + goto end; } EXPORT_SYMBOL(acpi_evaluate_reference); From 6909e0f322b0527fee9fdc54685e6cad69008713 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 21:06:04 +0100 Subject: [PATCH 0773/1562] ACPI: utils: Return bool from acpi_evaluate_reference() There are only 4 users of acpi_evaluate_reference() and none of them actually cares about the reason why it fails. All of them are only interested in whether or not it is successful, so it can return a bool value indicating that. Modify acpi_evaluate_reference() as per the observation above and update its callers accordingly so as to get rid of useless code and local variables. The observable behavior of the kernel is not expected to change after this modification of the code. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpss.c | 5 +-- drivers/acpi/scan.c | 5 ++- drivers/acpi/thermal.c | 4 +-- drivers/acpi/utils.c | 32 +++++++------------ .../platform/surface/surface_acpi_notify.c | 4 +-- include/acpi/acpi_bus.h | 8 ++--- 6 files changed, 20 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 79f4fc7d6871..1623af8d62bc 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -565,16 +565,13 @@ static struct device *acpi_lpss_find_device(const char *hid, const char *uid) static bool acpi_lpss_dep(struct acpi_device *adev, acpi_handle handle) { struct acpi_handle_list dep_devices; - acpi_status status; bool ret = false; int i; if (!acpi_has_method(adev->handle, "_DEP")) return false; - status = acpi_evaluate_reference(adev->handle, "_DEP", NULL, - &dep_devices); - if (ACPI_FAILURE(status)) { + if (!acpi_evaluate_reference(adev->handle, "_DEP", NULL, &dep_devices)) { dev_dbg(&adev->dev, "Failed to evaluate _DEP.\n"); return false; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 02bb2cce423f..7b731958af5e 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1984,7 +1984,6 @@ static void acpi_scan_init_hotplug(struct acpi_device *adev) static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) { struct acpi_handle_list dep_devices; - acpi_status status; u32 count; int i; @@ -1998,8 +1997,7 @@ static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) !acpi_has_method(handle, "_HID")) return 0; - status = acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices); - if (ACPI_FAILURE(status)) { + if (!acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices)) { acpi_handle_debug(handle, "Failed to evaluate _DEP.\n"); return 0; } @@ -2008,6 +2006,7 @@ static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep) struct acpi_device_info *info; struct acpi_dep_data *dep; bool skip, honor_dep; + acpi_status status; status = acpi_get_object_info(dep_devices.handles[i], &info); if (ACPI_FAILURE(status)) { diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index f74d81abdbfc..15f09c71a5ec 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -247,7 +247,6 @@ static bool update_trip_devices(struct acpi_thermal *tz, { struct acpi_handle_list devices = { 0 }; char method[] = "_PSL"; - acpi_status status; if (index != ACPI_THERMAL_TRIP_PASSIVE) { method[1] = 'A'; @@ -255,8 +254,7 @@ static bool update_trip_devices(struct acpi_thermal *tz, method[3] = '0' + index; } - status = acpi_evaluate_reference(tz->device->handle, method, NULL, &devices); - if (ACPI_FAILURE(status)) { + if (!acpi_evaluate_reference(tz->device->handle, method, NULL, &devices)) { acpi_handle_info(tz->device->handle, "%s evaluation failure\n", method); return false; } diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 5a7766c3fbbd..958dc651d467 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -329,19 +329,18 @@ const char *acpi_get_subsystem_id(acpi_handle handle) } EXPORT_SYMBOL_GPL(acpi_get_subsystem_id); -acpi_status -acpi_evaluate_reference(acpi_handle handle, - acpi_string pathname, - struct acpi_object_list *arguments, - struct acpi_handle_list *list) +bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, + struct acpi_object_list *arguments, + struct acpi_handle_list *list) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *package; acpi_status status; + bool ret = false; u32 i; if (!list) - return AE_BAD_PARAMETER; + return false; /* Evaluate object. */ @@ -352,42 +351,35 @@ acpi_evaluate_reference(acpi_handle handle, package = buffer.pointer; if (buffer.length == 0 || !package || - package->type != ACPI_TYPE_PACKAGE || !package->package.count) { - status = AE_BAD_DATA; + package->type != ACPI_TYPE_PACKAGE || !package->package.count) goto err; - } list->count = package->package.count; list->handles = kcalloc(list->count, sizeof(*list->handles), GFP_KERNEL); - if (!list->handles) { - status = AE_NO_MEMORY; + if (!list->handles) goto err_clear; - } /* Extract package data. */ for (i = 0; i < list->count; i++) { union acpi_object *element = &(package->package.elements[i]); - if (element->type != ACPI_TYPE_LOCAL_REFERENCE) { - status = AE_BAD_DATA; + if (element->type != ACPI_TYPE_LOCAL_REFERENCE || + !element->reference.handle) goto err_free; - } - if (!element->reference.handle) { - status = AE_NULL_ENTRY; - goto err_free; - } /* Get the acpi_handle. */ list->handles[i] = element->reference.handle; acpi_handle_debug(list->handles[i], "Found in reference list\n"); } + ret = true; + end: kfree(buffer.pointer); - return status; + return ret; err_free: kfree(list->handles); diff --git a/drivers/platform/surface/surface_acpi_notify.c b/drivers/platform/surface/surface_acpi_notify.c index e4dee920da18..96ec052d0940 100644 --- a/drivers/platform/surface/surface_acpi_notify.c +++ b/drivers/platform/surface/surface_acpi_notify.c @@ -740,15 +740,13 @@ static bool is_san_consumer(struct platform_device *pdev, acpi_handle handle) { struct acpi_handle_list dep_devices; acpi_handle supplier = ACPI_HANDLE(&pdev->dev); - acpi_status status; bool ret = false; int i; if (!acpi_has_method(handle, "_DEP")) return false; - status = acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices); - if (ACPI_FAILURE(status)) { + if (!acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices)) { san_consumer_dbg(&pdev->dev, handle, "failed to evaluate _DEP\n"); return false; } diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index aae31552c574..fbe60af56b34 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -25,11 +25,9 @@ acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); -acpi_status -acpi_evaluate_reference(acpi_handle handle, - acpi_string pathname, - struct acpi_object_list *arguments, - struct acpi_handle_list *list); +bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, + struct acpi_object_list *arguments, + struct acpi_handle_list *list); bool acpi_handle_list_equal(struct acpi_handle_list *list1, struct acpi_handle_list *list2); void acpi_handle_list_replace(struct acpi_handle_list *dst, From 1feb042d4e9b30b3ec3363e557d2ba884485f835 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 21:06:45 +0100 Subject: [PATCH 0774/1562] ACPI: utils: Refine acpi_handle_list_equal() slightly It is somewhat better to use the size of the first array element for computing the size of the entire array than to rely on the array element data type definition knowledge and the former is also consistent with the array allocation in acpi_evaluate_reference(), so modify the code accordingly. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 958dc651d467..57663065dbf6 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -408,7 +408,7 @@ bool acpi_handle_list_equal(struct acpi_handle_list *list1, { return list1->count == list2->count && !memcmp(list1->handles, list2->handles, - list1->count * sizeof(acpi_handle)); + list1->count * sizeof(*list1->handles)); } EXPORT_SYMBOL_GPL(acpi_handle_list_equal); From 4c660ffef34b7d645ae3144369bc50257f295212 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 21:07:41 +0100 Subject: [PATCH 0775/1562] ACPI: utils: Fix white space in struct acpi_handle_list definition Fix inadvertently introduced white space damage in the struct acpi_handle_list definition. No functional impact. Fixes: 2e57d10a6591 ("ACPI: utils: Dynamically determine acpi_handle_list size") Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index fbe60af56b34..3dcf07b41428 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -14,7 +14,7 @@ struct acpi_handle_list { u32 count; - acpi_handle* handles; + acpi_handle *handles; }; /* acpi_utils.h */ From 0c4cae1bc00d31c78858c184ede351baea232bdb Mon Sep 17 00:00:00 2001 From: Chris Feng Date: Wed, 13 Dec 2023 16:32:51 +0800 Subject: [PATCH 0776/1562] PM: hibernate: Avoid missing wakeup events during hibernation Wakeup events that occur in the hibernation process's hibernation_platform_enter() cannot wake up the system. Although the current hibernation framework will execute part of the recovery process after a wakeup event occurs, it ultimately performs a shutdown operation because the system does not check the return value of hibernation_platform_enter(). In short, if a wakeup event occurs before putting the system into the final low-power state, it will be missed. To solve this problem, check the return value of hibernation_platform_enter(). When it returns -EAGAIN or -EBUSY (indicate the occurrence of a wakeup event), execute the hibernation recovery process, discard the previously saved image, and ultimately return to the working state. Signed-off-by: Chris Feng [ rjw: Rephrase the message printed when going back to the working state ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 10 ++++++++-- kernel/power/power.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index dee341ae4ace..4b0b7cf2e019 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -642,9 +642,9 @@ int hibernation_platform_enter(void) */ static void power_down(void) { -#ifdef CONFIG_SUSPEND int error; +#ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); if (error) { @@ -667,7 +667,13 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - hibernation_platform_enter(); + error = hibernation_platform_enter(); + if (error == -EAGAIN || error == -EBUSY) { + swsusp_unmark(); + events_check_enabled = false; + pr_info("Wakeup event detected during hibernation, rolling back.\n"); + return; + } fallthrough; case HIBERNATION_SHUTDOWN: if (kernel_can_power_off()) diff --git a/kernel/power/power.h b/kernel/power/power.h index 17fd9aaaf084..8499a39c62f4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -175,6 +175,8 @@ extern int swsusp_write(unsigned int flags); void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); +#else +static inline int swsusp_unmark(void) { return 0; } #endif struct __kernel_old_timeval; From 71cd7e80cfde548959952eac7063aeaea1f2e1c6 Mon Sep 17 00:00:00 2001 From: Hongchen Zhang Date: Thu, 16 Nov 2023 08:56:09 +0800 Subject: [PATCH 0777/1562] PM: hibernate: Enforce ordering during image compression/decompression An S4 (suspend to disk) test on the LoongArch 3A6000 platform sometimes fails with the following error messaged in the dmesg log: Invalid LZO compressed length That happens because when compressing/decompressing the image, the synchronization between the control thread and the compress/decompress/crc thread is based on a relaxed ordering interface, which is unreliable, and the following situation may occur: CPU 0 CPU 1 save_image_lzo lzo_compress_threadfn atomic_set(&d->stop, 1); atomic_read(&data[thr].stop) data[thr].cmp = data[thr].cmp_len; WRITE data[thr].cmp_len Then CPU0 gets a stale cmp_len and writes it to disk. During resume from S4, wrong cmp_len is loaded. To maintain data consistency between the two threads, use the acquire/release variants of atomic set and read operations. Fixes: 081a9d043c98 ("PM / Hibernate: Improve performance of LZO/plain hibernation, checksum image") Cc: All applicable Signed-off-by: Hongchen Zhang Co-developed-by: Weihao Li Signed-off-by: Weihao Li [ rjw: Subject rewrite and changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 68973ca2cf07..975e7195573b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -606,11 +606,11 @@ static int crc32_threadfn(void *data) unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -619,7 +619,7 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -649,12 +649,12 @@ static int lzo_compress_threadfn(void *data) struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -663,7 +663,7 @@ static int lzo_compress_threadfn(void *data) d->ret = lzo1x_1_compress(d->unc, d->unc_len, d->cmp + LZO_HEADER, &d->cmp_len, d->wrk); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -798,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -806,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -850,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } @@ -1132,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data) struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -1150,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -1335,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1371,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -1390,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -1421,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } @@ -1429,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle, } crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } stop = ktime_get(); From e695c1fc5a3db1e161abe8061d715a504aff3f9f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 15 Dec 2023 12:33:37 +0100 Subject: [PATCH 0778/1562] spi: pl022: delete description of cur_msg The variable cur_msg was removed, but its description is left behind. Delete this description. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312151816.munFeE4L-lkp@intel.com/ Signed-off-by: Nam Cao Reviewed-by: Linus Walleij Link: https://msgid.link/r/f06a9b6eac184cc648ae7444c480add6da87a84d.1702639801.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index 3baf45da01cd..bdec67cf45f2 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -338,7 +338,6 @@ struct vendor_data { * @clk: outgoing clock "SPICLK" for the SPI bus * @host: SPI framework hookup * @host_info: controller-specific data from machine setup - * @cur_msg: Pointer to current spi_message being processed * @cur_transfer: Pointer to current spi_transfer * @cur_chip: pointer to current clients chip(assigned from controller_state) * @tx: current position in TX buffer to be read From 644f315d12ea29a67bc985d06ab0962452eb3605 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 15 Dec 2023 12:33:38 +0100 Subject: [PATCH 0779/1562] spi: pl022: update description of internal_cs_control() The arguments of internal_cs_control() was changed, but its description was not updated. Update the description to match the expected arguments. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312151816.munFeE4L-lkp@intel.com/ Signed-off-by: Nam Cao Reviewed-by: Linus Walleij Link: https://msgid.link/r/4036d8d5845c04179f330f83e825a3921aa50c5a.1702639801.git.namcao@linutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-pl022.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index bdec67cf45f2..de63cf0557ce 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -419,7 +419,7 @@ struct chip_data { /** * internal_cs_control - Control chip select signals via SSP_CSR. * @pl022: SSP driver private data structure - * @command: select/delect the chip + * @enable: select/delect the chip * * Used on controller with internal chip select control via SSP_CSR register * (vendor extension). Each of the 5 LSB in the register controls one chip From 04e6ccfc93c5a1aa1d75a537cf27e418895e20ea Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Dec 2023 11:52:25 +0100 Subject: [PATCH 0780/1562] thermal: core: Fix NULL pointer dereference in zone registration error path If device_register() in thermal_zone_device_register_with_trips() returns an error, the tz variable is set to NULL and subsequently dereferenced in kfree(tz->tzp). Commit adc8749b150c ("thermal/drivers/core: Use put_device() if device_register() fails") added the tz = NULL assignment in question to avoid a possible double-free after dropping the reference to the zone device. However, after commit 4649620d9404 ("thermal: core: Make thermal_zone_device_unregister() return after freeing the zone"), that assignment has become redundant, because dropping the reference to the zone device does not cause the zone object to be freed any more. Drop it to address the NULL pointer dereference. Fixes: 3d439b1a2ad3 ("thermal/core: Alloc-copy-free the thermal zone parameters structure") Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba --- drivers/thermal/thermal_core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 2415dc50c31d..5e5fcbd81dda 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1393,7 +1393,6 @@ unregister: device_del(&tz->device); release_device: put_device(&tz->device); - tz = NULL; remove_id: ida_free(&thermal_tz_ida, id); free_tzp: From 13ae04d8d45227c2ba51e188daf9fc13d08a1b12 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:27 -0800 Subject: [PATCH 0781/1562] xfs: force all buffers to be written during btree bulk load While stress-testing online repair of btrees, I noticed periodic assertion failures from the buffer cache about buffers with incorrect DELWRI_Q state. Looking further, I observed this race between the AIL trying to write out a btree block and repair zapping a btree block after the fact: AIL: Repair0: pin buffer X delwri_queue: set DELWRI_Q add to delwri list stale buf X: clear DELWRI_Q does not clear b_list free space X commit delwri_submit # oops Worse yet, I discovered that running the same repair over and over in a tight loop can result in a second race that cause data integrity problems with the repair: AIL: Repair0: Repair1: pin buffer X delwri_queue: set DELWRI_Q add to delwri list stale buf X: clear DELWRI_Q does not clear b_list free space X commit find free space X get buffer rewrite buffer delwri_queue: set DELWRI_Q already on a list, do not add commit BAD: committed tree root before all blocks written delwri_submit # too late now I traced this to my own misunderstanding of how the delwri lists work, particularly with regards to the AIL's buffer list. If a buffer is logged and committed, the buffer can end up on that AIL buffer list. If btree repairs are run twice in rapid succession, it's possible that the first repair will invalidate the buffer and free it before the next time the AIL wakes up. Marking the buffer stale clears DELWRI_Q from the buffer state without removing the buffer from its delwri list. The buffer doesn't know which list it's on, so it cannot know which lock to take to protect the list for a removal. If the second repair allocates the same block, it will then recycle the buffer to start writing the new btree block. Meanwhile, if the AIL wakes up and walks the buffer list, it will ignore the buffer because it can't lock it, and go back to sleep. When the second repair calls delwri_queue to put the buffer on the list of buffers to write before committing the new btree, it will set DELWRI_Q again, but since the buffer hasn't been removed from the AIL's buffer list, it won't add it to the bulkload buffer's list. This is incorrect, because the bulkload caller relies on delwri_submit to ensure that all the buffers have been sent to disk /before/ committing the new btree root pointer. This ordering requirement is required for data consistency. Worse, the AIL won't clear DELWRI_Q from the buffer when it does finally drop it, so the next thread to walk through the btree will trip over a debug assertion on that flag. To fix this, create a new function that waits for the buffer to be removed from any other delwri lists before adding the buffer to the caller's delwri list. By waiting for the buffer to clear both the delwri list and any potential delwri wait list, we can be sure that repair will initiate writes of all buffers and report all write errors back to userspace instead of committing the new structure. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 4 +-- fs/xfs/xfs_buf.c | 44 ++++++++++++++++++++++++++++--- fs/xfs/xfs_buf.h | 1 + 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..29e3f8ccb185 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -342,9 +342,7 @@ xfs_btree_bload_drop_buf( if (*bpp == NULL) return; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); - + xfs_buf_delwri_queue_here(*bpp, buffers_list); xfs_buf_relse(*bpp); *bpp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..ec4bd7a24d88 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2049,6 +2049,14 @@ error_free: return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. * @@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2119,6 +2127,34 @@ xfs_buf_delwri_queue( return true; } +/* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. + */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + /* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons @@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b470de08a46c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); From c1e0f8e6fb060b23b6f1b82eb4265983f7d271f8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:27 -0800 Subject: [PATCH 0782/1562] xfs: set XBF_DONE on newly formatted btree block that are ready for writing The btree bulkloading code calls xfs_buf_delwri_queue_here when it has finished formatting a new btree block and wants to queue it to be written to disk. Once the new btree root has been committed, the blocks (and hence the buffers) will be accessible to the rest of the filesystem. Mark each new buffer as DONE when adding it to the delwri list so that the next btree traversal can skip reloading the contents from disk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 29e3f8ccb185..1c5f9ed70c3e 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -342,6 +342,12 @@ xfs_btree_bload_drop_buf( if (*bpp == NULL) return; + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. + */ + (*bpp)->b_flags |= XBF_DONE; + xfs_buf_delwri_queue_here(*bpp, buffers_list); xfs_buf_relse(*bpp); *bpp = NULL; From 26de64629d8b439a03bce243f14a46f7440729f3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:28 -0800 Subject: [PATCH 0783/1562] xfs: read leaf blocks when computing keys for bulkloading into node blocks When constructing a new btree, xfs_btree_bload_node needs to read the btree blocks for level N to compute the keyptrs for the blocks that will be loaded into level N+1. The level N blocks must be formatted at that point. A subsequent patch will change the btree bulkloader to write new btree blocks in 256K chunks to moderate memory consumption if the new btree is very large. As a consequence of that, it's possible that the buffers for lower level blocks might have been reclaimed by the time the node builder comes back to the block. Therefore, change xfs_btree_bload_node to read the lower level blocks to handle the reclaimed buffer case. As a side effect, the read will increase the LRU refs, which will bias towards keeping new btree buffers in memory after the new btree commits. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 3 +++ fs/xfs/libxfs/xfs_btree_staging.c | 7 ++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..c100e92140be 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..e0875cec4939 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 1c5f9ed70c3e..c8b46ac3923f 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -489,7 +489,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; From a20ffa7d9f863056364b11a680145a76ef15acb2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:28 -0800 Subject: [PATCH 0784/1562] xfs: add debug knobs to control btree bulk load slack factors Add some debug knobs so that we can control the leaf and node block slack when rebuilding btrees. For developers, it might be useful to construct btrees of various heights by crafting a filesystem with a certain number of records and then using repair+knobs to rebuild the index with a certain shape. Practically speaking, you'd only ever do that for extreme stress testing of the runtime code or the btree generator. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 11 ++++++--- fs/xfs/xfs_globals.c | 12 ++++++++++ fs/xfs/xfs_sysctl.h | 2 ++ fs/xfs/xfs_sysfs.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 992cf34a13e7..46883606ad88 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -32,6 +32,7 @@ * btree bulk loading code calculates for us. However, there are some * exceptions to this rule: * + * (0) If someone turned one of the debug knobs. * (1) If this is a per-AG btree and the AG has less than 10% space free. * (2) If this is an inode btree and the FS has less than 10% space free. @@ -47,9 +48,13 @@ xrep_newbt_estimate_slack( uint64_t free; uint64_t sz; - /* Let the btree code compute the default slack values. */ - bload->leaf_slack = -1; - bload->node_slack = -1; + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; if (sc->ops->type == ST_PERAG) { free = sc->sa.pag->pagf_freeblks; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. + */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 871f16a4a5d8..17485666b672 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -262,6 +262,58 @@ larp_show( XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), @@ -271,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif + ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); From 6dfeb0c2ecde71d61af77f65eabbdd6ca9315161 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:29 -0800 Subject: [PATCH 0785/1562] xfs: move btree bulkload record initialization to ->get_record implementations When we're performing a bulk load of a btree, move the code that actually stores the btree record in the new btree block out of the generic code and into the individual ->get_record implementations. This is preparation for being able to store multiple records with a single indirect call. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 17 +++++++---------- fs/xfs/libxfs/xfs_btree_staging.h | 15 ++++++++++----- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index c8b46ac3923f..cd409a2ee87b 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -440,22 +440,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. */ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -798,7 +795,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 5f638f711246..bd5b3f004823 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -47,7 +47,9 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork, const struct xfs_btree_ops *ops); /* Bulk loading of staged btrees. */ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, @@ -55,11 +57,14 @@ typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. */ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer From e069d549705e49841247acf9b3176744e27d5425 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:29 -0800 Subject: [PATCH 0786/1562] xfs: constrain dirty buffers while formatting a staged btree Constrain the number of dirty buffers that are locked by the btree staging code at any given time by establishing a threshold at which we put them all on the delwri queue and push them to disk. This limits memory consumption while writing out new btrees. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 50 ++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_btree_staging.h | 10 +++++++ fs/xfs/scrub/newbt.c | 1 + 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index cd409a2ee87b..0c978a31e284 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -333,24 +333,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; /* * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent * xfs_buf_read will not pointlessly reread the contents from the disk. */ - (*bpp)->b_flags |= XBF_DONE; + bp->b_flags |= XBF_DONE; - xfs_buf_delwri_queue_here(*bpp, buffers_list); - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -422,7 +439,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. */ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -770,6 +790,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -808,7 +829,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. */ for (level = 1; level < cur->bc_nlevels; level++) { @@ -850,7 +874,11 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index bd5b3f004823..f0a5007284ef 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -112,6 +112,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. */ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 46883606ad88..81919eeabcdb 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -94,6 +94,7 @@ xrep_newbt_init_ag( xnr->alloc_hint = alloc_hint; xnr->resv = resv; INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ xrep_newbt_estimate_slack(xnr); } From 6ece924b95226235059ed2ffc2c0f44a124c5910 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:30 -0800 Subject: [PATCH 0787/1562] xfs: create separate structures and code for u32 bitmaps Create a version of the xbitmap that handles 32-bit integer intervals and adapt the xfs_agblock_t bitmap to use it. This reduces the size of the interval tree nodes from 48 to 36 bytes and enables us to use a more efficient slab (:0000040 instead of :0000048) which allows us to pack more nodes into a single slab page (102 vs 85). As a side effect, the users of these bitmaps no longer have to convert between u32 and u64 quantities just to use the bitmap; and the hairy overflow checking code in xagb_bitmap_test goes away. Later in this patchset we're going to add bitmaps for xfs_agino_t, xfs_rgblock_t, and xfs_dablk_t, so the increase in code size (5622 vs. 9959 bytes) seems worth it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader_repair.c | 9 +- fs/xfs/scrub/bitmap.c | 518 +++++++++++++++++++++++++-------- fs/xfs/scrub/bitmap.h | 92 +++--- fs/xfs/scrub/reap.c | 5 +- 4 files changed, 458 insertions(+), 166 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..4000bdc8b500 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -494,12 +494,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -647,8 +646,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..503b79010002 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. */ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. */ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. */ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,6 +230,345 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + +/* + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. + */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) +{ + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. */ +int +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) +{ + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; +} + +/* + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. + */ +int +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) +{ + struct xbitmap32_node *bn; + int error; + + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; + } + + return 0; +} + +/* How many bits are set in this bitmap? */ +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t ret = 0; + + for_each_xbitmap32_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, + void *priv) +{ + struct xbitmap32_node *bn; + int error = 0; + + for_each_xbitmap32_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap32_empty( + struct xbitmap32 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) +{ + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; + + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* xfs_agblock_t bitmap */ + /* * Record all btree blocks seen while iterating all records of a btree. * @@ -316,66 +657,3 @@ xagb_bitmap_set_btcur_path( return 0; } - -/* How many bits are set in this bitmap? */ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) -{ - struct xbitmap_node *bn; - uint64_t ret = 0; - - for_each_xbitmap_extent(bn, bitmap) - ret += bn->bn_last - bn->bn_start + 1; - - return ret; -} - -/* Call a function for every run of set bits in this bitmap. */ -int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, - void *priv) -{ - struct xbitmap_node *bn; - int error = 0; - - for_each_xbitmap_extent(bn, bitmap) { - error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); - if (error) - break; - } - - return error; -} - -/* Does this bitmap have no bits set at all? */ -bool -xbitmap_empty( - struct xbitmap *bitmap) -{ - return bitmap->xb_root.rb_root.rb_node == NULL; -} - -/* Is the start of the range set or clear? And for how long? */ -bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) -{ - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; - - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); - if (!bn) - return false; - if (bn->bn_start <= start) { - if (bn->bn_last < last) - *len = bn->bn_last - start + 1; - return true; - } - *len = bn->bn_start - start; - return false; -} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..231b27c09b4e 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,79 +27,93 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. */ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); + +/* u32 bitmap */ + +struct xbitmap32 { + struct rb_root_cached xb_root; +}; + +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); + +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); + +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. + */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); + +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); /* Bitmaps, but for type-checked for xfs_agblock_t */ struct xagb_bitmap { - struct xbitmap agbitmap; + struct xbitmap32 agbitmap; }; static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) { - xbitmap_init(&bitmap->agbitmap); + xbitmap32_init(&bitmap->agbitmap); } static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) { - xbitmap_destroy(&bitmap->agbitmap); + xbitmap32_destroy(&bitmap->agbitmap); } static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, xfs_agblock_t start, xfs_extlen_t len) { - return xbitmap_clear(&bitmap->agbitmap, start, len); + return xbitmap32_clear(&bitmap->agbitmap, start, len); } static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, xfs_agblock_t start, xfs_extlen_t len) { - return xbitmap_set(&bitmap->agbitmap, start, len); + return xbitmap32_set(&bitmap->agbitmap, start, len); } -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) { - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; + return xbitmap32_test(&bitmap->agbitmap, start, len); } static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, struct xagb_bitmap *sub) { - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); } static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) { - return xbitmap_hweight(&bitmap->agbitmap); + return xbitmap32_hweight(&bitmap->agbitmap); } static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) { - return xbitmap_empty(&bitmap->agbitmap); + return xbitmap32_empty(&bitmap->agbitmap); } static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) + xbitmap32_walk_fn fn, void *priv) { - return xbitmap_walk(&bitmap->agbitmap, fn, priv); + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); } int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 300f49e8e14a..b1f112ecc062 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -430,13 +430,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; From 0f08af0f9f3eb4a67fa3849c63e918bac9773da8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:30 -0800 Subject: [PATCH 0788/1562] xfs: move the per-AG datatype bitmaps to separate files Move struct xagb_bitmap to its own pair of C and header files per request of Christoph. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/agb_bitmap.c | 103 +++++++++++++++++++++++++++++++++ fs/xfs/scrub/agb_bitmap.h | 68 ++++++++++++++++++++++ fs/xfs/scrub/agheader_repair.c | 1 + fs/xfs/scrub/bitmap.c | 91 ----------------------------- fs/xfs/scrub/bitmap.h | 59 ------------------- fs/xfs/scrub/reap.c | 1 + fs/xfs/scrub/rmap.c | 1 + 8 files changed, 175 insertions(+), 150 deletions(-) create mode 100644 fs/xfs/scrub/agb_bitmap.c create mode 100644 fs/xfs/scrub/agb_bitmap.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 1537d66e5ab0..eb557dca9373 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..ed08f76ff4f3 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 4000bdc8b500..52956c0b8f79 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 503b79010002..1449bb5262d9 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -566,94 +566,3 @@ xbitmap32_test( *len = bn->bn_start - start; return false; } - -/* xfs_agblock_t bitmap */ - -/* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. - * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. - */ - -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) -{ - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; - - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); - - return xagb_bitmap_set(bitmap, agbno, 1); -} - -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ -int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) -{ - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); -} - -/* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. - */ -int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) -{ - int i; - int error; - - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); - if (error) - return error; - } - - return 0; -} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 231b27c09b4e..2df8911606d6 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -62,63 +62,4 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, bool xbitmap32_empty(struct xbitmap32 *bitmap); bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ - -struct xagb_bitmap { - struct xbitmap32 agbitmap; -}; - -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap32_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap32_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap32_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap32_set(&bitmap->agbitmap, start, len); -} - -static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t *len) -{ - return xbitmap32_test(&bitmap->agbitmap, start, len); -} - -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); -} - -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap32_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap32_empty(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap32_walk_fn fn, void *priv) -{ - return xbitmap32_walk(&bitmap->agbitmap, fn, priv); -} - -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); - #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b1f112ecc062..bfc3583132ac 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -37,6 +37,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index d29a26ecddd6..c99d1714f283 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" /* * Set us up to scrub reverse mapping btrees. From efb43b355457dab474c7eb40d6b2f3cb04c24ecf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:31 -0800 Subject: [PATCH 0789/1562] xfs: roll the scrub transaction after completing a repair When we've finished repairing an AG header, roll the scrub transaction. This ensure that any failures caused by defer ops failing are captured by the xrep_done tracepoint and that any stacktraces that occur will point to the repair code that caused it, instead of xchk_teardown. Going forward, repair functions should commit the transaction if they're going to return success. Usually the space reaping functions that run after a successful atomic commit of the new metadata will take care of that for us. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader_repair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 52956c0b8f79..26bd1ff68f1b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -73,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -342,7 +342,7 @@ xrep_agf_commit_new( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -789,6 +789,9 @@ xrep_agfl( /* Dump any AGFL overflow. */ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -962,7 +965,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ From 8bd0bf570bd7b5cbcce3f70b760d8dcccd8df6c8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:31 -0800 Subject: [PATCH 0790/1562] xfs: remove trivial bnobt/inobt scrub helpers Christoph Hellwig complained about awkward code in the next two repair patches such as: sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; error = xchk_bnobt(sc); This is a little silly, so let's export the xchk_{,i}allocbt functions to the dispatch table in scrub.c directly and get rid of the helpers. Originally I had planned each btree gets its own separate entry point, but since repair doesn't work that way, it no longer makes sense to complicate the call chain that way. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/alloc.c | 34 +++++++++++++++------------------- fs/xfs/scrub/ialloc.c | 37 ++++++++++++++++++------------------- fs/xfs/scrub/scrub.c | 8 ++++---- fs/xfs/scrub/scrub.h | 6 ++---- 4 files changed, 39 insertions(+), 46 deletions(-) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..eb8ec47fc129 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -138,33 +138,29 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} - -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); -} - /* xref check that the extent is not free */ void xchk_xref_is_used_space( diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..83d9a29ce91e 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. */ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -720,9 +719,23 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; + xfs_btnum_t which; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + which = XFS_BTNUM_INO; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + which = XFS_BTNUM_FINO; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) @@ -743,20 +756,6 @@ xchk_iallocbt( return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..31fabae588be 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -238,25 +238,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, + .scrub = xchk_allocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, + .scrub = xchk_allocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, + .scrub = xchk_iallocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, .repair = xrep_notsupported, }, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..a6a1bea4d62b 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -129,10 +129,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); From 4bdfd7d15747b170ce93a06fafccaf20544b6684 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:32 -0800 Subject: [PATCH 0791/1562] xfs: repair free space btrees Rebuild the free space btrees from the gaps in the rmap btree. Refer to the case study in Documentation/filesystems/xfs-online-fsck-design.rst for more details. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ag.h | 9 + fs/xfs/libxfs/xfs_ag_resv.c | 2 + fs/xfs/libxfs/xfs_alloc.c | 10 +- fs/xfs/libxfs/xfs_alloc.h | 2 +- fs/xfs/libxfs/xfs_alloc_btree.c | 13 +- fs/xfs/libxfs/xfs_types.h | 7 + fs/xfs/scrub/alloc.c | 18 +- fs/xfs/scrub/alloc_repair.c | 934 ++++++++++++++++++++++++++++++++ fs/xfs/scrub/common.h | 19 + fs/xfs/scrub/newbt.c | 48 +- fs/xfs/scrub/newbt.h | 3 + fs/xfs/scrub/repair.c | 72 +++ fs/xfs/scrub/repair.h | 24 + fs/xfs/scrub/scrub.c | 14 +- fs/xfs/scrub/scrub.h | 8 + fs/xfs/scrub/trace.h | 24 +- fs/xfs/scrub/xfarray.h | 22 + fs/xfs/xfs_extent_busy.c | 13 + fs/xfs/xfs_extent_busy.h | 2 + 20 files changed, 1224 insertions(+), 21 deletions(-) create mode 100644 fs/xfs/scrub/alloc_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index eb557dca9373..3af3cadc1ca1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -182,6 +182,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ newbt.o \ reap.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..f16cb7a174d4 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -80,6 +80,15 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 60c2c18e8e54..3bd0a33fee0a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. */ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -299,7 +297,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -3944,7 +3942,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 851cafbd6449..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..a7032bf0cd37 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -323,7 +323,18 @@ xfs_allocbt_verify( if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel = pag->pagf_levels[btnum]; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_levels[btnum]); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 533200c4ccc2..035bf703d719 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -208,6 +208,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index eb8ec47fc129..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..45edda096869 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. */ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. + */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. */ + bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, + pag, XFS_BTNUM_BNO); + cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, + pag, XFS_BTNUM_CNT); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = + ra->new_bnobt.bload.btree_height; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = + ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + return error; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c83cf9e5b55f..c31be570e7d8 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -200,8 +200,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -213,6 +226,12 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) + /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 81919eeabcdb..bb6d980b4fcd 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -157,11 +157,13 @@ xrep_newbt_add_blocks( resv->used = 0; resv->pag = xfs_perag_hold(pag); - ASSERT(xnr->oinfo.oi_offset == 0); + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); - error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); - if (error) - goto out_pag; + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } list_add_tail(&resv->list, &xnr->resv_list); return 0; @@ -171,6 +173,30 @@ out_pag: return error; } +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + /* Don't let our allocation hint take us beyond this AG */ static inline void xrep_newbt_validate_ag_alloc_hint( @@ -372,6 +398,7 @@ xrep_newbt_free_extent( free_aglen, xnr->oinfo.oi_owner); ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); /* * Use EFIs to free the reservations. This reduces the chance @@ -517,3 +544,16 @@ xrep_newbt_claim_block( /* Relog all the EFIs. */ return xrep_defer_finish(xnr->sc); } + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index d2baffa17b1a..89f8e3970b1f 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -57,9 +57,12 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, int whichfork, const struct xfs_owner_info *oinfo); int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len); void xrep_newbt_cancel(struct xrep_newbt *xnr); int xrep_newbt_commit(struct xrep_newbt *xnr); int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, union xfs_btree_ptr *ptr); +unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr); #endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..01b7e8d1a58b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -734,3 +734,75 @@ xrep_ino_dqattach( return error; } + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp, XFS_BTNUM_INO); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..bc3353ecae8a 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -60,6 +60,15 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +/* Repair setup functions */ +int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); + /* Metadata repairers */ int xrep_probe(struct xfs_scrub *sc); @@ -67,6 +76,9 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); + +int xrep_reinit_pagf(struct xfs_scrub *sc); #else @@ -87,11 +99,23 @@ xrep_calc_ag_resblks( return 0; } +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing + +#define xrep_revalidate_allocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 31fabae588be..ebc3b68a8ffb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -239,13 +239,15 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_allocbt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_allocbt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, @@ -531,7 +533,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,8 +547,7 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { + if (xchk_could_repair(sc)) { bool needs_fix = xchk_needs_repair(sc->sm); /* Userspace asked us to rebuild the structure regardless. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a6a1bea4d62b..5f934a2a4cb9 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index aa7683075319..ea518712efa8 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1172,11 +1172,33 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +TRACE_EVENT(xrep_abt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + TRACE_EVENT(xrep_refcount_extent_fn, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_refcount_irec *irec), diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..62b9c506fdd1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. */ typedef cmp_func_t xfarray_cmp_fn; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..2ccde32c9a9e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? */ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ From dbfbf3bdf639a20da7d5fb390cd2e197d25aa418 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:32 -0800 Subject: [PATCH 0792/1562] xfs: repair inode btrees Use the rmapbt to find inode chunks, query the chunks to compute hole and free masks, and with that information rebuild the inobt and finobt. Refer to the case study in Documentation/filesystems/xfs-online-fsck-design.rst for more details. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ialloc.c | 31 +- fs/xfs/libxfs/xfs_ialloc.h | 3 +- fs/xfs/scrub/common.c | 1 + fs/xfs/scrub/ialloc.c | 2 +- fs/xfs/scrub/ialloc_repair.c | 884 +++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.c | 59 +++ fs/xfs/scrub/repair.h | 17 + fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/scrub.h | 1 + fs/xfs/scrub/trace.h | 68 +-- 11 files changed, 1022 insertions(+), 51 deletions(-) create mode 100644 fs/xfs/scrub/ialloc_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3af3cadc1ca1..8758abdcbb20 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -183,6 +183,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ + ialloc_repair.o \ newbt.o \ reap.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d61d03e5b853..2361a22035b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +125,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -164,7 +169,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -2740,7 +2745,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 23944fcc1a6c..e0d6d8c9f640 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -604,6 +604,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 83d9a29ce91e..a720fc62262a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..b3f7182dd2f5 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. + * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. + */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. */ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. */ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. */ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. + */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, + XFS_BTNUM_INO); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, + &ri->new_finobt.afake, XFS_BTNUM_FINO); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. + */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 01b7e8d1a58b..a604f0cea8c1 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -806,3 +806,62 @@ xrep_reinit_pagf( return 0; } + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index bc3353ecae8a..05bd55430e6e 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -59,6 +59,7 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -68,6 +69,7 @@ void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); /* Metadata revalidators */ int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -77,8 +79,10 @@ int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); #else @@ -99,6 +103,17 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + /* repair setup functions for no-repair */ static inline int xrep_setup_nothing( @@ -109,6 +124,7 @@ xrep_setup_nothing( #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported @@ -116,6 +132,7 @@ xrep_setup_nothing( #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index ebc3b68a8ffb..02ddfddfbed4 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -253,14 +253,16 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_iallocbt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5f934a2a4cb9..7fc50654c4fe 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -121,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index ea518712efa8..c60f76231f0c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -106,6 +106,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } DECLARE_EVENT_CLASS(xchk_class, @@ -1172,7 +1173,7 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); @@ -1199,6 +1200,38 @@ TRACE_EVENT(xrep_abt_found, __entry->blockcount) ) +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + TRACE_EVENT(xrep_refcount_extent_fn, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_refcount_irec *irec), @@ -1321,39 +1354,6 @@ TRACE_EVENT(xrep_reset_counters, MAJOR(__entry->dev), MINOR(__entry->dev)) ) -TRACE_EVENT(xrep_ialloc_insert, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; - ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) -) - DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, From d5aa62de1efe0fb8c52acf7103808048ddd38767 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:33 -0800 Subject: [PATCH 0793/1562] xfs: disable online repair quota helpers when quota not enabled Don't compile the quota helper functions if quota isn't being built into the XFS module. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.c | 2 ++ fs/xfs/scrub/repair.h | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index a604f0cea8c1..b4e7c4ad779f 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -673,6 +673,7 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -734,6 +735,7 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ /* * Initialize all the btree cursors for an AG repair except for the btree that diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index cc7ea3942729..93814acc678a 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -57,8 +57,15 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ @@ -87,6 +94,8 @@ int xrep_reinit_pagi(struct xfs_scrub *sc); #else +#define xrep_ino_dqattach(sc) (0) + static inline int xrep_attempt( struct xfs_scrub *sc, From 9099cd38002f8029c9a1da08e6832d1cd18e8451 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:33 -0800 Subject: [PATCH 0794/1562] xfs: repair refcount btrees Reconstruct the refcount data from the rmap btree. Link: https://docs.kernel.org/filesystems/xfs-online-fsck-design.html#case-study-rebuilding-the-space-reference-counts Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ag.h | 1 + fs/xfs/libxfs/xfs_btree.c | 26 + fs/xfs/libxfs/xfs_btree.h | 2 + fs/xfs/libxfs/xfs_refcount.c | 8 +- fs/xfs/libxfs/xfs_refcount.h | 2 +- fs/xfs/libxfs/xfs_refcount_btree.c | 13 +- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/scrub/refcount_repair.c | 794 +++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 22 +- 12 files changed, 856 insertions(+), 19 deletions(-) create mode 100644 fs/xfs/scrub/refcount_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 8758abdcbb20..7e1df6fdaaad 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -186,6 +186,7 @@ xfs-y += $(addprefix scrub/, \ ialloc_repair.o \ newbt.o \ reap.o \ + refcount_repair.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index f16cb7a174d4..67c3260ee789 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -87,6 +87,7 @@ struct xfs_perag { * verifiers while rebuilding the AG btrees. */ uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_refcount_level; #endif spinlock_t pag_state_lock; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c100e92140be..ea8d3659df20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. */ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index e0875cec4939..d906324e25c8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -738,4 +738,6 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 5b039cd022e0..3a9f22d94444 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. */ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -179,7 +177,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..5c207f1c619c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 3fa795e2488d..0d80bd99147c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -226,7 +226,18 @@ xfs_refcountbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..bf22f245bbfa 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -441,7 +441,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..f38fccc42a20 --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... */ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. */ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 05bd55430e6e..cc7ea3942729 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -80,6 +80,7 @@ int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -133,6 +134,7 @@ xrep_setup_nothing( #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 02ddfddfbed4..6ff4dc57095f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -276,7 +276,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c60f76231f0c..3f7af4430951 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1232,27 +1232,29 @@ TRACE_EVENT(xrep_ibt_found, __entry->freemask) ) -TRACE_EVENT(xrep_refcount_extent_fn, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) From 259ba1d36f559653390c0e9dbdee5c4ffc28bb29 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:34 -0800 Subject: [PATCH 0795/1562] xfs: try to attach dquots to files before repairing them Inode resource usage is tracked in the quota metadata. Repairing a file might change the resources used by that file, which means that we need to attach dquots to the file that we're examining before accessing anything in the file protected by the ILOCK. However, there's a twist: a dquot cache miss requires the dquot to be read in from the quota file, during which we drop the ILOCK on the file being examined. This means that we *must* try to attach the dquots before taking the ILOCK. Therefore, dquots must be attached to files in the scrub setup function. If doing so yields corruption errors (or unknown dquot errors), we instead clear the quotachecked status, which will cause a quotacheck on next mount. A future series will make this trigger live quotacheck. While we're here, change the xrep_ino_dqattach function to use the unlocked dqattach functions so that we avoid cycling the ILOCK if the inode already has dquots attached. This makes the naming and locking requirements consistent with the rest of the filesystem. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 4 ++++ fs/xfs/scrub/common.c | 25 +++++++++++++++++++++++++ fs/xfs/scrub/common.h | 6 ++++++ fs/xfs/scrub/inode.c | 4 ++++ fs/xfs/scrub/repair.c | 13 ++++++++----- fs/xfs/scrub/rtbitmap.c | 4 ++++ fs/xfs/scrub/rtsummary.c | 4 ++++ 7 files changed, 55 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 06d8c1996a33..f74bd2a97c7f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -78,6 +78,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index e0d6d8c9f640..bff0a374fb1b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -819,6 +819,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -1030,6 +1050,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c31be570e7d8..c69cacb0b696 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index b7a93380a1ab..7e97db8255c6 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -39,6 +39,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index b4e7c4ad779f..021f6ec72e87 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -700,10 +700,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -711,7 +711,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 41a1d89ae8e6..d509a08d3fc3 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -32,6 +32,10 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); return 0; } diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 8b15c47408d0..f94800a029f3 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -63,6 +63,10 @@ xchk_setup_rtsummary( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. We must be * careful to unlock it ourselves when we are done with the rtbitmap From 576d30ecb620ae3bc156dfb2a4e91143e7f3256d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:34 -0800 Subject: [PATCH 0796/1562] xfs: add missing nrext64 inode flag check to scrub Add this missing check that the superblock nrext64 flag is set if the inode flag is set. Fixes: 9b7d16e34bbeb ("xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 7e97db8255c6..6c40f3e020ea 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -342,6 +342,10 @@ xchk_inode_flags2( if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; + /* no large extent counts without the filesystem feature */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); From 6b5d917780219d0d8f8e2cefefcb6f50987d0fa3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:35 -0800 Subject: [PATCH 0797/1562] xfs: dont cast to char * for XFS_DFORK_*PTR macros Code in the next patch will assign the return value of XFS_DFORK_*PTR macros to a struct pointer. gcc complains about casting char* strings to struct pointers, so let's fix the macro's cast to void* to shut up the warnings. While we're at it, fix one of the scrub tests that uses PTR to use BOFF instead for a simpler integer comparison, since other linters whine about char* and void* comparisons. Can't satisfy all these dman bots. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 2 +- fs/xfs/scrub/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9a88aba1589f..f16974126ff9 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6c40f3e020ea..a81f070b0cd2 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -556,7 +556,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); From d9041681dd2f5334529a68868c9266631c384de4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:35 -0800 Subject: [PATCH 0798/1562] xfs: set inode sick state flags when we zap either ondisk fork In a few patches, we'll add some online repair code that tries to massage the ondisk inode record just enough to get it to pass the inode verifiers so that we can continue with more file repairs. Part of that massaging can include zapping the ondisk forks to clear errors. After that point, the bmap fork repair functions will rebuild the zapped forks. Christoph asked for stronger protections against online repair zapping a fork to get the inode to load vs. other threads trying to access the partially repaired file. Do this by adding a special "[DA]FORK_ZAPPED" inode health flag whenever repair zaps a fork, and sprinkling checks for that flag into the various file operations for things that don't like handling an unexpected zero-extents fork. In practice xfs_scrub will scrub and fix the forks almost immediately after zapping them, so the window is very small. However, if a crash or unmount should occur, we can still detect these zapped inode forks by looking for a zero-extents fork when data was expected. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_health.h | 10 ++++++++++ fs/xfs/scrub/bmap.c | 39 ++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/common.c | 2 ++ fs/xfs/scrub/dir.c | 16 +++++++++++++--- fs/xfs/scrub/health.c | 32 +++++++++++++++++++++++++++++++ fs/xfs/scrub/health.h | 2 ++ fs/xfs/scrub/symlink.c | 20 ++++++++++++++----- fs/xfs/xfs_dir2_readdir.c | 3 +++ fs/xfs/xfs_health.c | 8 ++++++-- fs/xfs/xfs_inode.c | 35 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 2 ++ fs/xfs/xfs_symlink.c | 3 +++ fs/xfs/xfs_xattr.c | 6 ++++++ 13 files changed, 166 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..6296993ff8f3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -68,6 +68,11 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -97,6 +102,11 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + /* These functions must be provided by the xfs implementation. */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f74bd2a97c7f..1487aaf3d95f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -943,7 +945,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -951,7 +966,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. + */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bff0a374fb1b..f0207e71e5dc 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1160,6 +1160,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1177,6 +1178,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..b366fab699ac 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,10 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..df716da11226 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -117,6 +117,38 @@ xchk_health_mask_for_scrub_type( return type_to_health_flag[scrub_type].sick_mask; } +/* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + /* * Update filesystem health assessments based on what we found and did. * diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..a731b2467399 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..60643d791d4a 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,10 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,13 +43,19 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ @@ -55,15 +63,17 @@ xchk_symlink( if (len > xfs_inode_data_fork_size(ip) || len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..57f42c2af0a3 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -519,6 +520,8 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..9a57afee9338 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -222,7 +222,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -246,7 +246,7 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); @@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0f1c89786c2..ea6b277485a4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,6 +37,7 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; @@ -661,6 +662,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -978,6 +981,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1217,6 +1222,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -2506,6 +2513,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -3758,3 +3767,29 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? */ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..97f63bacd4c2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..7c713727f7fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_health.h" /* ----- Kernel only functions below ----- */ int @@ -108,6 +109,8 @@ xfs_readlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. */ From 2d295fe65776d15c06d53dbe3064f62e036e7c46 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:36 -0800 Subject: [PATCH 0799/1562] xfs: repair inode records If an inode is so badly damaged that it cannot be loaded into the cache, fix the ondisk metadata and try again. If there /is/ a cached inode, fix any problems and apply any optimizations that can be solved incore. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/inode.c | 6 +- fs/xfs/scrub/inode_repair.c | 820 ++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.c | 42 ++ fs/xfs/scrub/repair.h | 20 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 129 ++++++ 7 files changed, 1018 insertions(+), 2 deletions(-) create mode 100644 fs/xfs/scrub/inode_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7e1df6fdaaad..561ab59b9422 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -184,6 +184,7 @@ xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ ialloc_repair.o \ + inode_repair.o \ newbt.o \ reap.o \ refcount_repair.o \ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index a81f070b0cd2..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -25,6 +25,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -185,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. */ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..f88d282fdfa5 --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_rmap.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier. The xrep_inode_* functions + * fix things on live incore inodes. The inode repair functions make decisions + * with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to regular file + * that only root can read. This file may not actually contain user data, + * if the file was not previously a regular file. Setuid and setgid bits + * are cleared. + * + * - Zero-size directories can be truncated to look empty. It is necessary to + * run the bmapbtd and directory repair functions to fully rebuild the + * directory. + * + * - Zero-size symbolic link targets can be truncated to '?'. It is necessary + * to run the bmapbtd and symlink repair functions to salvage the symlink. + * + * - Invalid extent size hints will be removed. + * + * - Quotacheck will be scheduled if we repaired an inode that was so badly + * damaged that the ondisk inode had to be rebuilt. + * + * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. + * Setuid and setgid bits are cleared. + */ + +/* + * All the information we need to repair the ondisk inode if we can't iget the + * incore inode. We don't allocate this buffer unless we're going to perform + * a repair to the ondisk inode cluster buffer. + */ +struct xrep_inode { + /* Inode mapping that we saved from the initial lookup attempt. */ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. + */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* Turn di_mode into /something/ recognizable. */ +STATIC void +xrep_dinode_mode( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return; + + /* bad mode, so we set it to a file that only root can read */ + mode = S_IFREG; + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. + */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. */ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + xrep_dinode_mode(sc, dip); + xrep_dinode_flags(sc, dip); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. */ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. */ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. */ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. + */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + return xrep_defer_finish(sc); +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 021f6ec72e87..25392dea326d 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -176,6 +176,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -740,6 +750,38 @@ xrep_ino_dqattach( } #endif /* CONFIG_XFS_QUOTA */ +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. + */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + /* * Initialize all the btree cursors for an AG repair except for the btree that * we're rebuilding. diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 93814acc678a..a513b84f5330 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -30,11 +30,22 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; @@ -66,11 +77,16 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); # define xrep_ino_dqattach(sc) (0) #endif /* CONFIG_XFS_QUOTA */ +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); /* Metadata revalidators */ @@ -88,6 +104,7 @@ int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -133,6 +150,8 @@ xrep_setup_nothing( } #define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_inode(sc, imap) ((void)0) + #define xrep_revalidate_allocbt (NULL) #define xrep_revalidate_iallocbt (NULL) @@ -144,6 +163,7 @@ xrep_setup_nothing( #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported #define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6ff4dc57095f..7e903a0fde6c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -282,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3f7af4430951..6041c716242a 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1393,6 +1393,135 @@ DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + __field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) +) + +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + __entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) +) + +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From e744cef206055954517648070d2b3aaa3d2515ba Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:37 -0800 Subject: [PATCH 0800/1562] xfs: zap broken inode forks Determine if inode fork damage is responsible for the inode being unable to pass the ifork verifiers in xfs_iget and zap the fork contents if this is true. Once this is done the fork will be empty but we'll be able to construct an in-core inode, and a subsequent call to the inode fork repair ioctl will search the rmapbt to rebuild the records that were in the fork. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_attr_leaf.c | 13 +- fs/xfs/libxfs/xfs_attr_leaf.h | 3 +- fs/xfs/libxfs/xfs_bmap.c | 22 +- fs/xfs/libxfs/xfs_bmap.h | 2 + fs/xfs/libxfs/xfs_dir2_priv.h | 3 +- fs/xfs/libxfs/xfs_dir2_sf.c | 13 +- fs/xfs/libxfs/xfs_inode_fork.c | 33 +- fs/xfs/libxfs/xfs_shared.h | 2 +- fs/xfs/libxfs/xfs_symlink_remote.c | 8 +- fs/xfs/scrub/inode_repair.c | 713 ++++++++++++++++++++++++++++- fs/xfs/scrub/trace.h | 42 ++ 11 files changed, 808 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 654e17e6610d..5d1ab4978f32 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1040,23 +1040,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_shortform *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; struct xfs_attr_sf_entry *sfep; struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..ce6743463c86 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -56,7 +56,8 @@ int xfs_attr_sf_findname(struct xfs_da_args *args, unsigned int *basep); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_shortform *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e308d2f44a3c..a073ca877ced 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6168,19 +6168,18 @@ xfs_bmap_finish_one( return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. */ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; @@ -6210,3 +6209,14 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..8518324db285 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -263,6 +263,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..870ef1d1ebe4 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -707,11 +707,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +718,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..dad8ea832c20 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -702,12 +702,22 @@ xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp; + + sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_u1.if_data, + ifp->if_bytes); break; + } default: break; } @@ -729,11 +739,20 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_attr_shortform *sfp; + struct xfs_ifork *ifp; + int64_t size; + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); + sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + size = ifp->if_bytes; + + fa = xfs_attr_shortform_verify(sfp, size); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", ifp->if_u1.if_data, ifp->if_bytes, fa); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..4220d3584c1b 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..3c96d1d617fb 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -202,15 +202,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index f88d282fdfa5..66949cc3d7cc 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -22,8 +22,11 @@ #include "xfs_ialloc.h" #include "xfs_da_format.h" #include "xfs_reflink.h" +#include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_rmap_btree.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" @@ -31,6 +34,8 @@ #include "xfs_quota.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" #include "xfs_health.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" @@ -71,6 +76,16 @@ * * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. */ /* @@ -84,8 +99,33 @@ struct xrep_inode { struct xfs_scrub *sc; + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + /* Sick state to set after zapping parts of the inode. */ unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; }; /* @@ -190,9 +230,10 @@ xrep_dinode_header( /* Turn di_mode into /something/ recognizable. */ STATIC void xrep_dinode_mode( - struct xfs_scrub *sc, + struct xrep_inode *ri, struct xfs_dinode *dip) { + struct xfs_scrub *sc = ri->sc; uint16_t mode = be16_to_cpu(dip->di_mode); trace_xrep_dinode_mode(sc, dip); @@ -205,13 +246,15 @@ xrep_dinode_mode( dip->di_mode = cpu_to_be16(mode); dip->di_uid = 0; dip->di_gid = 0; + ri->zap_acls = true; } /* Fix any conflicting flags that the verifiers complain about. */ STATIC void xrep_dinode_flags( struct xfs_scrub *sc, - struct xfs_dinode *dip) + struct xfs_dinode *dip, + bool isrt) { struct xfs_mount *mp = sc->mp; uint64_t flags2 = be64_to_cpu(dip->di_flags2); @@ -220,6 +263,11 @@ xrep_dinode_flags( trace_xrep_dinode_flags(sc, dip); + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + /* * For regular files on a reflink filesystem, set the REFLINK flag to * protect shared extents. A later stage will actually check those @@ -377,6 +425,657 @@ xrep_dinode_extsize_hints( } } +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. */ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->hdr.totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. + */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. + */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. + */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ STATIC int xrep_dinode_core( @@ -389,6 +1088,11 @@ xrep_dinode_core( int error; int iget_error; + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + /* Read the inode cluster buffer. */ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, @@ -403,10 +1107,11 @@ xrep_dinode_core( /* Fix everything the verifier will complain about. */ dip = xfs_buf_offset(bp, ri->imap.im_boffset); xrep_dinode_header(sc, dip); - xrep_dinode_mode(sc, dip); - xrep_dinode_flags(sc, dip); + xrep_dinode_mode(ri, dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); /* Write out the inode. */ trace_xrep_dinode_fixed(sc, dip); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 6041c716242a..120faa4dce2d 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1469,6 +1469,10 @@ DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); DECLARE_EVENT_CLASS(xrep_inode_class, TP_PROTO(struct xfs_scrub *sc), @@ -1522,6 +1526,44 @@ DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From 6c7289528d3c91855d78c56bb35fa360ed9a40bd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:37 -0800 Subject: [PATCH 0801/1562] xfs: abort directory parent scrub scans if we encounter a zapped directory In a previous patch, we added some code to perform sufficient repairs to an ondisk inode record such that the inode cache would be willing to load the inode. If the broken inode was a shortform directory, it will reset the directory to something plausible, which is to say an empty subdirectory of the root. The telltale signs that something is seriously wrong is the broken link count. Such directories look clean, but they shouldn't participate in a filesystem scan to find or confirm a directory parent pointer. Create a predicate that identifies such directories and abort the scrub. Found by fuzzing xfs/1554 with multithreaded xfs_scrub enabled and u3.bmx[0].startblock = zeroes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 1 + fs/xfs/scrub/common.h | 2 ++ fs/xfs/scrub/dir.c | 26 ++++++++++++++++++++++++++ fs/xfs/scrub/parent.c | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index f0207e71e5dc..81f2b96bb5a7 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c69cacb0b696..ec5755266259 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -198,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index b366fab699ac..d86ab51af928 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -798,3 +798,29 @@ xchk_directory( xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); return 0; } + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. + * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; +} diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..7db873672146 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -156,6 +156,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -217,6 +227,13 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. + */ + return 0; + } return error; } From 66da11280f7ecd77abd999c469efc0dd643f26f5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:38 -0800 Subject: [PATCH 0802/1562] xfs: reintroduce reaping of file metadata blocks to xrep_reap_extents Back in commit a55e07308831b ("xfs: only allow reaping of per-AG blocks in xrep_reap_extents"), we removed from the reaping code the ability to handle bmbt blocks. At the time, the reaping code only walked single blocks, didn't correctly detect crosslinked blocks, and the special casing made the function hard to understand. It was easier to remove unneeded functionality prior to fixing all the bugs. Now that we've fixed the problems, we want again the ability to reap file metadata blocks. Reintroduce the per-file reaping functionality atop the current implementation. We require that sc->sa is uninitialized, so that we can use it to hold all the per-AG context for a given extent. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/fsb_bitmap.h | 37 ++++++++++++ fs/xfs/scrub/reap.c | 121 ++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/reap.h | 5 ++ fs/xfs/scrub/repair.h | 1 + 4 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 fs/xfs/scrub/fsb_bitmap.h diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index bfc3583132ac..0d2e32fbb51a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -38,6 +38,7 @@ #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -75,10 +76,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. */ @@ -501,3 +502,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index a513b84f5330..d4ef740c878f 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -48,6 +48,7 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); From c3a22c2e4b45fcf3184e7dd1c755e6b45dc9f499 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:38 -0800 Subject: [PATCH 0803/1562] xfs: skip the rmapbt search on an empty attr fork unless we know it was zapped The attribute fork scrubber can optionally scan the reverse mapping records of the filesystem to determine if the fork is missing mappings that it should have. However, this is a very expensive operation, so we only want to do this if we suspect that the fork is missing records. For attribute forks the criteria for suspicion is that the attr fork is in EXTENTS format and has zero extents. However, there are several ways that a file can end up in this state through regular filesystem usage. For example, an LSM can set a s_security hook but then decide not to set an ACL; or an attr set can create the attr fork but then the actual set operation fails with ENOSPC; or we can delete all the attrs on a file whose data fork is in btree format, in which case we do not delete the attr fork. We don't want to run the expensive check for any case that can be arrived at through regular operations. However. When online inode repair decides to zap an attribute fork, it cannot determine if it is zapping ACL information. As a precaution it removes all the discretionary access control permissions and sets the user and group ids to zero. Check these three additional conditions to decide if we want to scan the rmap records. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 101 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 22 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1487aaf3d95f..8175e8c17c14 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -638,6 +638,82 @@ xchk_bmap_check_ag_rmaps( return error; } +/* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. + */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + /* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. @@ -647,7 +723,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -656,28 +731,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } - - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ From 8f71bede8efd820627ac05c19eac2758214bc896 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:39 -0800 Subject: [PATCH 0804/1562] xfs: repair inode fork block mapping data structures Use the reverse-mapping btree information to rebuild an inode block map. Update the btree bulk loading code as necessary to support inode rooted btrees and fix some bitrot problems. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_bmap_btree.c | 121 ++++- fs/xfs/libxfs/xfs_bmap_btree.h | 5 + fs/xfs/libxfs/xfs_btree_staging.c | 11 +- fs/xfs/libxfs/xfs_btree_staging.h | 2 +- fs/xfs/libxfs/xfs_iext_tree.c | 23 +- fs/xfs/libxfs/xfs_inode_fork.c | 1 + fs/xfs/libxfs/xfs_inode_fork.h | 3 + fs/xfs/scrub/bmap.c | 18 + fs/xfs/scrub/bmap_repair.c | 858 ++++++++++++++++++++++++++++++ fs/xfs/scrub/common.h | 6 +- fs/xfs/scrub/repair.c | 28 + fs/xfs/scrub/repair.h | 6 + fs/xfs/scrub/scrub.c | 4 +- fs/xfs/scrub/trace.h | 34 +- fs/xfs/xfs_trans.c | 62 +++ fs/xfs/xfs_trans.h | 4 + 17 files changed, 1153 insertions(+), 34 deletions(-) create mode 100644 fs/xfs/scrub/bmap_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 561ab59b9422..66c1a5001772 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -183,6 +183,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ + bmap_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8360256cff16..71f2d50f7823 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -/* - * Allocate a new bmap btree cursor. - */ -struct xfs_btree_cur * /* new bmap btree cursor */ -xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ +static struct xfs_btree_cur * +xfs_bmbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -567,10 +558,30 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; + + return cur; +} + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * +xfs_bmbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.whichfork = whichfork; return cur; @@ -587,6 +598,76 @@ xfs_bmbt_block_maxrecs( return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); } +/* + * Allocate a new bmap btree cursor for reloading an inode block mapping data + * structure. Note that callers can use the staged cursor to reload extents + * format inode forks if they rebuild the iext tree and commit the staged + * cursor immediately. + */ +struct xfs_btree_cur * +xfs_bmbt_stage_cursor( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake) +{ + struct xfs_btree_cur *cur; + struct xfs_btree_ops *ops; + + /* data fork always has larger maxheight */ + cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); + cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; + + /* Don't let anyone think we're attached to the real fork yet. */ + cur->bc_ino.whichfork = -1; + xfs_btree_stage_ifakeroot(cur, ifake, &ops); + ops->update_cursor = NULL; + return cur; +} + +/* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); +} + /* * Calculate number of records in a bmap btree block. */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..151b8491f60e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. @@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, + struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 0c978a31e284..e276eba87cb1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -405,7 +405,7 @@ xfs_btree_bload_prep_block( ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; @@ -596,7 +596,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. + */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0a5007284ef..055ea43b1e18 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -53,7 +53,7 @@ typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..d062794cc795 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, - int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +660,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index dad8ea832c20..b86d57589f67 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -520,6 +520,7 @@ xfs_idata_realloc( ifp->if_bytes = new_size; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..535be5c03689 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -180,6 +180,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 8175e8c17c14..b169cddde6da 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -50,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -73,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..a8d6415b1c38 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* What d the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; +}; + +/* Is this space extent shared? Flag the inode if it is. */ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. + */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. + */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. + */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +STATIC int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index ec5755266259..da09580b454a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -239,7 +239,11 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); (sc)->mp->m_super->s_id, \ (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ ##__VA_ARGS__) - +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 25392dea326d..26d65175ae8b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -883,6 +883,34 @@ xrep_reinit_pagi( return 0; } +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. + */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + /* Reinitialize the per-AG block reservation for the AG we just fixed. */ int xrep_reset_perag_resv( diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index d4ef740c878f..8aa8b8889e39 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -89,6 +89,8 @@ struct xfs_imap; int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); /* Metadata revalidators */ @@ -106,6 +108,8 @@ int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -165,6 +169,8 @@ xrep_setup_nothing( #define xrep_iallocbt xrep_notsupported #define xrep_refcountbt xrep_notsupported #define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7e903a0fde6c..238ead205c52 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -288,13 +288,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 120faa4dce2d..d6a1f46cf6e9 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1175,7 +1175,7 @@ DEFINE_EVENT(xrep_rmap_class, name, \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1260,6 +1260,38 @@ TRACE_EVENT(xrep_refc_found, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 305c9d07bf1b..12d45e93f07d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1236,6 +1236,68 @@ out_cancel: return error; } +/* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. + */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + /* * Allocate an transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2cb1e143fc49..08ce757c7454 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -164,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -248,6 +250,8 @@ struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, From 48a72f60861f790dd7c2db04d69c3a1ce606984e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:39 -0800 Subject: [PATCH 0805/1562] xfs: refactor repair forcing tests into a repair.c helper There are a couple of conditions that userspace can set to force repairs of metadata. These really belong in the repair code and not open-coded into the check code, so refactor them into a helper. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.c | 22 ++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 ++ fs/xfs/scrub/scrub.c | 14 +------------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 26d65175ae8b..020d49b0f9b9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,6 +27,8 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -940,3 +942,23 @@ xrep_reset_perag_resv( out: return error; } + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 8aa8b8889e39..51ba3df5998c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,6 +28,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); int xrep_roll_trans(struct xfs_scrub *sc); @@ -117,6 +118,7 @@ int xrep_reinit_pagi(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 238ead205c52..c9e37c688089 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,8 +14,6 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -550,21 +548,11 @@ retry_op: xchk_update_health(sc); if (xchk_could_repair(sc)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } From d12bf8bac87a0d93e6e5fab67f399d1e3d3d5767 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:40 -0800 Subject: [PATCH 0806/1562] xfs: create a ranged query function for refcount btrees Implement ranged queries for refcount records. The next patch will use this to scan refcount data. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 41 ++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_refcount.h | 10 +++++++++ 2 files changed, 51 insertions(+) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3a9f22d94444..6709a7f8bad5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -2031,6 +2031,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. */ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); +} + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 5c207f1c619c..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ From dbbdbd0086320a026903ca34efedb6abf55230ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:40 -0800 Subject: [PATCH 0807/1562] xfs: repair problems in CoW forks Try to repair errors that we see in file CoW forks so that we don't do stupid things like remap garbage into a file. There's not a lot we can do with the COW fork -- the ondisk metadata record only that the COW staging extents are owned by the refcount btree, which effectively means that we can't reconstruct this incore structure from scratch. Actually, this is even worse -- we can't touch written extents, because those map space that are actively under writeback, and there's not much to do with delalloc reservations. Hence we can only detect crosslinked unwritten extents and fix them by punching out the problematic parts and replacing them with delalloc extents. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/cow_repair.c | 614 ++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/off_bitmap.h | 37 +++ fs/xfs/scrub/reap.c | 32 ++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 84 ++++++ 7 files changed, 771 insertions(+), 1 deletion(-) create mode 100644 fs/xfs/scrub/cow_repair.c create mode 100644 fs/xfs/scrub/off_bitmap.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 66c1a5001772..a7830df42c4e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -184,6 +184,7 @@ xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ bmap_repair.o \ + cow_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..1e82c727af8e --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. + */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. + */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. + */ +STATIC int +xrep_cow_mark_missing_staging_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + xfs_fsblock_t fsbno; + xfs_agblock_t rec_bno; + xfs_extlen_t rec_len; + unsigned int adj; + + if (rec->rm_owner == XFS_RMAP_OWN_COW) + return 0; + + rec_bno = rec->rm_startblock; + rec_len = rec->rm_blockcount; + if (rec_bno < xc->irec_startbno) { + adj = xc->irec_startbno - rec_bno; + rec_len -= adj; + rec_bno += adj; + } + + if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { + adj = (rec_bno + rec_len) - + (xc->irec_startbno + xc->irec.br_blockcount); + rec_len -= adj; + } + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); + return xrep_cow_mark_file_range(xc, fsbno, rec_len); +} + +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_perag *pag; + struct xfs_scrub *sc = xc->sc; + xfs_agnumber_t agno; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); + xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); + + pag = xfs_perag_get(sc->mp, agno); + if (!pag) + return -EFSCORRUPTED; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + goto out_pag; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sa; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sa; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, + xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sa; + } + + /* Mark any area has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sa; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone turned + * on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + return error; + } + +out_sa: + xchk_ag_free(sc, &sc->sa); +out_pag: + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate a replacement CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @rep. + */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record. + */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuid. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. + */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kmem_free(xc); + return error; +} diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h new file mode 100644 index 000000000000..0d3f9e6c1aad --- /dev/null +++ b/fs/xfs/scrub/off_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_OFF_BITMAP_H__ +#define __XFS_SCRUB_OFF_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fileoff_t */ + +struct xoff_bitmap { + struct xbitmap64 offbitmap; +}; + +static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->offbitmap); +} + +static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->offbitmap); +} + +static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap, + xfs_fileoff_t off, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->offbitmap, off, len); +} + +static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->offbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 0d2e32fbb51a..f99eca799809 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -380,6 +381,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. + */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -398,6 +410,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, true); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 51ba3df5998c..f89c8f08b037 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -111,6 +111,7 @@ int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -173,6 +174,7 @@ xrep_setup_nothing( #define xrep_inode xrep_notsupported #define xrep_bmap_data xrep_notsupported #define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c9e37c688089..e46b3afb5467 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -298,7 +298,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d6a1f46cf6e9..3d5c8e748955 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1596,6 +1596,90 @@ TRACE_EVENT(xrep_dinode_count_rmaps, __entry->attr_extents) ); +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From 41991cf298919de211c63251d72266aff70ecad0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:41 -0800 Subject: [PATCH 0808/1562] xfs: check rt bitmap file geometry more thoroughly I forgot that the superblock tracks the number of blocks that are in the realtime bitmap, and that the rt bitmap file can have more blocks mapped to the data fork than sb_rbmblocks if growfsrt fails. So. Add to the rtbitmap scrubber an explicit check that sb_rextents and sb_rbmblocks are correct, then adjust the rtbitmap i_size checks to allow for the growfsrt failure case. Finally, flag post-eof blocks in the rtbitmap. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtbitmap.c | 99 ++++++++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 15 deletions(-) diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index d509a08d3fc3..578b935ca93f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,16 +14,30 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; +}; + /* Set us up with the realtime metadata locked. */ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + error = xchk_trans_alloc(sc, 0); if (error) return error; @@ -37,6 +51,17 @@ xchk_setup_rtbitmap( return error; xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -67,21 +92,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -102,12 +136,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. + */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -120,12 +190,11 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ From 04f0c3269b41f28c041980a30514850453ded251 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:41 -0800 Subject: [PATCH 0809/1562] xfs: check rt summary file geometry more thoroughly I forgot that the xfs_mount tracks the size and number of levels in the realtime summary file, and that the rt summary file can have more blocks mapped to the data fork than m_rsumsize implies if growfsrt fails. So. Add to the rtsummary scrubber an explicit check that all the summary geometry values are correct, then adjust the rtsummary i_size checks to allow for the growfsrt failure case. Finally, flag post-eof blocks in the summary file. While we're at it, split the extent map checking so that we only call xfs_bmapi_read once per extent instead of once per rtsummary block. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtsummary.c | 137 +++++++++++++++++++++++++++++++-------- 1 file changed, 110 insertions(+), 27 deletions(-) diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index f94800a029f3..b0d90426a5cb 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -31,6 +31,18 @@ * (potentially large) amount of data in pageable memory. */ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -38,8 +50,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,11 +73,6 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; @@ -75,13 +89,29 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, @@ -192,19 +222,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_rtalloc_args args = { - .mp = sc->mp, - .tp = sc->tp, - }; - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - union xfs_suminfo_raw *ondisk_info; - int error = 0; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -212,8 +252,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -223,24 +262,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtsummary_read_buf(&args, off); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); return error; } - ondisk_info = xfs_rsumblock_infoptr(&args, 0); - if (memcmp(ondisk_info, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -253,8 +301,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) From 20cc0d398e89d1f735c8e2815defc8ba9fdcce3f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:42 -0800 Subject: [PATCH 0810/1562] xfs: always check the rtbitmap and rtsummary files XFS filesystems always have a realtime bitmap and summary file, even if there has never been a realtime volume attached. Always check them. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/scrub.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index e46b3afb5467..a7019c9bba0c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -328,14 +328,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ From 5a8e07e799721ba68dd6d713d4a68598eab3bea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:42 -0800 Subject: [PATCH 0811/1562] xfs: repair the inode core and forks of a metadata inode Add a helper function to repair the core and forks of a metadata inode, so that we can get move onto the task of repairing higher level metadata that lives in an inode. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap_repair.c | 17 ++++- fs/xfs/scrub/repair.c | 153 +++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 + 3 files changed, 168 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index a8d6415b1c38..a4bb89fdd510 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -86,6 +86,9 @@ struct xrep_bmap { /* What d the REFLINK flag be set when the repair is over? */ enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; }; /* Is this space extent shared? Flag the inode if it is. */ @@ -262,6 +265,10 @@ xrep_bmap_walk_rmap( !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) return 0; + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); @@ -780,10 +787,11 @@ xrep_bmap_init_reflink_scan( } /* Repair an inode fork. */ -STATIC int +int xrep_bmap( struct xfs_scrub *sc, - int whichfork) + int whichfork, + bool allow_unwritten) { struct xrep_bmap *rb; char *descr; @@ -803,6 +811,7 @@ xrep_bmap( rb->sc = sc; rb->whichfork = whichfork; rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; /* Set up enough storage to handle the max records for this fork. */ large_extcount = xfs_has_large_extent_counts(sc->mp); @@ -846,7 +855,7 @@ int xrep_bmap_data( struct xfs_scrub *sc) { - return xrep_bmap(sc, XFS_DATA_FORK); + return xrep_bmap(sc, XFS_DATA_FORK, true); } /* Repair an inode's attr fork. */ @@ -854,5 +863,5 @@ int xrep_bmap_attr( struct xfs_scrub *sc) { - return xrep_bmap(sc, XFS_ATTR_FORK); + return xrep_bmap(sc, XFS_ATTR_FORK, false); } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 020d49b0f9b9..745d5b8f405a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -29,6 +29,7 @@ #include "xfs_defer.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_reflink.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -962,3 +963,155 @@ xrep_will_attempt( return false; } + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f89c8f08b037..f0f9c5194e8d 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -82,6 +82,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); From a59eb5fc21b2a6dc160ee6cdf77f20bc186a88fd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:43 -0800 Subject: [PATCH 0812/1562] xfs: create a new inode fork block unmap helper Create a new helper to unmap blocks from an inode's fork. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 41 +++++++++++++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_bmap.h | 5 ++--- fs/xfs/xfs_inode.c | 24 ++++------------------- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a073ca877ced..523926fe50eb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5239,7 +5239,7 @@ xfs_bmap_del_extent_real( * that value. If not all extents in the block range can be removed then * *done is set. */ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -6220,3 +6220,42 @@ xfs_bmap_validate_extent( return xfs_bmap_validate_extent_raw(ip->i_mount, XFS_IS_REALTIME_INODE(ip), whichfork, irec); } + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. + */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8518324db285..4b83f6148e00 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -190,9 +190,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -273,6 +270,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ea6b277485a4..1ffc8dfa2a52 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -41,12 +41,6 @@ struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). This is the maximum number of extents - * freed from a file in a single transaction. - */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -1346,7 +1340,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1378,19 +1371,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ From ffd37b22bd2b7cca7749c85a0a08268158903e55 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:43 -0800 Subject: [PATCH 0813/1562] xfs: online repair of realtime bitmaps Fix all the file metadata surrounding the realtime bitmap file, which includes the rt geometry, file size, forks, and space mappings. The bitmap contents themselves cannot be fixed without rt rmap, so that will come later. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 4 + fs/xfs/scrub/repair.h | 7 ++ fs/xfs/scrub/rtbitmap.c | 16 +-- fs/xfs/scrub/rtbitmap.h | 22 ++++ fs/xfs/scrub/rtbitmap_repair.c | 202 +++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 2 +- 6 files changed, 245 insertions(+), 8 deletions(-) create mode 100644 fs/xfs/scrub/rtbitmap.h create mode 100644 fs/xfs/scrub/rtbitmap_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a7830df42c4e..e7727451d1c8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -192,5 +192,9 @@ xfs-y += $(addprefix scrub/, \ refcount_repair.o \ repair.o \ ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) endif endif diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f0f9c5194e8d..86cf86037fe0 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -115,6 +115,12 @@ int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -177,6 +183,7 @@ xrep_setup_nothing( #define xrep_bmap_data xrep_notsupported #define xrep_bmap_attr xrep_notsupported #define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 578b935ca93f..441ca9977652 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -17,12 +17,8 @@ #include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" - -struct xchk_rtbitmap { - uint64_t rextents; - uint64_t rbmblocks; - unsigned int rextslog; -}; +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. */ int @@ -38,7 +34,13 @@ xchk_setup_rtbitmap( return -ENOMEM; sc->buf = rtb; - error = xchk_trans_alloc(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. + */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. */ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index a7019c9bba0c..c33480894229 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -328,7 +328,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, From 7d1f0e167a067ed741dec08b7614d76893422b04 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:44 -0800 Subject: [PATCH 0814/1562] xfs: check the ondisk space mapping behind a dquot Each xfs_dquot object caches the file offset and daddr of the ondisk block that backs the dquot. Make sure these cached values are the same as the bmapi data, and that the block state is written. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..59350cd7a325 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -75,6 +76,47 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? */ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( @@ -93,6 +135,17 @@ xchk_quota_item( if (xchk_should_terminate(sc, &error)) return error; + /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. @@ -103,6 +156,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems From 774b5c0a5152892bf5f43ce560f3a814b1fdf3b7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:44 -0800 Subject: [PATCH 0815/1562] xfs: check dquot resource timers For each dquot resource, ensure either (a) the resource usage is over the soft limit and there is a nonzero timer; or (b) usage is at or under the soft limit and the timer is unset. (a) is redundant with the dquot buffer verifier, but (b) isn't checked anywhere. Found by fuzzing xfs/426 and noticing that diskdq.btimer = add didn't trip any kind of warning for having a timer set even with no limits. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 59350cd7a325..49835d2840b4 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -117,6 +117,23 @@ xchk_quota_item_bmap( return 0; } +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( @@ -224,6 +241,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; From 21d7500929c8a0b10e22a6755850c6f9a9280284 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:45 -0800 Subject: [PATCH 0816/1562] xfs: improve dquot iteration for scrub Upon a closer inspection of the quota record scrubber, I noticed that dqiterate wasn't actually walking all possible dquots for the mapped blocks in the quota file. This is due to xfs_qm_dqget_next skipping all XFS_IS_DQUOT_UNINITIALIZED dquots. For a fsck program, we really want to look at all the dquots, even if all counters and limits in the dquot record are zero. Rewrite the implementation to do this, as well as switching to an iterator paradigm to reduce the number of indirect calls. This enables removal of the old broken dqiterate code from xfs_dquot.c. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 5 +- fs/xfs/libxfs/xfs_format.h | 3 + fs/xfs/scrub/dqiterate.c | 211 +++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/quota.c | 25 +++-- fs/xfs/scrub/quota.h | 34 ++++++ fs/xfs/scrub/trace.c | 2 + fs/xfs/scrub/trace.h | 49 +++++++++ fs/xfs/xfs_dquot.c | 31 ------ fs/xfs/xfs_dquot.h | 5 - 9 files changed, 319 insertions(+), 46 deletions(-) create mode 100644 fs/xfs/scrub/dqiterate.c create mode 100644 fs/xfs/scrub/quota.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e7727451d1c8..cdf81eb180e2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -176,7 +176,10 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f16974126ff9..e6ca188e2271 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1272,6 +1272,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. */ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. */ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. + */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 49835d2840b4..1a65a7502527 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ static inline xfs_dqtype_t @@ -137,11 +138,9 @@ xchk_quota_item_timer( /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -270,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -297,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -318,9 +319,15 @@ xchk_quota( * functions. */ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..5056b7766c4a --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 29afa4851235..4641522fd907 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -14,9 +14,11 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3d5c8e748955..83827f3e165b 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -19,6 +19,7 @@ struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -348,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ + TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..60ec401e26ff 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1362,34 +1362,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. - */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..8d9d4b0d979d 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,11 +234,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); From a5b91555403e3a09ae00bed85fc78b60801dda24 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:45 -0800 Subject: [PATCH 0817/1562] xfs: repair quotas Fix anything that causes the quota verifiers to fail. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 4 + fs/xfs/scrub/quota.c | 3 +- fs/xfs/scrub/quota.h | 2 + fs/xfs/scrub/quota_repair.c | 575 ++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 7 + fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 29 ++ fs/xfs/xfs_dquot.c | 6 +- fs/xfs/xfs_dquot.h | 3 + 10 files changed, 628 insertions(+), 8 deletions(-) create mode 100644 fs/xfs/scrub/quota_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index cdf81eb180e2..fbe3cdc79036 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -199,5 +199,9 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + ) endif endif diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 1a65a7502527..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -21,7 +21,7 @@ #include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -328,7 +328,6 @@ xchk_quota( if (error) break; } - xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h index 5056b7766c4a..6c7134ce2385 100644 --- a/fs/xfs/scrub/quota.h +++ b/fs/xfs/scrub/quota.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_QUOTA_H__ #define __XFS_SCRUB_QUOTA_H__ +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + /* dquot iteration code */ struct xchk_dqiter { diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? */ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. */ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. + */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. */ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 86cf86037fe0..17114327e6fa 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -121,6 +121,12 @@ int xrep_rtbitmap(struct xfs_scrub *sc); # define xrep_rtbitmap xrep_notsupported #endif /* CONFIG_XFS_RT */ +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -184,6 +190,7 @@ xrep_setup_nothing( #define xrep_bmap_attr xrep_notsupported #define xrep_bmap_cow xrep_notsupported #define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c33480894229..caf324c2b991 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,19 +340,19 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ .type = ST_FS, diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 4641522fd907..d0e24ffaf754 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -15,6 +15,7 @@ #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_quota.h" +#include "xfs_quota_defs.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 83827f3e165b..6bbb4e8639dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1729,6 +1729,35 @@ TRACE_EVENT(xrep_cow_free_staging, __entry->blockcount) ); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +#endif /* CONFIG_XFS_QUOTA */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 60ec401e26ff..a93ad76f23c5 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +353,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 8d9d4b0d979d..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */ From b1dd019de6f34db7a4ec9ee74cb02493135880b2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Oct 2023 12:38:32 +0100 Subject: [PATCH 0818/1562] btrfs: remove duplicate btrfs_clear_buffer_dirty() prototype from disk-io.h The prototype for btrfs_clear_buffer_dirty() is declared in both disk-io.h and extent_io.h, but the function is defined at extent_io.c. So remove the prototype declaration from disk-io.h. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 50dab8f639dc..e589359e6a68 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -37,8 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, - struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, From 6000d9313f20e6587a9e5506b4ea169ed61ab686 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 19 Oct 2023 12:52:18 +0100 Subject: [PATCH 0819/1562] btrfs: remove log_extents_lock and logged_list from struct btrfs_root The logged_list[2] and log_extents_lock[2] members of struct btrfs_root are no longer used, their last use was removed in commit 5636cf7d6dc8 ("btrfs: remove the logged extents infrastructure"). So remove these fields. This reduces the size of struct btrfs_root, on a release kernel, from 1392 bytes down to 1352 bytes. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.c | 4 ---- 2 files changed, 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 196c005c31f6..99fe28bc013b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -224,9 +224,6 @@ struct btrfs_root { struct list_head root_list; - spinlock_t log_extents_lock[2]; - struct list_head logged_list[2]; - spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62cb97f7c94f..7d8d175d5a59 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -650,14 +650,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); - INIT_LIST_HEAD(&root->logged_list[0]); - INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); - spin_lock_init(&root->log_extents_lock[0]); - spin_lock_init(&root->log_extents_lock[1]); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); From 6e5de50fc5d71e0a5fe2357c067cea752fe375d7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 19 Oct 2023 13:19:29 +0100 Subject: [PATCH 0820/1562] btrfs: use bool for return type of btrfs_block_can_be_shared() Currently btrfs_block_can_be_shared() returns an int that is used as a boolean. Since it all it needs is to return true or false, and it can't return errors for example, change the return type from int to bool to make it a bit more readable and obvious. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 ++++++------ fs/btrfs/ctree.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35c1d24d4a78..1e4d5bd1ec48 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -370,9 +370,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) { /* * Tree blocks not in shareable trees and tree roots are never shared. @@ -385,7 +385,7 @@ int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { if (buf != root->commit_root) - return 1; + return true; /* * An extent buffer that used to be the commit root may still be * shared because the tree height may have increased and it @@ -393,10 +393,10 @@ int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, * snapshotting a subvolume created in the current transaction. */ if (btrfs_header_generation(buf) == trans->transid) - return 1; + return true; } - return 0; + return false; } static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99fe28bc013b..9c0800f5bdcb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -558,9 +558,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, From 80d197fe04e87602be402337854321c59a31acf9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 19 Oct 2023 13:19:30 +0100 Subject: [PATCH 0821/1562] btrfs: make the logic from btrfs_block_can_be_shared() easier to read The logic in btrfs_block_can_be_shared() is hard to follow as we have a lot of conditions in a single if statement including a subexpression with a logical or and two nested if statements inside the main if statement. Make this easier to read by using separate if statements that return immediately when we find a condition that determines if a block can be or can not be shared. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1e4d5bd1ec48..137c4eb24c28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -374,27 +374,35 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + const u64 buf_gen = btrfs_header_generation(buf); + /* * Tree blocks not in shareable trees and tree roots are never shared. * If a block was allocated after the last snapshot and the block was * not allocated by tree relocation, we know the block is not shared. */ - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { - if (buf != root->commit_root) - return true; - /* - * An extent buffer that used to be the commit root may still be - * shared because the tree height may have increased and it - * became a child of a higher level root. This can happen when - * snapshotting a subvolume created in the current transaction. - */ - if (btrfs_header_generation(buf) == trans->transid) - return true; - } + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return false; + + if (buf == root->node) + return false; + + if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return false; + + if (buf != root->commit_root) + return true; + + /* + * An extent buffer that used to be the commit root may still be shared + * because the tree height may have increased and it became a child of a + * higher level root. This can happen when snapshotting a subvolume + * created in the current transaction. + */ + if (buf_gen == trans->transid) + return true; return false; } From 9ba7c686feb04f16088ca4523c204ed49b07fc0a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 31 Oct 2023 07:37:20 +1030 Subject: [PATCH 0822/1562] btrfs: do not utilize goto to implement delayed inode ref deletion [PROBLEM] The function __btrfs_update_delayed_inode() is doing something not meeting the code standard of today: path->slots[0]++ if (path->slots[0] >= btrfs_header_nritems(leaf)) goto search; again: if (!is_the_target_inode_ref()) goto out; ret = btrfs_delete_item(); /* Some cleanup. */ return ret; search: ret = search_for_the_last_inode_ref(); goto again; With the tag named "again", it's pretty common to think it's a loop, but the truth is, we only need to do the search once, to locate the last (also the first, since there should only be one INODE_REF or INODE_EXTREF now) ref of the inode. [FIX] Instead of the weird jumps, just do them in a stream-lined fashion. This removes those weird labels, and add extra comments on why we can do the different searches. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 45 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 7381241334e8..91159dd7355b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1036,14 +1036,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) - goto search; -again: + /* + * Now we're going to delete the INODE_REF/EXTREF, which should be the + * only one ref left. Check if the next item is an INODE_REF/EXTREF. + * + * But if we're the last item already, release and search for the last + * INODE_REF/EXTREF. + */ + if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) { + key.objectid = node->inode_id; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = (u64)-1; + + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret > 0); + ASSERT(path->slots[0] > 0); + ret = 0; + path->slots[0]--; + leaf = path->nodes[0]; + } else { + path->slots[0]++; + } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != node->inode_id) goto out; - if (key.type != BTRFS_INODE_REF_KEY && key.type != BTRFS_INODE_EXTREF_KEY) goto out; @@ -1070,22 +1089,6 @@ err_out: btrfs_abort_transaction(trans, ret); return ret; - -search: - btrfs_release_path(path); - - key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = -1; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto err_out; - ASSERT(ret); - - ret = 0; - leaf = path->nodes[0]; - path->slots[0]--; - goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, From 9ba965dca3b13757e49f98bbea7cf48f07633ff9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Nov 2023 17:59:39 +0100 Subject: [PATCH 0823/1562] btrfs: use page alloc/free wrappers for compression pages This is a preparation for managing compression pages in a cache-like manner, instead of asking the allocator each time. The common allocation and free wrappers are introduced and are functionally equivalent to the current code. The freeing helpers need to be carefully placed where the last reference is dropped. This is either after directly allocating (error handling) or when there are no other users of the pages (after copying the contents). It's safe to not use the helper and use put_page() that will handle the reference count. Not using the helper means there's lower number of pages that could be reused without passing them back to allocator. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 16 +++++++++++++++- fs/btrfs/compression.h | 5 +++++ fs/btrfs/inode.c | 4 ++-- fs/btrfs/lzo.c | 4 ++-- fs/btrfs/zlib.c | 6 +++--- fs/btrfs/zstd.c | 7 +++---- 6 files changed, 30 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 19b22b4653c8..1cd15d6a9c49 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -163,12 +163,26 @@ static int compression_decompress(int type, struct list_head *ws, static void btrfs_free_compressed_pages(struct compressed_bio *cb) { for (unsigned int i = 0; i < cb->nr_pages; i++) - put_page(cb->compressed_pages[i]); + btrfs_free_compr_page(cb->compressed_pages[i]); kfree(cb->compressed_pages); } static int btrfs_decompress_bio(struct compressed_bio *cb); +/* + * Common wrappers for page allocation from compression wrappers + */ +struct page *btrfs_alloc_compr_page(void) +{ + return alloc_page(GFP_NOFS); +} + +void btrfs_free_compr_page(struct page *page) +{ + ASSERT(page_ref_count(page) == 1); + put_page(page); +} + static void end_compressed_bio_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 03bb9d143fa7..93cc92974dee 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 +struct page; + struct compressed_bio { /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +struct page *btrfs_alloc_compr_page(void); +void btrfs_free_compr_page(struct page *page); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb3c3f43c3fa..3305472453f9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1037,7 +1037,7 @@ free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); - put_page(pages[i]); + btrfs_free_compr_page(pages[i]); } kfree(pages); } @@ -1052,7 +1052,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - put_page(async_extent->pages[i]); + btrfs_free_compr_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d3fcfc628a4f..1131d5a29d61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; @@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 6c231a116a29..36cf1f0e338e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5511766485cd..0d66db8bc1d4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); - /* Allocate and map in the output buffer */ - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; From 4cea422a776558ccf84e918205d0c162a516502c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Nov 2023 17:59:41 +0100 Subject: [PATCH 0824/1562] btrfs: use shrinker for compression page pool The pages are now allocated and freed centrally, so we can extend the logic to manage the lifetime. The main idea is to keep a few recently used pages and hand them to all writers. Ideally we won't have to go to allocator at all (a slight performance gain) and also raise chance that we'll have the pages available (slightly increased reliability). In order to avoid gathering too many pages, the shrinker is attached to the cache so we can free them on when MM demands that. The first implementation will drain the whole cache. Further this can be refined to keep some minimal number of pages for emergency purposes. The ultimate goal to avoid memory allocation failures on the write out path from the compression. The pool threshold is set to cover full BTRFS_MAX_COMPRESSED / PAGE_SIZE for minimal thread pool, which is 8 (btrfs_init_fs_info()). This is 128K / 4K * 8 = 256 pages at maximum, which is 1MiB. This is for all filesystems currently mounted, with heavy use of compression IO the allocator is still needed. The cache helps for short burst IO. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 102 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1cd15d6a9c49..05595d113ff8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "misc.h" #include "ctree.h" @@ -169,16 +170,96 @@ static void btrfs_free_compressed_pages(struct compressed_bio *cb) static int btrfs_decompress_bio(struct compressed_bio *cb); +/* + * Global cache of last unused pages for compression/decompression. + */ +static struct btrfs_compr_pool { + struct shrinker *shrinker; + spinlock_t lock; + struct list_head list; + int count; + int thresh; +} compr_pool; + +static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc) +{ + int ret; + + /* + * We must not read the values more than once if 'ret' gets expanded in + * the return statement so we don't accidentally return a negative + * number, even if the first condition finds it positive. + */ + ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh); + + return ret > 0 ? ret : 0; +} + +static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) +{ + struct list_head remove; + struct list_head *tmp, *next; + int freed; + + if (compr_pool.count == 0) + return SHRINK_STOP; + + INIT_LIST_HEAD(&remove); + + /* For now, just simply drain the whole list. */ + spin_lock(&compr_pool.lock); + list_splice_init(&compr_pool.list, &remove); + freed = compr_pool.count; + compr_pool.count = 0; + spin_unlock(&compr_pool.lock); + + list_for_each_safe(tmp, next, &remove) { + struct page *page = list_entry(tmp, struct page, lru); + + ASSERT(page_ref_count(page) == 1); + put_page(page); + } + + return freed; +} + /* * Common wrappers for page allocation from compression wrappers */ struct page *btrfs_alloc_compr_page(void) { + struct page *page = NULL; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > 0) { + page = list_first_entry(&compr_pool.list, struct page, lru); + list_del_init(&page->lru); + compr_pool.count--; + } + spin_unlock(&compr_pool.lock); + + if (page) + return page; + return alloc_page(GFP_NOFS); } void btrfs_free_compr_page(struct page *page) { + bool do_free = false; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > compr_pool.thresh) { + do_free = true; + } else { + list_add(&page->lru, &compr_pool.list); + compr_pool.count++; + } + spin_unlock(&compr_pool.lock); + + if (!do_free) + return; + ASSERT(page_ref_count(page) == 1); put_page(page); } @@ -974,15 +1055,36 @@ int __init btrfs_init_compress(void) offsetof(struct compressed_bio, bbio.bio), BIOSET_NEED_BVECS)) return -ENOMEM; + + compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages"); + if (!compr_pool.shrinker) + return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + + spin_lock_init(&compr_pool.lock); + INIT_LIST_HEAD(&compr_pool.list); + compr_pool.count = 0; + /* 128K / 4K = 32, for 8 threads is 256 pages. */ + compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8; + compr_pool.shrinker->count_objects = btrfs_compr_pool_count; + compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan; + compr_pool.shrinker->batch = 32; + compr_pool.shrinker->seeks = DEFAULT_SEEKS; + shrinker_register(compr_pool.shrinker); + return 0; } void __cold btrfs_exit_compress(void) { + /* For now scan drains all pages and does not touch the parameters. */ + btrfs_compr_pool_scan(NULL, NULL); + shrinker_free(compr_pool.shrinker); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); From cfbf07e2787e4da79c63622f1a6e64cc89f3a829 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 17 Nov 2023 14:24:14 +1030 Subject: [PATCH 0825/1562] btrfs: migrate to use folio private instead of page private As a cleanup and preparation for future folio migration, this patch would replace all page->private to folio version. This includes: - PagePrivate() -> folio_test_private() - page->private -> folio_get_private() - attach_page_private() -> folio_attach_private() - detach_page_private() -> folio_detach_private() Since we're here, also remove the forced cast on page->private, since it's (void *) already, we don't really need to do the cast. For now even if we missed some call sites, it won't cause any problem yet, as we're only using order 0 folio (single page), thus all those folio/page flags should be synced. But for the future conversion to utilize higher order folio, the page <-> folio flag sync is no longer guaranteed, thus we have to migrate to utilize folio flags. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 102 +++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 6 +-- fs/btrfs/file.c | 4 +- fs/btrfs/inode.c | 7 +-- fs/btrfs/subpage.c | 94 ++++++++++++++++++++++++--------------- 5 files changed, 124 insertions(+), 89 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f724c54fc8e..d68626d1c286 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -562,11 +562,13 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); + ASSERT(PageLocked(page)); if (!btrfs_is_subpage(fs_info, page)) return; - ASSERT(PagePrivate(page)); + ASSERT(folio_test_private(folio)); btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } @@ -865,6 +867,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, struct page *page, struct btrfs_subpage *prealloc) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -878,22 +881,22 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, lockdep_assert_held(&page->mapping->private_lock); if (fs_info->nodesize >= PAGE_SIZE) { - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (!folio_test_private(folio)) + folio_attach_private(folio, eb); else - WARN_ON(page->private != (unsigned long)eb); + WARN_ON(folio_get_private(folio) != eb); return 0; } /* Already mapped, just free prealloc */ - if (PagePrivate(page)) { + if (folio_test_private(folio)) { btrfs_free_subpage(prealloc); return 0; } if (prealloc) /* Has preallocated memory for subpage */ - attach_page_private(page, prealloc); + folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ ret = btrfs_attach_subpage(fs_info, page, @@ -903,11 +906,12 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, int set_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (PagePrivate(page)) + if (folio_test_private(folio)) return 0; fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -915,24 +919,25 @@ int set_page_extent_mapped(struct page *page) if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; } void clear_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return; fs_info = btrfs_sb(page->mapping->host->i_sb); if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); - detach_page_private(page); + folio_detach_private(folio); } static struct extent_map * @@ -1240,7 +1245,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ @@ -1725,6 +1731,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; @@ -1732,7 +1739,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; u64 start; @@ -1742,7 +1749,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * in the meantime. */ spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { spin_unlock(&page->mapping->private_lock); break; } @@ -1807,22 +1814,23 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; + struct folio *folio = page_folio(page); struct extent_buffer *eb; int ret; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { spin_unlock(&mapping->private_lock); return 0; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); /* * Shouldn't happen and normally this would be a BUG_ON but no point @@ -3060,12 +3068,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; lockdep_assert_held(&page->mapping->private_lock); - if (PagePrivate(page)) { - subpage = (struct btrfs_subpage *)page->private; + if (folio_test_private(folio)) { + subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; /* @@ -3082,15 +3091,16 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + struct folio *folio = page_folio(page); /* - * For mapped eb, we're going to change the page private, which should + * For mapped eb, we're going to change the folio private, which should * be done under the private_lock. */ if (mapped) spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { if (mapped) spin_unlock(&page->mapping->private_lock); return; @@ -3101,19 +3111,15 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to + * only clear folio if it's still connected to * this eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { + if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(PageDirty(page)); BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - detach_page_private(page); + /* We need to make sure we haven't be attached to a new eb. */ + folio_detach_private(folio); } if (mapped) spin_unlock(&page->mapping->private_lock); @@ -3121,9 +3127,9 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag } /* - * For subpage, we can have dummy eb with page private. In this case, - * we can directly detach the private as such page is only attached to - * one dummy eb, no sharing. + * For subpage, we can have dummy eb with folio private attached. In + * this case, we can directly detach the private as such folio is only + * attached to one dummy eb, no sharing. */ if (!mapped) { btrfs_detach_subpage(fs_info, page); @@ -3133,7 +3139,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag btrfs_page_dec_eb_refs(fs_info, page); /* - * We can only detach the page private if there are no other ebs in the + * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ if (!page_range_has_eb(fs_info, page)) @@ -3410,6 +3416,7 @@ free_eb: static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *exists; /* @@ -3421,21 +3428,21 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* Page not yet attached to an extent buffer */ - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; /* * We could have already allocated an eb for this page and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can - * just overwrite page->private. + * just overwrite folio private. */ - exists = (struct extent_buffer *)page->private; + exists = folio_get_private(folio); if (atomic_inc_not_zero(&exists->refs)) return exists; WARN_ON(PageDirty(page)); - detach_page_private(page); + folio_detach_private(folio); return NULL; } @@ -3519,7 +3526,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); /* - * Preallocate page->private for subpage case, so that we won't + * Preallocate folio private for subpage case, so that we won't * allocate memory with private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed @@ -3556,7 +3563,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, ASSERT(!ret); /* * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the page private + * detach_extent_buffer_page() won't release the folio private * when the eb hasn't yet been inserted into radix tree. * * The ref will be decreased when the eb released the page, in @@ -4520,7 +4527,7 @@ static int try_release_subpage_extent_buffer(struct page *page) struct extent_buffer *eb = NULL; /* - * Unlike try_release_extent_buffer() which uses page->private + * Unlike try_release_extent_buffer() which uses folio private * to grab buffer, for subpage case we rely on radix tree, thus * we need to ensure radix tree consistency. * @@ -4560,17 +4567,17 @@ static int try_release_subpage_extent_buffer(struct page *page) /* * Here we don't care about the return value, we will always - * check the page private at the end. And + * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); } /* - * Finally to check if we have cleared page private, as if we have - * released all ebs in the page, the page private should be cleared now. + * Finally to check if we have cleared folio private, as if we have + * released all ebs in the page, the folio private should be cleared now. */ spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) + if (!folio_test_private(page_folio(page))) ret = 1; else ret = 0; @@ -4581,22 +4588,23 @@ static int try_release_subpage_extent_buffer(struct page *page) int try_release_extent_buffer(struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *eb; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* - * We need to make sure nobody is changing page->private, as we rely on - * page->private as the pointer to extent buffer. + * We need to make sure nobody is changing folio private, as we rely on + * folio private as the pointer to extent buffer. */ spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { spin_unlock(&page->mapping->private_lock); return 1; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2171057a4477..8eac8384b24c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -43,10 +43,10 @@ enum { }; /* - * page->private values. Every page that is controlled by the extent - * map has page->private set to one. + * Folio private values. Every page that is controlled by the extent map has + * folio private set to this value. */ -#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_FOLIO_PRIVATE 1 /* * The extent buffer bitmap operations are done with byte granularity instead of diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32611a4edd6b..7a71720aaed2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -869,9 +869,9 @@ static int prepare_uptodate_page(struct inode *inode, * released. * * The private flag check is essential for subpage as we need - * to store extra bitmap using page->private. + * to store extra bitmap using folio private. */ - if (page->mapping != inode->i_mapping || !PagePrivate(page)) { + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { unlock_page(page); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3305472453f9..dfef726e84e8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4725,7 +4725,7 @@ again: /* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared - * PagePrivate(), but left the page in the mapping. Set the page mapped + * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ ret = set_page_extent_mapped(page); @@ -7851,13 +7851,14 @@ static void btrfs_readahead(struct readahead_control *rac) static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; if (!btrfs_is_subpage(fs_info, page)) return; - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..caf0013f2545 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -118,6 +118,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; /* @@ -127,28 +128,29 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, if (page->mapping) ASSERT(PageLocked(page)); - /* Either not subpage, or the page already has private attached */ - if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, page) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); if (IS_ERR(subpage)) return PTR_ERR(subpage); - attach_page_private(page, subpage); + folio_attach_private(folio, subpage); return 0; } void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - /* Either not subpage, or already detached */ - if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, page) || !folio_test_private(folio)) return; - subpage = detach_page_private(page); + subpage = folio_detach_private(folio); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -188,36 +190,38 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) * This is important for eb allocation, to prevent race with last eb freeing * of the same page. * With the eb_refs increased before the eb inserted into radix tree, - * detach_extent_buffer_page() won't detach the page private while we're still + * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; if (!btrfs_is_subpage(fs_info, page)) return; - ASSERT(PagePrivate(page) && page->mapping); + ASSERT(folio_test_private(folio) && page->mapping); lockdep_assert_held(&page->mapping->private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); atomic_inc(&subpage->eb_refs); } void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; if (!btrfs_is_subpage(fs_info, page)) return; - ASSERT(PagePrivate(page) && page->mapping); + ASSERT(folio_test_private(folio) && page->mapping); lockdep_assert_held(&page->mapping->private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } @@ -225,8 +229,10 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { + struct folio *folio = page_folio(page); + /* Basic checks */ - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); /* @@ -241,7 +247,8 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; btrfs_subpage_assert(fs_info, page, start, len); @@ -252,7 +259,8 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; bool is_data; bool last; @@ -294,7 +302,8 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); int ret; @@ -308,7 +317,8 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); btrfs_subpage_assert(fs_info, page, start, len); @@ -340,12 +350,14 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { + struct folio *folio = page_folio(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { lock_page(page); return 0; } lock_page(page); - if (!PagePrivate(page) || !page->private) { + if (!folio_test_private(folio) || !folio_get_private(folio)) { unlock_page(page); return -EAGAIN; } @@ -387,7 +399,8 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, uptodate, start, len); unsigned long flags; @@ -402,7 +415,8 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, uptodate, start, len); unsigned long flags; @@ -416,7 +430,8 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, dirty, start, len); unsigned long flags; @@ -440,7 +455,8 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, dirty, start, len); unsigned long flags; @@ -467,7 +483,8 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, writeback, start, len); unsigned long flags; @@ -481,7 +498,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, writeback, start, len); unsigned long flags; @@ -498,7 +516,8 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, ordered, start, len); unsigned long flags; @@ -512,7 +531,8 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, ordered, start, len); unsigned long flags; @@ -527,7 +547,8 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, checked, start, len); unsigned long flags; @@ -542,7 +563,8 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, page, checked, start, len); unsigned long flags; @@ -561,7 +583,8 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + struct folio *folio = page_folio(page); \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ name, start, len); \ unsigned long flags; \ @@ -656,7 +679,8 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked) void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; @@ -665,7 +689,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!btrfs_is_subpage(fs_info, page)) return; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); } @@ -687,6 +711,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; ASSERT(PageLocked(page)); @@ -694,8 +719,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, if (!btrfs_is_subpage(fs_info, page)) return unlock_page(page); - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * For subpage case, there are two types of locked page. With or @@ -720,6 +745,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; unsigned long uptodate_bitmap; unsigned long error_bitmap; @@ -729,9 +755,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long checked_bitmap; unsigned long flags; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_info); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); From a5e182d85fa55557496ad751c88a37f3c0590242 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 02:50:13 +0100 Subject: [PATCH 0826/1562] btrfs: scrub: remove unused scrub_ctx::sectors_per_bio The recent scrub rewrite forgot to remove the sectors_per_bio in 6.3 in 13a62fd997f0 ("btrfs: scrub: remove scrub_bio structure"). This was found by tool https://github.com/jirislaby/clang-struct . Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f62a408671cb..00826644bca8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -192,7 +192,6 @@ struct scrub_ctx { int cur_stripe; atomic_t cancel_req; int readonly; - int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; From 3d72941664460153362f81ed66089d65538c3d39 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 02:50:15 +0100 Subject: [PATCH 0827/1562] btrfs: remove unused btrfs_ordered_extent::outstanding_isize The whole isize code was deleted in 5.6 3f1c64ce0438 ("btrfs: delete the ordered isize update code"), except the struct member. This was found by tool https://github.com/jirislaby/clang-struct . Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ordered-data.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 567a6d3d4712..127ef8bf0ffd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -96,13 +96,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* - * the end of the ordered extent which is behind it but - * didn't update disk_i_size. Please see the comment of - * btrfs_ordered_update_i_size(); - */ - u64 outstanding_isize; - /* * If we get truncated we need to adjust the file extent we enter for * this ordered extent so that we do not expose stale data. From a0df0a2680353fbfd7a14aaab4624f22d539b876 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 02:50:17 +0100 Subject: [PATCH 0828/1562] btrfs: raid56: remove unused btrfs_plug_cb::work The raid56 changes in 6.2 reworked the IO path to RMW, commit 93723095b5d5 ("btrfs: raid56: switch write path to rmw_rbio()") in particular removed the last use of the work member so it can be removed as well. This was found by tool https://github.com/jirislaby/clang-struct . Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3e014b9370a3..90f12c0e88a1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1549,7 +1549,6 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct work_struct work; }; /* From 49542050b1a172c67005e4d63f90429b4ae50b01 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 02:50:19 +0100 Subject: [PATCH 0829/1562] btrfs: remove unused definition of tree_entry in extent-io-tree.c The declaration was temporarily moved in a4055213bf69 ("btrfs: unexport all the temporary exports for extent-io-tree.c") and then should have been removed in 6.0 in 071d19f5130f ("btrfs: remove struct tree_entry in extent-io-tree.c") but was not. This was found by tool https://github.com/jirislaby/clang-struct . Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index ea149be28dff..76061245a46b 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -87,12 +87,6 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, */ static struct lock_class_key file_extent_tree_class; -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; - void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) { From 46524fab690ea5ee7b7a8c6b788d06765cdf8db1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 02:50:21 +0100 Subject: [PATCH 0830/1562] btrfs: remove unused btrfs_root::type Looks like the struct member was added in 2007 in 2.6.29 in commit 87ee04eb0f2f ("Btrfs: Add simple stripe size parameter") but hasn't been used at all since. So let's remove it. This was found by tool https://github.com/jirislaby/clang-struct, then build tested after removing the struct member. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c0800f5bdcb..54fd4eb92745 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -212,8 +212,6 @@ struct btrfs_root { u64 last_trans; - u32 type; - u64 free_objectid; struct btrfs_key defrag_progress; From 5031660a1b6a7ca7f9a1c55ebf0c157255826915 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:34 +0000 Subject: [PATCH 0831/1562] btrfs: mark sanity checks when getting chunk map as unlikely When getting a chunk map, at btrfs_get_chunk_map(), we do some sanity checks to verify that we found an extent map and that it includes the requested logical address. These are never expected to fail, so mark them as unlikely to make it more clear as well as to allow a compiler to generate more efficient code. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f627674b37db..c66e1c6c0410 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3005,14 +3005,14 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); - if (!em) { + if (unlikely(!em)) { btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len <= logical) { + if (unlikely(em->start > logical || em->start + em->len <= logical)) { btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, em->start, em->start + em->len); From 3128b548c759da4263b44306093d3a1751dcc58d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:35 +0000 Subject: [PATCH 0832/1562] btrfs: split assert into two different asserts when removing block group When starting a transaction to remove a block group we have one ASSERT that checks we found an extent map and that the extent map's start offset matches the desired chunk offset. In case one of the conditions fails, we get a stack trace that point to the respective line of code, however we can't tell which condition failed: either there's no extent map or we got one with an unexpected start offset. To make such an issue easier to debug and analyse, split the assertion into two, one for each condition. This was actually triggered during development of another upcoming change. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6e5dc68ff661..fca653cc977c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1303,7 +1303,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); + ASSERT(em != NULL); + ASSERT(em->start == chunk_offset); /* * We need to reserve 3 + N units from the metadata space info in order From 2ecec0d6a5b5817edf50fe80196ca774e72dae46 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:36 +0000 Subject: [PATCH 0833/1562] btrfs: unexport extent_map_block_end() The helper extent_map_block_end() is currently not used anywhere outside extent_map.c, so move into from extent_map.h into extent_map.c. While at it, also make the extent map pointer argument as const. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 7 +++++++ fs/btrfs/extent_map.h | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a6d8368ed0ed..bced39dc0da8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -182,6 +182,13 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } +static inline u64 extent_map_block_end(const struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + /* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 35d27c756e08..d0328127f89c 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -77,13 +77,6 @@ static inline u64 extent_map_end(struct extent_map *em) return em->start + em->len; } -static inline u64 extent_map_block_end(struct extent_map *em) -{ - if (em->block_start + em->block_len < em->block_start) - return (u64)-1; - return em->block_start + em->block_len; -} - void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); From ebb0beca6c6a2d33f809a74bad63261651237833 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:37 +0000 Subject: [PATCH 0834/1562] btrfs: use btrfs_next_item() at scrub.c:find_first_extent_item() There's no reason to open code what btrfs_next_item() does when searching for extent items at scrub.c:scrub.c:find_first_extent_item(), so remove the logic to find the next item and use btrfs_next_item() instead, making the code shorter and less nested code blocks. While at it also fix the comment to the plural "items" instead of "item" and end it with proper punctuation. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 00826644bca8..2b04cdb3c4e9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1408,14 +1408,11 @@ search_forward: if (ret > 0) break; next: - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(extent_root, path); - if (ret) { - /* Either no more item or fatal error */ - btrfs_release_path(path); - return ret; - } + ret = btrfs_next_item(extent_root, path); + if (ret) { + /* Either no more items or a fatal error. */ + btrfs_release_path(path); + return ret; } } btrfs_release_path(path); From 7dc66abb5a47778d7db327783a0ba172b8cff0b5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:38 +0000 Subject: [PATCH 0835/1562] btrfs: use a dedicated data structure for chunk maps Currently we abuse the extent_map structure for two purposes: 1) To actually represent extents for inodes; 2) To represent chunk mappings. This is odd and has several disadvantages: 1) To create a chunk map, we need to do two memory allocations: one for an extent_map structure and another one for a map_lookup structure, so more potential for an allocation failure and more complicated code to manage and link two structures; 2) For a chunk map we actually only use 3 fields (24 bytes) of the respective extent map structure: the 'start' field to have the logical start address of the chunk, the 'len' field to have the chunk's size, and the 'orig_block_len' field to contain the chunk's stripe size. Besides wasting a memory, it's also odd and not intuitive at all to have the stripe size in a field named 'orig_block_len'. We are also using 'block_len' of the extent_map structure to contain the chunk size, so we have 2 fields for the same value, 'len' and 'block_len', which is pointless; 3) When an extent map is associated to a chunk mapping, we set the bit EXTENT_FLAG_FS_MAPPING on its flags and then make its member named 'map_lookup' point to the associated map_lookup structure. This means that for an extent map associated to an inode extent, we are not using this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform); 4) Extent maps associated to a chunk mapping are never merged or split so it's pointless to use the existing extent map infrastructure. So add a dedicated data structure named 'btrfs_chunk_map' to represent chunk mappings, this is basically the existing map_lookup structure with some extra fields: 1) 'start' to contain the chunk logical address; 2) 'chunk_len' to contain the chunk's length; 3) 'stripe_size' for the stripe size; 4) 'rb_node' for insertion into a rb tree; 5) 'refs' for reference counting. This way we do a single memory allocation for chunk mappings and we don't waste memory for them with unused/unnecessary fields from an extent_map. We also save 8 bytes from the extent_map structure by removing the 'map_lookup' pointer, so the size of struct extent_map is reduced from 144 bytes down to 136 bytes, and we can now have 30 extents map per 4K page instead of 28. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 163 ++++----- fs/btrfs/block-group.h | 6 +- fs/btrfs/dev-replace.c | 28 +- fs/btrfs/disk-io.c | 7 +- fs/btrfs/extent_map.c | 46 --- fs/btrfs/extent_map.h | 4 - fs/btrfs/fs.h | 3 +- fs/btrfs/inode.c | 25 +- fs/btrfs/raid56.h | 2 +- fs/btrfs/scrub.c | 39 +-- fs/btrfs/tests/btrfs-tests.c | 3 +- fs/btrfs/tests/btrfs-tests.h | 1 + fs/btrfs/tests/extent-map-tests.c | 40 +-- fs/btrfs/volumes.c | 540 ++++++++++++++++++------------ fs/btrfs/volumes.h | 45 ++- fs/btrfs/zoned.c | 35 +- include/trace/events/btrfs.h | 11 +- 17 files changed, 505 insertions(+), 493 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fca653cc977c..0fea258eea15 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) cache); kfree(cache->free_space_ctl); - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); kfree(cache); } } @@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) + struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; @@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; + bool remove_map; bool remove_rsv = false; - block_group = btrfs_lookup_block_group(fs_info, group_start); + block_group = btrfs_lookup_block_group(fs_info, map->start); BUG_ON(!block_group); BUG_ON(!block_group->ro); @@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * entries because we already removed them all when we called * btrfs_remove_free_space_cache(). * - * And we must not remove the extent map from the fs_info->mapping_tree + * And we must not remove the chunk map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space * ranges from being reused for a new block group. This is needed to * avoid races with trimming and scrub. @@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * in place until the extents have been discarded completely when * the transaction commit has completed. */ - remove_em = (atomic_read(&block_group->frozen) == 0); + remove_map = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } + if (remove_map) + btrfs_remove_chunk_map(fs_info, map); out: /* Once for the lookup reference */ @@ -1295,16 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { struct btrfs_root *root = btrfs_block_group_root(fs_info); - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned int num_items; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em != NULL); - ASSERT(em->start == chunk_offset); + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + ASSERT(map != NULL); + ASSERT(map->start == chunk_offset); /* * We need to reserve 3 + N units from the metadata space info in order @@ -1325,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = em->map_lookup; num_items = 3 + map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return btrfs_start_transaction_fallback_global_rsv(root, num_items); } @@ -1928,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; struct extent_buffer *leaf; int slot; @@ -1939,23 +1925,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, slot = path->slots[0]; leaf = path->nodes[0]; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, key->objectid, key->offset); - read_unlock(&em_tree->lock); - if (!em) { + map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset); + if (!map) { btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk", key->objectid, key->offset); return -ENOENT; } - if (em->start != key->objectid || em->len != key->offset) { + if (map->start != key->objectid || map->chunk_len != key->offset) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", - key->objectid, key->offset, em->start, em->len); + key->objectid, key->offset, map->start, map->chunk_len); ret = -EUCLEAN; - goto out_free_em; + goto out_free_map; } read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), @@ -1963,16 +1946,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type)); ret = -EUCLEAN; } -out_free_em: - free_extent_map(em); +out_free_map: + btrfs_free_chunk_map(map); return ret; } @@ -2025,8 +2008,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 *buf; u64 bytenr; u64 data_stripe_length; @@ -2034,14 +2016,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, int i, nr = 0; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(map)) return -EIO; - map = em->map_lookup; - data_stripe_length = em->orig_block_len; + data_stripe_length = map->stripe_size; io_stripe_size = BTRFS_STRIPE_LEN; - chunk_start = em->start; + chunk_start = map->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -2095,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, *naddrs = nr; *stripe_len = io_stripe_size; out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2200,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->lock); + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg; + /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to + * btrfs_find_chunk_map() will return the first chunk map + * intersecting the range, so setting @length to 1 is enough to * get the first chunk. */ - em = lookup_extent_mapping(map_tree, start, 1); - read_unlock(&map_tree->lock); - if (!em) + map = btrfs_find_chunk_map(fs_info, start, 1); + if (!map) break; - bg = btrfs_lookup_block_group(fs_info, em->start); + bg = btrfs_lookup_block_group(fs_info, map->start); if (!bg) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); + map->start, map->chunk_len); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); break; } - if (bg->start != em->start || bg->length != em->len || + if (bg->start != map->start || bg->length != map->chunk_len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + map->start, map->chunk_len, + map->type & BTRFS_BLOCK_GROUP_TYPE_MASK, bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = em->start + em->len; - free_extent_map(em); + start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; @@ -2370,28 +2349,25 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct rb_node *node; int ret = 0; - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - struct extent_map *em; - struct map_lookup *map; + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - em = rb_entry(node, struct extent_map, rb_node); - map = em->map_lookup; - bg = btrfs_create_block_group_cache(fs_info, em->start); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + bg = btrfs_create_block_group_cache(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; } /* Fill dummy cache as FULL */ - bg->length = em->len; + bg->length = map->chunk_len; bg->flags = map->type; bg->cached = BTRFS_CACHE_FINISHED; - bg->used = em->len; + bg->used = map->chunk_len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); /* @@ -2619,19 +2595,17 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_offset; u64 stripe_size; int i; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); - if (IS_ERR(em)) - return PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(map)) + return PTR_ERR(map); - map = em->map_lookup; - stripe_size = em->orig_block_len; + stripe_size = map->stripe_size; /* * Take the device list mutex to prevent races with the final phase of @@ -2654,7 +2628,7 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -4407,8 +4381,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache) void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct extent_map_tree *em_tree; - struct extent_map *em; bool cleanup; spin_lock(&block_group->lock); @@ -4417,17 +4389,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_unlock(&block_group->lock); if (cleanup) { - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->start, - 1); - BUG_ON(!em); /* logic error, can't happen */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); + struct btrfs_chunk_map *map; - /* once for us and once for the tree */ - free_extent_map(em); - free_extent_map(em); + map = btrfs_find_chunk_map(fs_info, block_group->start, 1); + /* Logic error, can't happen. */ + ASSERT(map); + + btrfs_remove_chunk_map(fs_info, map); + + /* Once for our lookup reference. */ + btrfs_free_chunk_map(map); /* * We may have left one free space entry and other possible diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 2bdbcb834f95..c4a1f01cc1c2 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -5,6 +5,8 @@ #include "free-space-cache.h" +struct btrfs_chunk_map; + enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR, @@ -243,7 +245,7 @@ struct btrfs_block_group { u64 zone_unusable; u64 zone_capacity; u64 meta_write_pointer; - struct map_lookup *physical_map; + struct btrfs_chunk_map *physical_map; struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; @@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); + struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); void btrfs_reclaim_bgs_work(struct work_struct *work); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f9544fda38e9..1502d664c892 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, u64 physical) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 chunk_offset = cache->start; int num_extents, cur_extent; int i; @@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } spin_unlock(&cache->lock); - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - ASSERT(!IS_ERR(em)); - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(map)); num_extents = 0; cur_extent = 0; @@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, cur_extent = i; } - free_extent_map(em); + btrfs_free_chunk_map(map); if (num_extents > 1 && cur_extent < num_extents - 1) { /* @@ -812,25 +810,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; u64 start = 0; int i; - write_lock(&em_tree->lock); + write_lock(&fs_info->mapping_tree_lock); do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) + struct btrfs_chunk_map *map; + + map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); + if (!map) break; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); + start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); } while (start); - write_unlock(&em_tree->lock); + write_unlock(&fs_info->mapping_tree_lock); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d8d175d5a59..17ec983ea672 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2720,7 +2720,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->allocated_ebs); spin_lock_init(&fs_info->eb_leak_lock); #endif - extent_map_tree_init(&fs_info->mapping_tree); + fs_info->mapping_tree = RB_ROOT_CACHED; + rwlock_init(&fs_info->mapping_tree_lock); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -3604,7 +3605,7 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); iput(fs_info->btree_inode); fail: @@ -4387,7 +4388,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); btrfs_close_devices(fs_info->fs_devices); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bced39dc0da8..c956b1ced69f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -67,8 +67,6 @@ void free_extent_map(struct extent_map *em) if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } @@ -217,13 +215,8 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) ASSERT(next->block_start != EXTENT_MAP_DELALLOC && prev->block_start != EXTENT_MAP_DELALLOC); - if (prev->map_lookup || next->map_lookup) - ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && - test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); - if (extent_map_end(prev) == next->start && prev->flags == next->flags && - prev->map_lookup == next->map_lookup && ((next->block_start == EXTENT_MAP_HOLE && prev->block_start == EXTENT_MAP_HOLE) || (next->block_start == EXTENT_MAP_INLINE && @@ -361,39 +354,6 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, try_merge_map(tree, em); } -static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); - } -} - -static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); - } -} - /* * Add new extent map to the extent tree * @@ -419,10 +379,6 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; setup_extent_mapping(tree, em, modified); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { - extent_map_device_set_bits(em, CHUNK_ALLOCATED); - extent_map_device_clear_bits(em, CHUNK_TRIMMED); - } out: return ret; } @@ -506,8 +462,6 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d0328127f89c..bae14af197ef 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -23,8 +23,6 @@ enum { EXTENT_FLAG_LOGGING, /* Filling in a preallocated extent */ EXTENT_FLAG_FILLING, - /* filesystem extent mapping type */ - EXTENT_FLAG_FS_MAPPING, /* This em is merged from two or more physically adjacent ems */ EXTENT_FLAG_MERGED, }; @@ -50,8 +48,6 @@ struct extent_map { */ u64 generation; unsigned long flags; - /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ - struct map_lookup *map_lookup; refcount_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 318df6f9d9cb..a3debac2819a 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -398,7 +398,8 @@ struct btrfs_fs_info { struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; + struct rb_root_cached mapping_tree; + rwlock_t mapping_tree_lock; /* * Block reservation for extent, checksum, root tree and delayed dir diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dfef726e84e8..140e9c045cf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10565,6 +10565,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; + struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, @@ -10704,13 +10705,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out; } - em = btrfs_get_chunk_map(fs_info, logical_block_start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(map)) { + ret = PTR_ERR(map); goto out; } - if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; @@ -10718,23 +10719,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } if (device == NULL) { - device = em->map_lookup->stripes[0].dev; + device = map->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; - } else if (device != em->map_lookup->stripes[0].dev) { + } else if (device != map->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } - physical_block_start = (em->map_lookup->stripes[0].physical + - (logical_block_start - em->start)); - len = min(len, em->len - (logical_block_start - em->start)); - free_extent_map(em); - em = NULL; + physical_block_start = (map->stripes[0].physical + + (logical_block_start - map->start)); + len = min(len, map->chunk_len - (logical_block_start - map->start)); + btrfs_free_chunk_map(map); + map = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { @@ -10787,6 +10788,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); + if (!IS_ERR_OR_NULL(map)) + btrfs_free_chunk_map(map); unlock_extent(io_tree, 0, isize - 1, &cached_state); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 45e6ff78316f..470213688872 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -164,7 +164,7 @@ struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_data_stripes(const struct map_lookup *map) +static inline int nr_data_stripes(const struct btrfs_chunk_map *map) { return map->num_stripes - btrfs_nr_parity_stripes(map->type); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2b04cdb3c4e9..061d54148568 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1279,7 +1279,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset, + struct btrfs_chunk_map *map, u64 *offset, u64 *stripe_start) { int i; @@ -1894,7 +1894,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 full_stripe_start) { DECLARE_COMPLETION_ONSTACK(io_done); @@ -2063,7 +2063,7 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2124,7 +2124,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Calculate the full stripe length for simple stripe based profiles */ -static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2133,7 +2133,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) } /* Get the logical bytenr for the stripe */ -static u64 simple_stripe_get_logical(struct map_lookup *map, +static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, struct btrfs_block_group *bg, int stripe_index) { @@ -2150,7 +2150,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, } /* Get the mirror number for the stripe */ -static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2162,7 +2162,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) static int scrub_simple_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct btrfs_device *device, int stripe_index) { @@ -2195,18 +2195,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct extent_map *em, + struct btrfs_chunk_map *map, struct btrfs_device *scrub_dev, int stripe_index) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; int ret2; u64 physical = map->stripes[stripe_index].physical; - const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 dev_stripe_len = btrfs_calc_stripe_length(map); const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; @@ -2369,17 +2368,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; int i; int ret = 0; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, bg->start, bg->length); - read_unlock(&map_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); + if (!map) { /* * Might have been an unused block group deleted by the cleaner * kthread or relocation. @@ -2391,22 +2385,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - if (em->start != bg->start) + if (map->start != bg->start) goto out; - if (em->len < dev_extent_len) + if (map->chunk_len < dev_extent_len) goto out; - map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, em, scrub_dev, i); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i); if (ret) goto out; } } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ca09cf9afce8..b50cfac7ad4e 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -28,6 +28,7 @@ const char *test_error[] = { [TEST_ALLOC_INODE] = "cannot allocate inode", [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", + [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", }; static const struct super_operations btrfs_test_super_ops = { @@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, dev_list) { btrfs_free_dummy_device(dev); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 7a2d7ffbe30e..dc2f2ab15fa5 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,6 +23,7 @@ enum { TEST_ALLOC_INODE, TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, + TEST_ALLOC_CHUNK_MAP, }; extern const char *test_error[]; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 29bdd08b241f..8602f94cc29d 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -859,33 +859,21 @@ struct rmap_test_vector { static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { - struct extent_map *em; - struct map_lookup *map = NULL; + struct btrfs_chunk_map *map; u64 *logical = NULL; int i, out_ndaddrs, out_stripe_len; int ret; - em = alloc_extent_map(); - if (!em) { - test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; - } - - map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); if (!map) { - kfree(em); - test_std_err(TEST_ALLOC_EXTENT_MAP); + test_std_err(TEST_ALLOC_CHUNK_MAP); return -ENOMEM; } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); /* Start at 4GiB logical address */ - em->start = SZ_4G; - em->len = test->data_stripe_size * test->num_data_stripes; - em->block_len = em->len; - em->orig_block_len = test->data_stripe_size; - em->map_lookup = map; - + map->start = SZ_4G; + map->chunk_len = test->data_stripe_size * test->num_data_stripes; + map->stripe_size = test->data_stripe_size; map->num_stripes = test->num_stripes; map->type = test->raid_type; @@ -901,15 +889,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, map->stripes[i].physical = test->data_stripe_phys_start[i]; } - write_lock(&fs_info->mapping_tree.lock); - ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); - write_unlock(&fs_info->mapping_tree.lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret) { - test_err("error adding block group mapping to mapping tree"); + test_err("error adding chunk map to mapping tree"); goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", @@ -938,14 +924,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: - write_lock(&fs_info->mapping_tree.lock); - remove_extent_mapping(&fs_info->mapping_tree, em); - write_unlock(&fs_info->mapping_tree.lock); - /* For us */ - free_extent_map(em); + btrfs_remove_chunk_map(fs_info, map); out_free: - /* For the tree */ - free_extent_map(em); kfree(logical); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c66e1c6c0410..1cc6b5d5eb61 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1742,19 +1742,18 @@ out: static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree; - struct extent_map *em; struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - n = rb_last(&em_tree->map.rb_root); + read_lock(&fs_info->mapping_tree_lock); + n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { - em = rb_entry(n, struct extent_map, rb_node); - ret = em->start + em->len; + struct btrfs_chunk_map *map; + + map = rb_entry(n, struct btrfs_chunk_map, rb_node); + ret = map->start + map->chunk_len; } - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } @@ -2986,6 +2985,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev; + struct btrfs_chunk_map *map; + struct btrfs_chunk_map *prev_map = NULL; + + while (node) { + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + prev = node; + prev_map = map; + + if (logical < map->start) { + node = node->rb_left; + } else if (logical >= map->start + map->chunk_len) { + node = node->rb_right; + } else { + refcount_inc(&map->refs); + return map; + } + } + + if (!prev) + return NULL; + + orig_prev = prev; + while (prev && logical >= prev_map->start + prev_map->chunk_len) { + prev = rb_next(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + + if (!prev) { + prev = orig_prev; + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + while (prev && logical < prev_map->start) { + prev = rb_prev(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + } + + if (prev) { + u64 end = logical + length; + + /* + * Caller can pass a U64_MAX length when it wants to get any + * chunk starting at an offset of 'logical' or higher, so deal + * with underflow by resetting the end offset to U64_MAX. + */ + if (end < logical) + end = U64_MAX; + + if (end > prev_map->start && + logical < prev_map->start + prev_map->chunk_len) { + refcount_inc(&prev_map->refs); + return prev_map; + } + } + + return NULL; +} + +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_chunk_map *map; + + read_lock(&fs_info->mapping_tree_lock); + map = btrfs_find_chunk_map_nolock(fs_info, logical, length); + read_unlock(&fs_info->mapping_tree_lock); + + return map; +} + /* * Find the mapping containing the given logical extent. * @@ -2994,38 +3068,37 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * * Return: Chunk mapping or ERR_PTR. */ -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, length); - if (unlikely(!em)) { + if (unlikely(!map)) { + read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (unlikely(em->start > logical || em->start + em->len <= logical)) { + if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { + read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", - logical, logical + length, em->start, em->start + em->len); - free_extent_map(em); + logical, logical + length, map->start, + map->start + map->chunk_len); + btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } - /* callers are responsible for dropping em's ref. */ - return em; + /* Callers are responsible for dropping the reference. */ + return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, - struct map_lookup *map, u64 chunk_offset) + struct btrfs_chunk_map *map, u64 chunk_offset) { int i; @@ -3050,23 +3123,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) { + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - return PTR_ERR(em); + return PTR_ERR(map); } - map = em->map_lookup; /* * First delete the device extent items from the devices btree. @@ -3169,7 +3240,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) goto out; } - trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); + trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); @@ -3188,7 +3259,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); - ret = btrfs_remove_block_group(trans, chunk_offset, em); + ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3200,7 +3271,7 @@ out: trans->removing_chunk = false; } /* once for us */ - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5347,24 +5418,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } +static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); + } +} + +static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); + } +} + +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + write_lock(&fs_info->mapping_tree_lock); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + write_unlock(&fs_info->mapping_tree_lock); + + /* Once for the tree reference. */ + btrfs_free_chunk_map(map); +} + +EXPORT_FOR_TESTS +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + bool leftmost = true; + + write_lock(&fs_info->mapping_tree_lock); + p = &fs_info->mapping_tree.rb_root.rb_node; + while (*p) { + struct btrfs_chunk_map *entry; + + parent = *p; + entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); + + if (map->start < entry->start) { + p = &(*p)->rb_left; + } else if (map->start > entry->start) { + p = &(*p)->rb_right; + leftmost = false; + } else { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; + } + } + rb_link_node(&map->rb_node, parent, p); + rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); + chunk_map_device_set_bits(map, CHUNK_ALLOCATED); + chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + write_unlock(&fs_info->mapping_tree_lock); + + return 0; +} + +EXPORT_FOR_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) +{ + struct btrfs_chunk_map *map; + + map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); + if (!map) + return NULL; + + refcount_set(&map->refs, 1); + RB_CLEAR_NODE(&map->rb_node); + + return map; +} + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp) +{ + const int size = btrfs_chunk_map_size(map->num_stripes); + struct btrfs_chunk_map *clone; + + clone = kmemdup(map, size, gfp); + if (!clone) + return NULL; + + refcount_set(&clone->refs, 1); + RB_CLEAR_NODE(&clone->rb_node); + + return clone; +} + static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; + struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; - struct extent_map *em; u64 start = ctl->start; u64 type = ctl->type; int ret; int i; int j; - map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); + + map->start = start; + map->chunk_len = ctl->chunk_size; + map->stripe_size = ctl->stripe_size; + map->type = type; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; for (i = 0; i < ctl->ndevs; ++i) { @@ -5375,41 +5553,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = ctl->sub_stripes; trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); - em = alloc_extent_map(); - if (!em) { - kfree(map); - return ERR_PTR(-ENOMEM); - } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = start; - em->len = ctl->chunk_size; - em->block_start = 0; - em->block_len = em->len; - em->orig_block_len = ctl->stripe_size; - - em_tree = &info->mapping_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_chunk_map(info, map); if (ret) { - write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } - write_unlock(&em_tree->lock); block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); - if (IS_ERR(block_group)) - goto error_del_extent; + if (IS_ERR(block_group)) { + btrfs_remove_chunk_map(info, map); + return block_group; + } - for (i = 0; i < map->num_stripes; i++) { + for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; btrfs_device_set_bytes_used(dev, @@ -5422,22 +5581,9 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); - free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); - return block_group; - -error_del_extent: - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* One for our allocation */ - free_extent_map(em); - /* One for the tree reference */ - free_extent_map(em); - return block_group; } @@ -5514,8 +5660,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; size_t item_size; int i; int ret; @@ -5544,14 +5689,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, */ lockdep_assert_held(&fs_info->chunk_mutex); - em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(map)) { + ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } - map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); @@ -5608,7 +5752,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, out: kfree(chunk); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5653,7 +5797,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) return 0; } -static inline int btrfs_chunk_max_errors(struct map_lookup *map) +static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); @@ -5662,17 +5806,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) return false; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { @@ -5693,38 +5835,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } -void btrfs_mapping_tree_free(struct extent_map_tree *tree) +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { - struct extent_map *em; + write_lock(&fs_info->mapping_tree_lock); + while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { + struct btrfs_chunk_map *map; + struct rb_node *node; - while (1) { - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, 0, (u64)-1); - if (em) - remove_extent_mapping(tree, em); - write_unlock(&tree->lock); - if (!em) - break; - /* once for us */ - free_extent_map(em); - /* once for the tree */ - free_extent_map(em); + node = rb_first_cached(&fs_info->mapping_tree); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + /* Once for the tree ref. */ + btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } + write_unlock(&fs_info->mapping_tree_lock); } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; enum btrfs_raid_types index; int ret = 1; - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do @@ -5733,7 +5874,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - map = em->map_lookup; index = btrfs_bg_flags_to_raid_index(map->type); /* Non-RAID56, use their ncopies from btrfs_raid_array. */ @@ -5750,53 +5890,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if (!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); - free_extent_map(em); + btrfs_free_chunk_map(map); } return len; } int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int ret = 0; if (!btrfs_fs_incompat(fs_info, RAID56)) return 0; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if(!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; - free_extent_map(em); + btrfs_free_chunk_map(map); } return ret; } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, + struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { int i; @@ -5903,8 +6039,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; @@ -5922,11 +6057,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int ret; int i; - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -5934,8 +6067,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, goto out_free_map; } - offset = logical - em->start; - length = min_t(u64, em->start + em->len - logical, length); + offset = logical - map->start; + length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* @@ -6032,10 +6165,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, } } - free_extent_map(em); + btrfs_free_chunk_map(map); return stripes; out_free_map: - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } @@ -6133,7 +6266,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, +static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, enum btrfs_map_op op, u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { @@ -6183,7 +6316,7 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_stripe *dst, - struct map_lookup *map, u32 stripe_index, + struct btrfs_chunk_map *map, u32 stripe_index, u64 stripe_offset, u64 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; @@ -6237,8 +6370,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 map_offset; u64 stripe_offset; u32 stripe_nr; @@ -6263,17 +6395,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (mirror_num > num_copies) return -EINVAL; - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); - map = em->map_lookup; data_stripes = nr_data_stripes(map); - map_offset = logical - em->start; + map_offset = logical - map->start; max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, &stripe_offset, &raid56_full_stripe_start); - *length = min_t(u64, em->len - map_offset, max_len); + *length = min_t(u64, map->chunk_len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); @@ -6350,7 +6481,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, - raid56_full_stripe_start + em->start + + raid56_full_stripe_start + map->start + btrfs_stripe_nr_to_offset(data_stripes)) - logical; stripe_index = 0; @@ -6437,7 +6568,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. */ - bioc->full_stripe_logical = em->start + + bioc->full_stripe_logical = map->start + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); for (int i = 0; i < num_stripes; i++) { ret = set_io_stripe(fs_info, op, logical, length, @@ -6488,7 +6619,7 @@ out: /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -6660,12 +6791,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -u64 btrfs_calc_stripe_length(const struct extent_map *em) +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) { - const struct map_lookup *map = em->map_lookup; const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(em->len, data_stripes); + return div_u64(map->chunk_len, data_stripes); } #if BITS_PER_LONG == 32 @@ -6734,9 +6864,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; @@ -6770,35 +6898,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, return ret; } - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, logical, 1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ - if (em && em->start <= logical && em->start + em->len > logical) { - free_extent_map(em); + if (map && map->start <= logical && map->start + map->chunk_len > logical) { + btrfs_free_chunk_map(map); return 0; - } else if (em) { - free_extent_map(em); + } else if (map) { + btrfs_free_chunk_map(map); } - em = alloc_extent_map(); - if (!em) + map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); + if (!map) return -ENOMEM; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - free_extent_map(em); - return -ENOMEM; - } - - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = logical; - em->len = length; - em->orig_start = 0; - em->block_start = 0; - em->block_len = em->len; + map->start = logical; + map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); @@ -6813,7 +6928,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - em->orig_block_len = btrfs_calc_stripe_length(em); + map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6829,7 +6944,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } } @@ -6838,15 +6953,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, &(map->stripes[i].dev->dev_state)); } - write_lock(&map_tree->lock); - ret = add_extent_mapping(map_tree, em, 0); - write_unlock(&map_tree->lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", - em->start, em->len, ret); + map->start, map->chunk_len, ret); } - free_extent_map(em); return ret; } @@ -7156,26 +7268,21 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - u64 next_start = 0; + struct btrfs_chunk_map *map; + u64 next_start; bool ret = true; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, 0, (u64)-1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? Return false anyway */ - if (!em) { + if (!map) { ret = false; goto out; } - while (em) { - struct map_lookup *map; + while (map) { int missing = 0; int max_tolerated; int i; - map = em->map_lookup; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); @@ -7193,18 +7300,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", - em->start, missing, max_tolerated); - free_extent_map(em); + map->start, missing, max_tolerated); + btrfs_free_chunk_map(map); ret = false; goto out; } - next_start = extent_map_end(em); - free_extent_map(em); + next_start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, next_start, - (u64)(-1) - next_start); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; @@ -7697,20 +7801,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7718,12 +7817,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } - map = em->map_lookup; - stripe_len = btrfs_calc_stripe_length(em); + stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", - physical_offset, devid, em->start, physical_len, + physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; @@ -7746,7 +7844,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", - em->start); + map->start); ret = -EUCLEAN; goto out; } @@ -7792,32 +7890,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; struct rb_node *node; int ret = 0; - read_lock(&em_tree->lock); - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->map_lookup->num_stripes != - em->map_lookup->verified_stripes) { + read_lock(&fs_info->mapping_tree_lock); + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; + + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", - em->start, em->map_lookup->verified_stripes, - em->map_lookup->num_stripes); + map->start, map->verified_stripes, map->num_stripes); ret = -EUCLEAN; goto out; } } out: - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9cc374864a79..6c6faed2468a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -426,7 +426,8 @@ struct btrfs_discard_stripe { struct btrfs_io_context { refcount_t refs; struct btrfs_fs_info *fs_info; - u64 map_type; /* get from map_lookup->type */ + /* Taken from struct btrfs_chunk_map::type. */ + u64 map_type; struct bio *orig_bio; atomic_t error; u16 max_errors; @@ -529,18 +530,32 @@ struct btrfs_raid_attr { extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -struct map_lookup { +struct btrfs_chunk_map { + struct rb_node rb_node; + /* For mount time dev extent verification. */ + int verified_stripes; + refcount_t refs; + u64 start; + u64 chunk_len; + u64 stripe_size; u64 type; int io_align; int io_width; int num_stripes; int sub_stripes; - int verified_stripes; /* For mount time dev extent verification */ struct btrfs_io_stripe stripes[]; }; -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_io_stripe) * (n))) +#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \ + (sizeof(struct btrfs_io_stripe) * (n))) + +static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) +{ + if (map && refcount_dec_and_test(&map->refs)) { + ASSERT(RB_EMPTY_NODE(&map->rb_node)); + kfree(map); + } +} struct btrfs_balance_args; struct btrfs_balance_progress; @@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_tree_free(struct extent_map_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, @@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -u64 btrfs_calc_stripe_length(const struct extent_map *em); +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +#endif + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp); +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 188378ca19c7..830f0b6ec89e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1290,7 +1290,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct map_lookup *map) + struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device = map->stripes[zone_idx].dev; @@ -1393,7 +1393,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, } static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1435,7 +1435,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1483,7 +1483,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1515,7 +1515,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1552,9 +1552,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; struct zone_info *zone_info = NULL; @@ -1575,17 +1573,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EIO; } - /* Get the chunk mapping */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); - - if (!em) + map = btrfs_find_chunk_map(fs_info, logical, length); + if (!map) return -EINVAL; - map = em->map_lookup; - - cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); if (!cache->physical_map) { ret = -ENOMEM; goto out; @@ -1687,12 +1679,11 @@ out: spin_unlock(&fs_info->zone_active_bgs_lock); } } else { - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); - free_extent_map(em); return ret; } @@ -2082,7 +2073,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *device; u64 physical; const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); @@ -2194,7 +2185,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; @@ -2643,7 +2634,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) /* Release reservation for currently active block groups. */ spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - struct map_lookup *map = block_group->physical_map; + struct btrfs_chunk_map *map = block_group->physical_map; if (!(block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 279a7a0c90c0..4a95097ab590 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -21,7 +21,7 @@ struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; struct btrfs_block_group; struct btrfs_free_cluster; -struct map_lookup; +struct btrfs_chunk_map; struct extent_buffer; struct btrfs_work; struct btrfs_workqueue; @@ -277,8 +277,7 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, { (1 << EXTENT_FLAG_COMPRESSED), "COMPRESSED" },\ { (1 << EXTENT_FLAG_PREALLOC), "PREALLOC" },\ { (1 << EXTENT_FLAG_LOGGING), "LOGGING" },\ - { (1 << EXTENT_FLAG_FILLING), "FILLING" },\ - { (1 << EXTENT_FLAG_FS_MAPPING), "FS_MAPPING" }) + { (1 << EXTENT_FLAG_FILLING), "FILLING" }) TRACE_EVENT_CONDITION(btrfs_get_extent, @@ -1061,7 +1060,7 @@ DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, DECLARE_EVENT_CLASS(btrfs__chunk, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct map_lookup *map, u64 offset, u64 size), + const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size), @@ -1095,7 +1094,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk, DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct map_lookup *map, u64 offset, u64 size), + const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); @@ -1103,7 +1102,7 @@ DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct map_lookup *map, u64 offset, u64 size), + const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); From 71fca47b644910485c49d1da31bc963cf286fe77 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Nov 2023 13:38:39 +0000 Subject: [PATCH 0836/1562] btrfs: remove stripe size local variable from insert_dev_extents() It's not needed to have a local variable to store the stripe size at insert_dev_extents(), we can just take from the chunk map as it's only used once and typing 'map->stripe_size' is not much more verbose than simply typing 'stripe_size'. So remove the local variable. This was added before the recent addition of a dedicated structure for chunk mappings because the stripe size was encoded in the 'orig_block_len' field of an extent_map structure, so the use of the local variable made things more readable. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0fea258eea15..4365f7b6b94d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2597,7 +2597,6 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk_map *map; u64 dev_offset; - u64 stripe_size; int i; int ret = 0; @@ -2605,8 +2604,6 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, if (IS_ERR(map)) return PTR_ERR(map); - stripe_size = map->stripe_size; - /* * Take the device list mutex to prevent races with the final phase of * a device replace operation that replaces the device object associated @@ -2622,7 +2619,7 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, dev_offset = map->stripes[i].physical; ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, - stripe_size); + map->stripe_size); if (ret) break; } From 516095cdf07af0c7223681079d87e9c42c7cf599 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 14:20:15 +0100 Subject: [PATCH 0837/1562] btrfs: move lockdep class setting out of extent_io_tree_init The per-inode file extent tree was added in 41a2ee75aab0 ("btrfs: introduce per-inode file extent tree"), it's the only tree type that requires the lockdep class. Move it to the file where it is actually used. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 10 ---------- fs/btrfs/inode.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 76061245a46b..56be64e656da 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -78,14 +78,6 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) @@ -95,8 +87,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, spin_lock_init(&tree->lock); tree->inode = NULL; tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 140e9c045cf1..7cb5139e3d7f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,6 +114,15 @@ struct data_reloc_warn { int mirror_num; }; +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -8506,6 +8515,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. */ + lockdep_set_class(&ei->file_extent_tree.lock, &file_extent_tree_class); mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; From ab76c43e7474eafdc95f7d83aa6ab1a53fde01c4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 14:20:17 +0100 Subject: [PATCH 0838/1562] btrfs: drop error message in extent_io_tree insert_state() The helper insert_state errors are handled in all callers and reported by extent_io_tree_panic so we don't need to do it twice. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 56be64e656da..887d9beb7b10 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -442,9 +442,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } node = &(*node)->rb_right; } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, state->end); return ERR_PTR(-EEXIST); } } From 3a97347ea694b6c091513135095128f099b73143 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 14:20:19 +0100 Subject: [PATCH 0839/1562] btrfs: constify fs_info parameter in __btrfs_panic() The printk helpers take const fs_info if it's used just for the identifier in the messages, __btrfs_panic() lacks that. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/messages.c | 2 +- fs/btrfs/messages.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index b8f9c9e56c8c..cdada4865837 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) * panic or BUGs, depending on mount options. */ __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...) { char *s_id = ""; diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4d04c1fa5899..08a9272399d2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error); __printf(5, 6) __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic From 70146f2b093844c656774bfc9a98b79e2177893a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 14:20:21 +0100 Subject: [PATCH 0840/1562] btrfs: enhance extent_io_tree error reports Pass the type of the extent io tree operation which failed in the report helper. The message wording and contents is updated, though locking might be the cause of the error it's probably not the only one and we're interested in the state. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 887d9beb7b10..2d564ead9dbe 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -313,10 +313,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); + "extent io tree error on %s state start %llu end %llu", + opname, state->start, state->end); } static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) @@ -676,7 +680,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -698,7 +702,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); if (wake) wake_up(&state->wq); @@ -1133,7 +1137,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -1181,7 +1185,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, prealloc, "insert", err); } cache_state(inserted_state, cached_state); @@ -1209,7 +1213,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1363,7 +1367,7 @@ hit_next: } err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) goto out; @@ -1411,7 +1415,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, prealloc, "insert", err); } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) @@ -1434,7 +1438,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); From 738290c056e28d83177ecbed3894e094e161939e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Nov 2023 14:20:24 +0100 Subject: [PATCH 0841/1562] btrfs: always set extent_io_tree::inode and drop fs_info The extent_io_tree is embedded in several structures, notably in struct btrfs_inode. The fs_info is only used for reporting errors and for reference in trace points. We can get to the pointer through the inode, but not all io trees set it. However, we always know the owner and can recognize if inode is valid. For access helpers are provided, const variant for the trace points. This reduces size of extent_io_tree by 8 bytes and following structures in turn: - btrfs_inode 1104 -> 1088 - btrfs_device 520 -> 512 - btrfs_root 1360 -> 1344 - btrfs_transaction 456 -> 440 - btrfs_fs_info 3600 -> 3592 - reloc_control 1520 -> 1512 Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 80 ++++++++++++++++++++++++++---------- fs/btrfs/extent-io-tree.h | 18 ++++++-- fs/btrfs/inode.c | 3 ++ fs/btrfs/tests/btrfs-tests.c | 2 +- include/trace/events/btrfs.h | 45 +++++++------------- 5 files changed, 93 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 2d564ead9dbe..dbd201a99693 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct btrfs_inode *inode = tree->inode; + const struct btrfs_inode *inode; u64 isize; - if (!inode) + if (tree->owner != IO_TREE_INODE_IO) return; + inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -79,13 +80,44 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #endif +/* + * The only tree allowed to set the inode is IO_TREE_INODE_IO. + */ +static bool is_inode_io_tree(const struct extent_io_tree *tree) +{ + return tree->owner == IO_TREE_INODE_IO; +} + +/* Return the inode if it's valid for the given tree, otherwise NULL. */ +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* Read-only access to the inode. */ +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* For read-only access to fs_info. */ +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode->root->fs_info; + return tree->fs_info; +} + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) { - tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->inode = NULL; + tree->fs_info = fs_info; tree->owner = owner; } @@ -318,7 +350,7 @@ static void extent_io_tree_panic(const struct extent_io_tree *tree, const char *opname, int err) { - btrfs_panic(tree->fs_info, err, + btrfs_panic(extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", opname, state->start, state->end); } @@ -329,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, prev); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); @@ -344,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, next); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); @@ -378,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_set_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -424,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -436,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -490,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->inode) - btrfs_split_delalloc_extent(tree->inode, orig, split); + if (is_inode_io_tree(tree)) + btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, + split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -538,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_clear_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, + bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 5602b0137fcd..ebe6390d65e9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -87,9 +87,17 @@ enum { struct extent_io_tree { struct rb_root state; - struct btrfs_fs_info *fs_info; - /* Inode associated with this tree, or NULL. */ - struct btrfs_inode *inode; + /* + * The fs_info is needed for trace points, a tree attached to an inode + * needs the inode. + * + * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be + * accessed as inode->root->fs_info + */ + union { + struct btrfs_fs_info *fs_info; + struct btrfs_inode *inode; + }; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -112,6 +120,10 @@ struct extent_state { #endif }; +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7cb5139e3d7f..7320c1ea7926 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8511,8 +8511,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); + + /* This io tree sets the valid inode. */ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; + extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b50cfac7ad4e..709c6cc9706a 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -103,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0); + extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4a95097ab590..856109048999 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2098,17 +2098,12 @@ TRACE_EVENT(btrfs_set_extent_bit, __field( unsigned, set_bits) ), - TP_fast_assign_btrfs(tree->fs_info, - __entry->owner = tree->owner; - if (tree->inode) { - const struct btrfs_inode *inode = tree->inode; + TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree), + const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree); - __entry->ino = btrfs_ino(inode); - __entry->rootid = inode->root->root_key.objectid; - } else { - __entry->ino = 0; - __entry->rootid = 0; - } + __entry->owner = tree->owner; + __entry->ino = inode ? btrfs_ino(inode) : 0; + __entry->rootid = inode ? inode->root->root_key.objectid : 0; __entry->start = start; __entry->len = len; __entry->set_bits = set_bits; @@ -2136,17 +2131,12 @@ TRACE_EVENT(btrfs_clear_extent_bit, __field( unsigned, clear_bits) ), - TP_fast_assign_btrfs(tree->fs_info, - __entry->owner = tree->owner; - if (tree->inode) { - const struct btrfs_inode *inode = tree->inode; + TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree), + const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree); - __entry->ino = btrfs_ino(inode); - __entry->rootid = inode->root->root_key.objectid; - } else { - __entry->ino = 0; - __entry->rootid = 0; - } + __entry->owner = tree->owner; + __entry->ino = inode ? btrfs_ino(inode) : 0; + __entry->rootid = inode ? inode->root->root_key.objectid : 0; __entry->start = start; __entry->len = len; __entry->clear_bits = clear_bits; @@ -2175,17 +2165,12 @@ TRACE_EVENT(btrfs_convert_extent_bit, __field( unsigned, clear_bits) ), - TP_fast_assign_btrfs(tree->fs_info, - __entry->owner = tree->owner; - if (tree->inode) { - const struct btrfs_inode *inode = tree->inode; + TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree), + const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree); - __entry->ino = btrfs_ino(inode); - __entry->rootid = inode->root->root_key.objectid; - } else { - __entry->ino = 0; - __entry->rootid = 0; - } + __entry->owner = tree->owner; + __entry->ino = inode ? btrfs_ino(inode) : 0; + __entry->rootid = inode ? inode->root->root_key.objectid : 0; __entry->start = start; __entry->len = len; __entry->set_bits = set_bits; From cbf44cd93db3a470ead92a938210f41095cea562 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Nov 2023 07:47:15 -0800 Subject: [PATCH 0842/1562] btrfs: rename EXTENT_BUFFER_NO_CHECK to EXTENT_BUFFER_ZONED_ZEROOUT EXTENT_BUFFER_ZONED_ZEROOUT better describes the state of the extent buffer, namely it is written as all zeros. This is needed in zoned mode, to preserve I/O ordering. Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/zoned.c | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 17ec983ea672..21c7835b46ec 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -254,7 +254,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return BLK_STS_IOERR; - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { WARN_ON_ONCE(found_start != 0); return BLK_STS_OK; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01423670bc8a..1f48c7eb233e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5061,7 +5061,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d68626d1c286..85cee54200ce 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4152,7 +4152,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (check_eb_range(eb, start, len)) return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8eac8384b24c..021040b3117e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -28,7 +28,8 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, - EXTENT_BUFFER_NO_CHECK, + /* Indicate the extent buffer is written zeroed out (for zoned) */ + EXTENT_BUFFER_ZONED_ZEROOUT, /* Indicate that extent buffer pages a being read */ EXTENT_BUFFER_READING, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 830f0b6ec89e..f1bcf2ac6180 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1716,7 +1716,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, EXTENT_DIRTY, NULL); From aa6313e6ff2bfbf736a2739047bba355d8241584 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Nov 2023 07:47:16 -0800 Subject: [PATCH 0843/1562] btrfs: zoned: don't clear dirty flag of extent buffer One a zoned filesystem, never clear the dirty flag of an extent buffer, but instead mark it as zeroout. On writeout, when encountering a marked extent_buffer, zero it out. Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ++++++- fs/btrfs/extent_io.c | 16 ++++++++++++++-- fs/btrfs/zoned.c | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 21c7835b46ec..7b55b59115e7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -254,8 +254,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return BLK_STS_IOERR; + /* + * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't + * checksum it but zero-out its content. This is done to preserve + * ordering of I/O without unnecessarily writing out data. + */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { - WARN_ON_ONCE(found_start != 0); + memzero_extent_buffer(eb, 0, eb->len); return BLK_STS_OK; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 85cee54200ce..671010de7964 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3761,6 +3761,20 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (trans && btrfs_header_generation(eb) != trans->transid) return; + /* + * Instead of clearing the dirty flag off of the buffer, mark it as + * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve + * write-ordering in zoned mode, without the need to later re-dirty + * the extent_buffer. + * + * The actual zeroout of the buffer will happen later in + * btree_csum_one_bio. + */ + if (btrfs_is_zoned(fs_info)) { + set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); + return; + } + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; @@ -4152,8 +4166,6 @@ static void __write_extent_buffer(const struct extent_buffer *eb, /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); - if (check_eb_range(eb, start, len)) return; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f1bcf2ac6180..2c7fe6699cf4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1713,7 +1713,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) return; - ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); From 2aae747a4938c2c3c398ff55aa2ddaf51b135899 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Nov 2023 07:47:17 -0800 Subject: [PATCH 0844/1562] btrfs: remove now unneeded btrfs_redirty_list_add Now that we're not clearing the dirty flag off of extent_buffers in zoned mode, all that is left of btrfs_redirty_list_add() is a memzero() and some ASSERT()ions. As we're also memzero()ing the buffer on write-out btrfs_redirty_list_add() has become obsolete and can be removed. Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 +---- fs/btrfs/tree-log.c | 1 - fs/btrfs/zoned.c | 17 ----------------- fs/btrfs/zoned.h | 5 ----- 4 files changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f48c7eb233e..99f98767e201 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3466,10 +3466,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) { - btrfs_redirty_list_add(trans->transaction, buf); + if (!ret) goto out; - } } cache = btrfs_lookup_block_group(fs_info, buf->start); @@ -3500,7 +3498,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { - btrfs_redirty_list_add(trans->transaction, buf); pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d6729d9fd2f..bee065851185 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; - btrfs_redirty_list_add(trans->transaction, eb); } else { unaccount_log_buffer(eb->fs_info, eb->start); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2c7fe6699cf4..910841b6b0a8 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1706,23 +1706,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) cache->zone_unusable = unusable; } -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) -{ - if (!btrfs_is_zoned(eb->fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) - return; - - ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - ASSERT(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); - - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); - set_extent_buffer_dirty(eb); - set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY, NULL); -} - bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b9cec523b778..7bfe1d677310 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -180,9 +178,6 @@ static inline int btrfs_load_block_group_zone_info( static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } -static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) { } - static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; From b0d823840936dd63ae41d93b690288de767849d6 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Nov 2023 07:47:18 -0800 Subject: [PATCH 0845/1562] btrfs: use memset_page instead of opencoding it Use memset_page() in memset_extent_buffer() instead of opencoding it. This does not not change any functionality. Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 671010de7964..cc9a454810d0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4208,7 +4208,7 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c, struct page *page = eb->pages[index]; assert_eb_page_uptodate(eb, page); - memset(page_address(page) + offset, c, cur_len); + memset_page(page, offset, c, cur_len); cur += cur_len; } From 3ba2d3648f9dcd6af6326352bb2775e8b31372e0 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Nov 2023 07:47:19 -0800 Subject: [PATCH 0846/1562] btrfs: reflow btrfs_free_tree_block Reflow btrfs_free_tree_block() so that there is one level of indentation needed. This patch has no functional changes. Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 107 +++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99f98767e201..f396aba92c57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3447,6 +3447,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; + struct btrfs_block_group *bg; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3460,64 +3461,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group *cache; - bool must_pin = false; + if (!last_ref) + return; - if (root_id != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) - goto out; - } + if (btrfs_header_generation(buf) != trans->transid) + goto out; - cache = btrfs_lookup_block_group(fs_info, buf->start); - - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) goto out; - } - - /* - * If there are tree mod log users we may have recorded mod log - * operations for this node. If we re-allocate this node we - * could replay operations on this node that happened when it - * existed in a completely different root. For example if it - * was part of root A, then was reallocated to root B, and we - * are doing a btrfs_old_search_slot(root b), we could replay - * operations that happened when the block was part of root A, - * giving us an inconsistent view of the btree. - * - * We are safe from races here because at this point no other - * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins we will not have an - * existing log of operations on this node that we have to - * contend with. - */ - if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) - must_pin = true; - - if (must_pin || btrfs_is_zoned(fs_info)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } - - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); - - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } + + bg = btrfs_lookup_block_group(fs_info, buf->start); + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; + } + + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ + + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) + || btrfs_is_zoned(fs_info)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(bg, buf->start, buf->len); + btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_put_block_group(bg); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + out: - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - } + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); } /* Can return -ENOMEM */ From 397239ed6a6c88b002fbba0b25ed5a719c578c2f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 16 Nov 2023 15:49:06 +1030 Subject: [PATCH 0847/1562] btrfs: allow extent buffer helpers to skip cross-page handling Currently btrfs extent buffer helpers are doing all the cross-page handling, as there is no guarantee that all those eb pages are contiguous. However on systems with enough memory, there is a very high chance the page cache for btree_inode are allocated with physically contiguous pages. In that case, we can skip all the complex cross-page handling, thus speeding up the code. This patch adds a new member, extent_buffer::addr, which is only set to non-NULL if all the extent buffer pages are physically contiguous. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 17 +++++++++++--- fs/btrfs/extent_io.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 7 ++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7b55b59115e7..125b749d2c6f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,15 +74,26 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; - const int num_pages = num_extent_pages(buf); - const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + int num_pages; + u32 first_page_part; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); + + if (buf->addr) { + /* Pages are contiguous, handle them as a big one. */ + kaddr = buf->addr; + first_page_part = fs_info->nodesize; + num_pages = 1; + } else { + kaddr = page_address(buf->pages[0]); + first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + num_pages = num_extent_pages(buf); + } + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cc9a454810d0..7f7ecee9e048 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3489,6 +3489,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; + bool page_contig = true; int uptodate = 1; int ret; @@ -3575,6 +3576,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; + + /* + * Check if the current page is physically contiguous with previous eb + * page. + */ + if (i && eb->pages[i - 1] + 1 != p) + page_contig = false; + if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) uptodate = 0; @@ -3588,6 +3597,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + /* All pages are physically contiguous, can skip cross page handling. */ + if (page_contig) + eb->addr = page_address(eb->pages[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -4036,6 +4048,11 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } + if (eb->addr) { + memcpy(dstv, eb->addr + start, len); + return; + } + offset = get_eb_offset_in_page(eb, start); while (len > 0) { @@ -4067,6 +4084,12 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); + if (eb->addr) { + if (copy_to_user_nofault(dstv, eb->addr + start, len)) + ret = -EFAULT; + return ret; + } + offset = get_eb_offset_in_page(eb, start); while (len > 0) { @@ -4102,6 +4125,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, if (check_eb_range(eb, start, len)) return -EINVAL; + if (eb->addr) + return memcmp(ptrv, eb->addr + start, len); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { @@ -4169,6 +4195,14 @@ static void __write_extent_buffer(const struct extent_buffer *eb, if (check_eb_range(eb, start, len)) return; + if (eb->addr) { + if (use_memmove) + memmove(eb->addr + start, srcv, len); + else + memcpy(eb->addr + start, srcv, len); + return; + } + offset = get_eb_offset_in_page(eb, start); while (len > 0) { @@ -4201,6 +4235,11 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c, { unsigned long cur = start; + if (eb->addr) { + memset(eb->addr + start, c, len); + return; + } + while (cur < start + len) { unsigned long index = get_eb_page_index(cur); unsigned int offset = get_eb_offset_in_page(eb, cur); @@ -4428,6 +4467,16 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, check_eb_range(dst, src_offset, len)) return; + if (dst->addr) { + const bool use_memmove = areas_overlap(src_offset, dst_offset, len); + + if (use_memmove) + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + else + memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (cur_off < len) { unsigned long cur_src = cur_off + src_offset; unsigned long pg_index = get_eb_page_index(cur_src); @@ -4460,6 +4509,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } + if (dst->addr) { + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (len > 0) { unsigned long src_i; size_t cur; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 021040b3117e..c2c6bfba63c0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -78,6 +78,13 @@ struct extent_buffer { unsigned long len; unsigned long bflags; struct btrfs_fs_info *fs_info; + + /* + * The address where the eb can be accessed without any cross-page handling. + * This can be NULL if not possible. + */ + void *addr; + spinlock_t refs_lock; atomic_t refs; int read_mirror; From 3c0e918b8fb3a6a7da1558913302a3e89cf87343 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 23 Nov 2023 23:53:51 +0000 Subject: [PATCH 0848/1562] btrfs: remove no longer used EXTENT_MAP_DELALLOC block start value After commit ac3c0d36a2a2 ("btrfs: make fiemap more efficient and accurate reporting extent sharedness") we no longer need to create special extent maps during fiemap that have a block start with the EXTENT_MAP_DELALLOC value. So this block start value for extent maps is no longer used since then, therefore remove it. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 3 +-- fs/btrfs/extent_map.c | 3 --- fs/btrfs/extent_map.h | 2 -- include/trace/events/btrfs.h | 3 +-- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5244561e2016..9bcb60c68c58 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -996,9 +996,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, em->len <= inode->root->fs_info->max_inline) goto next; - /* Skip hole/delalloc/preallocated extents */ + /* Skip holes and preallocated extents. */ if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c956b1ced69f..80f86503a5cd 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -212,9 +212,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (!list_empty(&prev->list) || !list_empty(&next->list)) return 0; - ASSERT(next->block_start != EXTENT_MAP_DELALLOC && - prev->block_start != EXTENT_MAP_DELALLOC); - if (extent_map_end(prev) == next->start && prev->flags == next->flags && ((next->block_start == EXTENT_MAP_HOLE && diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index bae14af197ef..66f8dd26487b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -9,8 +9,6 @@ #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) -/* used only during fiemap calls */ -#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ enum { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 856109048999..31da1456f953 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -265,8 +265,7 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, __print_symbolic_u64(type, \ { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ { EXTENT_MAP_HOLE, "HOLE" }, \ - { EXTENT_MAP_INLINE, "INLINE" }, \ - { EXTENT_MAP_DELALLOC, "DELALLOC" }) + { EXTENT_MAP_INLINE, "INLINE" }) #define show_map_type(type) \ type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) From f67d922edb4e95a4a56d07d5d40a76dd4f23a85b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 12:17:37 -0500 Subject: [PATCH 0849/1562] fs: indicate request originates from old mount API We already communicate to filesystems when a remount request comes from the old mount API as some filesystems choose to implement different behavior in the new mount API than the old mount API to e.g., take the chance to fix significant API bugs. Allow the same for regular mount requests. Fixes: b330966f79fb ("fuse: reject options on reconfigure via fsconfig(2)") Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/namespace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..6c39ec020a5f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2875,7 +2875,12 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + /* + * Indicate to the filesystem that the remount request is coming + * from the legacy mount system call. + */ fc->oldapi = true; + err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); @@ -3324,6 +3329,12 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + /* + * Indicate to the filesystem that the mount request is coming + * from the legacy mount system call. + */ + fc->oldapi = true; + if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype, strlen(subtype)); From 2b41b19dd6d063a3dca8c1f855a056515f0f678d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:38 -0500 Subject: [PATCH 0850/1562] btrfs: split out the mount option validation code into its own helper We're going to need to validate mount options after they're all parsed with the new mount API, split this code out into its own helper so we can use it when we swap over to the new mount API. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ minor adjustments in the messages ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 66 +++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef256b944c72..2be3ae63b153 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -236,6 +236,41 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, return false; } +static bool check_options(struct btrfs_fs_info *info, unsigned long flags) +{ + bool ret = true; + + if (!(flags & SB_RDONLY) && + (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) + ret = false; + + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free-space-tree"); + ret = false; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); + ret = false; + } + + if (btrfs_check_mountopts_zoned(info)) + ret = false; + + if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { + if (btrfs_test_opt(info, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_test_opt(info, FREE_SPACE_TREE)) + btrfs_info(info, "using free-space-tree"); + } + + return ret; +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -314,7 +349,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, int saved_compress_level; bool saved_compress_force; int no_compress = 0; - const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); @@ -333,7 +367,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * against new flags */ if (!options) - goto check; + goto out; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -777,35 +811,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; } } -check: - /* We're read-only, don't have to check. */ - if (new_flags & SB_RDONLY) - goto out; - - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) - ret = -EINVAL; out: - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, CLEAR_CACHE)) { - btrfs_err(info, "cannot disable free space tree"); + if (!ret && !check_options(info, new_flags)) ret = -EINVAL; - } - if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); - ret = -EINVAL; - } - if (!ret) - ret = btrfs_check_mountopts_zoned(info); - if (!ret && !remounting) { - if (btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); - } return ret; } From 6207c9e3c2059530e4f9b885c61ef2fb4e200036 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:39 -0500 Subject: [PATCH 0851/1562] btrfs: set default compress type at btrfs_init_fs_info time With the new mount API we'll be setting our compression well before we call open_ctree. We don't want to overwrite our settings, so set the default in btrfs_init_fs_info instead of open_ctree. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 125b749d2c6f..969887b2f8fe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2807,6 +2807,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + /* Default compress algorithm when user does -o compress */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -3289,13 +3292,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) WRITE_ONCE(fs_info->fs_error, -EUCLEAN); - /* - * In the long term, we'll store the compression type in the super - * block, and it'll be used for per file compression control. - */ - fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - - /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); From a6a8f22a4af6c572d9e01ca9f7b515bf0cbb63b1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:40 -0500 Subject: [PATCH 0852/1562] btrfs: move space cache settings into open_ctree Currently we pre-load the space cache settings in btrfs_parse_options, however when we switch to the new mount API the mount option parsing will happen before we have the super block loaded. Add a helper to set the appropriate options based on the fs settings, this will allow us to have consistent free space cache settings. This also folds in the space cache related decisions we make for subpage sectorsize support, so all of this is done in one place. Since this was being called by parse options it looks like we're changing the behavior of remount, but in fact we aren't. The pre-loading of the free space cache settings is done because we want to handle the case of users not using any space_cache options, we'll derive the appropriate mount option based on the on disk state. On remount this wouldn't reset anything as we'll have cleared the v1 cache generation if we mounted -o nospace_cache. Similarly it's impossible to turn off the free space tree without specifically saying -o nospace_cache,clear_cache, which will delete the free space tree and clear the compat_ro option. Again in this case calling this code in remount wouldn't result in any change. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 17 +++++--------- fs/btrfs/super.c | 56 +++++++++++++++++++++++++++++++++++----------- fs/btrfs/super.h | 1 + 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 969887b2f8fe..33e48e3865c5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3305,6 +3305,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + /* + * Handle the space caching options appropriately now that we have the + * super block loaded and validated. + */ + btrfs_set_free_space_cache_settings(fs_info); + ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) goto fail_alloc; @@ -3316,17 +3322,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; - /* - * V1 space cache has some hardcoded PAGE_SIZE usage, and is - * going to be deprecated. - * - * Force to use v2 cache for subpage case. - */ - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(fs_info, FREE_SPACE_TREE, - "forcing free space tree for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2be3ae63b153..332d6d2c9376 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -271,6 +271,43 @@ static bool check_options(struct btrfs_fs_info *info, unsigned long flags) return ret; } +/* + * This is subtle, we only call this during open_ctree(). We need to pre-load + * the mount options with the on-disk settings. Before the new mount API took + * effect we would do this on mount and remount. With the new mount API we'll + * only do this on the initial mount. + * + * This isn't a change in behavior, because we're using the current state of the + * file system to set the current mount options. If you mounted with special + * options to disable these features and then remounted we wouldn't revert the + * settings, because mounting without these features cleared the on-disk + * settings, so this being called on re-mount is not needed. + */ +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(fs_info)) { + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, + "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(fs_info->super_copy, 0); + } else { + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } + } + + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, + "forcing free space tree for sector size %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + } + } +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -350,18 +387,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, bool saved_compress_force; int no_compress = 0; - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) - btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (btrfs_free_space_cache_v1_active(info)) { - if (btrfs_is_zoned(info)) { - btrfs_info(info, - "zoned: clearing existing space cache"); - btrfs_set_super_cache_generation(info->super_copy, 0); - } else { - btrfs_set_opt(info->mount_opt, SPACE_CACHE); - } - } - /* * Even the options are empty, we still need to do extra check * against new flags @@ -654,8 +679,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * compat_ro(FREE_SPACE_TREE) set, and we aren't going * to allow v1 to be set for extent tree v2, simply * ignore this setting if we're extent tree v2. + * + * For subpage blocksize we don't allow space cache v1, + * and we'll turn on v2, so we can skip the settings + * here as well. */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + if (btrfs_fs_incompat(info, EXTENT_TREE_V2) || + info->sectorsize < PAGE_SIZE) break; if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 8dbb909b364f..7c1cd7527e76 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -8,6 +8,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { From 272efa308fb6bfc7b04a4b6f6dde7b0431b51fee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:41 -0500 Subject: [PATCH 0853/1562] btrfs: do not allow free space tree rebuild on extent tree v2 We currently don't allow these options to be set if we're extent tree v2 via the mount option parsing. However when we switch to the new mount API we'll no longer have the super block loaded, so won't be able to make this distinction at mount option parsing time. Address this by checking for extent tree v2 at the point where we make the decision to rebuild the free space tree. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 33e48e3865c5..6fcb9390913e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2969,7 +2969,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - rebuild_free_space_tree = true; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + btrfs_warn(fs_info, + "'clear_cache' option is ignored with extent tree v2"); + else + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); From 9ef40c2e9b26bbf9b2110003107e46dabfd4e7dd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:42 -0500 Subject: [PATCH 0854/1562] btrfs: split out ro->rw and rw->ro helpers into their own functions When we remount ro->rw or rw->ro we have some cleanup tasks that have to be managed. Split these out into their own function to make btrfs_remount smaller. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 229 ++++++++++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 113 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 332d6d2c9376..53d6d8f054ff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1676,6 +1676,115 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } +static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) +{ + int ret; + + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "remounting read-write after error is not allowed"); + return -EINVAL; + } + + if (fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + return -EACCES; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); + return -EINVAL; + } + + /* + * NOTE: when remounting with a change that does writes, don't put it + * anywhere above this point, as we are not sure to be safe to write + * until we pass the above checks. + */ + ret = btrfs_start_pre_rw_mount(fs_info); + if (ret) + return ret; + + btrfs_clear_sb_rdonly(fs_info->sb); + + set_bit(BTRFS_FS_OPEN, &fs_info->flags); + + /* + * If we've gone from readonly -> read-write, we need to get our + * sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); + + return 0; +} + +static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) +{ + /* + * This also happens on 'umount -rf' or on shutdown, when the + * filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); + + btrfs_discard_cleanup(fs_info); + + /* Wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* Avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + + btrfs_set_sb_rdonly(fs_info->sb); + + /* + * Setting SB_RDONLY will put the cleaner thread to sleep at the next + * loop if it's already active. If it's already asleep, we'll leave + * unused block groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + btrfs_delete_unused_bgs(fs_info); + + /* + * The cleaner task could be already running before we set the flag + * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make + * sure that after we finish the remount, i.e. after we call + * btrfs_commit_super(), the cleaner can no longer start a transaction + * - either because it was dropping a dead root, running delayed iputs + * or deleting an unused block group (the cleaner picked a block + * group from the list of unused block groups before we were able to + * in the previous call to btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); + + /* + * We've set the superblock to RO mode, so we might have made the + * cleaner task sleep without running all pending delayed iputs. Go + * through all the delayed iputs here, so that if an unmount happens + * without remounting RW we don't end up at finishing close_ctree() + * with a non-empty list of delayed iputs. + */ + btrfs_run_delayed_iputs(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); + + /* + * Pause the qgroup rescan worker if it is running. We don't want it to + * be still running after we are in RO mode, as after that, by the time + * we unmount, it might have left a transaction open, so we would leak + * the transaction and/or crash. + */ + btrfs_qgroup_wait_for_completion(fs_info, false); + + return btrfs_commit_super(fs_info); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1729,120 +1838,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) - goto out; + ret = 0; + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; - if (*flags & SB_RDONLY) { - /* - * this also happens on 'umount -rf' or on shutdown, when - * the filesystem is busy. - */ - cancel_work_sync(&fs_info->async_reclaim_work); - cancel_work_sync(&fs_info->async_data_reclaim_work); - - btrfs_discard_cleanup(fs_info); - - /* wait for the uuid_scan task to finish */ - down(&fs_info->uuid_tree_rescan_sem); - /* avoid complains from lockdep et al. */ - up(&fs_info->uuid_tree_rescan_sem); - - btrfs_set_sb_rdonly(sb); - - /* - * Setting SB_RDONLY will put the cleaner thread to - * sleep at the next loop if it's already active. - * If it's already asleep, we'll leave unused block - * groups on disk until we're mounted read-write again - * unless we clean them up here. - */ - btrfs_delete_unused_bgs(fs_info); - - /* - * The cleaner task could be already running before we set the - * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). - * We must make sure that after we finish the remount, i.e. after - * we call btrfs_commit_super(), the cleaner can no longer start - * a transaction - either because it was dropping a dead root, - * running delayed iputs or deleting an unused block group (the - * cleaner picked a block group from the list of unused block - * groups before we were able to in the previous call to - * btrfs_delete_unused_bgs()). - */ - wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, - TASK_UNINTERRUPTIBLE); - - /* - * We've set the superblock to RO mode, so we might have made - * the cleaner task sleep without running all pending delayed - * iputs. Go through all the delayed iputs here, so that if an - * unmount happens without remounting RW we don't end up at - * finishing close_ctree() with a non-empty list of delayed - * iputs. - */ - btrfs_run_delayed_iputs(fs_info); - - btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(fs_info); - btrfs_pause_balance(fs_info); - - /* - * Pause the qgroup rescan worker if it is running. We don't want - * it to be still running after we are in RO mode, as after that, - * by the time we unmount, it might have left a transaction open, - * so we would leak the transaction and/or crash. - */ - btrfs_qgroup_wait_for_completion(fs_info, false); - - ret = btrfs_commit_super(fs_info); - if (ret) - goto restore; - } else { - if (BTRFS_FS_ERROR(fs_info)) { - btrfs_err(fs_info, - "Remounting read-write after error is not allowed"); - ret = -EINVAL; - goto restore; - } - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } - - if (!btrfs_check_rw_degradable(fs_info, NULL)) { - btrfs_warn(fs_info, - "too many missing devices, writable remount is not allowed"); - ret = -EACCES; - goto restore; - } - - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - btrfs_warn(fs_info, - "mount required to replay tree-log, cannot remount read-write"); - ret = -EINVAL; - goto restore; - } - - /* - * NOTE: when remounting with a change that does writes, don't - * put it anywhere above this point, as we are not sure to be - * safe to write until we pass the above checks. - */ - ret = btrfs_start_pre_rw_mount(fs_info); - if (ret) - goto restore; - - btrfs_clear_sb_rdonly(sb); - - set_bit(BTRFS_FS_OPEN, &fs_info->flags); - - /* - * If we've gone from readonly -> read/write, we need to get - * our sync/async discard lists in the right state. - */ - btrfs_discard_resume(fs_info); - } -out: /* * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, * since the absence of the flag means it can be toggled off by remount. From 2496bff6e53d3ad0541a5a3f720c3f7924bb2550 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:43 -0500 Subject: [PATCH 0855/1562] btrfs: add a NOSPACECACHE mount option flag With the old mount API we'd pre-populate the mount options with the space cache settings of the file system, and then the user toggled them on or off with the mount options. When we switch to the new mount API the mount options will be set before we get into opening the file system, so we need a flag to indicate that the user explicitly asked for -o nospace_cache so we can make the appropriate changes after the fact. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/fs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6fcb9390913e..4bac16d74179 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2955,6 +2955,7 @@ void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); } /* diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a3debac2819a..e6f7ee85032e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -188,6 +188,7 @@ enum { BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), BTRFS_MOUNT_NODISCARD = (1UL << 29), + BTRFS_MOUNT_NOSPACECACHE = (1UL << 30), }; /* From 15ddcdd34ebfe7ab58ff4ef4199fd5796da6a6e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:44 -0500 Subject: [PATCH 0856/1562] btrfs: add fs_parameter definitions In order to convert to the new mount API we have to change how we do the mount option parsing. For now we're going to duplicate these helpers to make it easier to follow, and then remove the old code once everything is in place. This patch contains the re-definition of all of our mount options into the new fs_parameter_spec format. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 53d6d8f054ff..373ffeba5dcc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -135,7 +136,7 @@ enum { /* Debugging options */ Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG - Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, + Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, @@ -225,6 +226,129 @@ static const match_table_t rescue_tokens = { {Opt_err, NULL}, }; +enum { + Opt_fatal_errors_panic, + Opt_fatal_errors_bug, +}; + +static const struct constant_table btrfs_parameter_fatal_errors[] = { + { "panic", Opt_fatal_errors_panic }, + { "bug", Opt_fatal_errors_bug }, + {} +}; + +enum { + Opt_discard_sync, + Opt_discard_async, +}; + +static const struct constant_table btrfs_parameter_discard[] = { + { "sync", Opt_discard_sync }, + { "async", Opt_discard_async }, + {} +}; + +enum { + Opt_space_cache_v1, + Opt_space_cache_v2, +}; + +static const struct constant_table btrfs_parameter_space_cache[] = { + { "v1", Opt_space_cache_v1 }, + { "v2", Opt_space_cache_v2 }, + {} +}; + +enum { + Opt_rescue_usebackuproot, + Opt_rescue_nologreplay, + Opt_rescue_ignorebadroots, + Opt_rescue_ignoredatacsums, + Opt_rescue_parameter_all, +}; + +static const struct constant_table btrfs_parameter_rescue[] = { + { "usebackuproot", Opt_rescue_usebackuproot }, + { "nologreplay", Opt_rescue_nologreplay }, + { "ignorebadroots", Opt_rescue_ignorebadroots }, + { "ibadroots", Opt_rescue_ignorebadroots }, + { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "idatacsums", Opt_rescue_ignoredatacsums }, + { "all", Opt_rescue_parameter_all }, + {} +}; + +#ifdef CONFIG_BTRFS_DEBUG +enum { + Opt_fragment_parameter_data, + Opt_fragment_parameter_metadata, + Opt_fragment_parameter_all, +}; + +static const struct constant_table btrfs_parameter_fragment[] = { + { "data", Opt_fragment_parameter_data }, + { "metadata", Opt_fragment_parameter_metadata }, + { "all", Opt_fragment_parameter_all }, + {} +}; +#endif + +static const struct fs_parameter_spec btrfs_fs_parameters[] __maybe_unused = { + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("autodefrag", Opt_defrag), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("clear_cache", Opt_clear_cache), + fsparam_u32("commit", Opt_commit_interval), + fsparam_flag("compress", Opt_compress), + fsparam_string("compress", Opt_compress_type), + fsparam_flag("compress-force", Opt_compress_force), + fsparam_string("compress-force", Opt_compress_force_type), + fsparam_flag_no("datacow", Opt_datacow), + fsparam_flag_no("datasum", Opt_datasum), + fsparam_flag("degraded", Opt_degraded), + fsparam_string("device", Opt_device), + fsparam_flag_no("discard", Opt_discard), + fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), + fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), + fsparam_flag_no("flushoncommit", Opt_flushoncommit), + fsparam_flag_no("inode_cache", Opt_inode_cache), + fsparam_string("max_inline", Opt_max_inline), + fsparam_u32("metadata_ratio", Opt_ratio), + fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), + fsparam_flag("skip_balance", Opt_skip_balance), + fsparam_flag_no("space_cache", Opt_space_cache), + fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), + fsparam_flag_no("ssd", Opt_ssd), + fsparam_flag_no("ssd_spread", Opt_ssd_spread), + fsparam_string("subvol", Opt_subvol), + fsparam_flag("subvol=", Opt_subvol_empty), + fsparam_u64("subvolid", Opt_subvolid), + fsparam_u32("thread_pool", Opt_thread_pool), + fsparam_flag_no("treelog", Opt_treelog), + fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), + + /* Rescue options. */ + fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), + /* Deprecated, with alias rescue=nologreplay */ + __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), + /* Deprecated, with alias rescue=usebackuproot */ + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), + + /* Deprecated options. */ + __fsparam(NULL, "recovery", Opt_recovery, + fs_param_neg_with_no | fs_param_deprecated, NULL), + + /* Debugging options. */ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), +#ifdef CONFIG_BTRFS_DEBUG + fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + fsparam_flag("ref_verify", Opt_ref_verify), +#endif + {} +}; + static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, const char *opt_name) { From 17b3612022fe533e70c0a83ea7634069e5ce33f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:45 -0500 Subject: [PATCH 0857/1562] btrfs: add parse_param callback for the new mount API The parse_param callback handles one parameter at a time, take our existing mount option parsing loop and adjust it to handle one parameter at a time, and tie it into the fs_context_operations. Create a btrfs_fs_context object that will store the various mount properties, we'll house this in fc->fs_private. This is necessary to separate because remounting will use ->reconfigure, and we'll get a new copy of the parsed parameters, so we can no longer directly mess with the fs_info in this stage. In the future we'll add this to the btrfs_fs_info and update the users to use the new context object instead. There's a change how the option device= is processed. Previously all mount options were parsed in one go under uuid_mutex and the devices opened. This prevented a concurrent scan to happen during mount. Now we could see a device scan happen (e.g. by udev) but this should not affect the end result, mount will either see the populated fs_devices or will scan the device by itself. Alternatively we could save all the device paths first and then process them in one go as before but this does not seem to be necessary. Reviewed-by: Johannes Thumshirn Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note about device scanning ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 380 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 373ffeba5dcc..2f981fb87520 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -87,6 +87,19 @@ static void btrfs_put_super(struct super_block *sb) close_ctree(fs_info); } +/* Store the mount options related information. */ +struct btrfs_fs_context { + char *subvol_name; + u64 subvol_objectid; + u64 max_inline; + u32 commit_interval; + u32 metadata_ratio; + u32 thread_pool_size; + unsigned long mount_opt; + unsigned long compress_type:4; + unsigned int compress_level; +}; + enum { Opt_acl, Opt_noacl, Opt_clear_cache, @@ -349,6 +362,369 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] __maybe_unused = { {} }; +static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, btrfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_degraded: + btrfs_set_opt(ctx->mount_opt, DEGRADED); + break; + case Opt_subvol_empty: + /* + * This exists because we used to allow it on accident, so we're + * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow + * empty subvol= again"). + */ + break; + case Opt_subvol: + kfree(ctx->subvol_name); + ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); + if (!ctx->subvol_name) + return -ENOMEM; + break; + case Opt_subvolid: + ctx->subvol_objectid = result.uint_64; + + /* subvolid=0 means give me the original fs_tree. */ + if (!ctx->subvol_objectid) + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; + break; + case Opt_device: { + struct btrfs_device *device; + blk_mode_t mode = sb_open_mode(fc->sb_flags); + + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(param->string, mode, false); + mutex_unlock(&uuid_mutex); + if (IS_ERR(device)) + return PTR_ERR(device); + break; + } + case Opt_datasum: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } + break; + case Opt_datacow: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(ctx->mount_opt, NODATACOW); + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + } + break; + case Opt_compress_force: + case Opt_compress_force_type: + btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); + fallthrough; + case Opt_compress: + case Opt_compress_type: + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zlib", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "lzo", 3) == 0) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zstd", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "no", 2) == 0) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", + param->string); + return -EINVAL; + } + break; + case Opt_ssd: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSSD); + btrfs_clear_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_ssd_spread: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_barrier: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOBARRIER); + else + btrfs_clear_opt(ctx->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + if (result.uint_32 == 0) { + btrfs_err(NULL, "invalid value 0 for thread_pool"); + return -EINVAL; + } + ctx->thread_pool_size = result.uint_32; + break; + case Opt_max_inline: + ctx->max_inline = memparse(param->string, NULL); + break; + case Opt_acl: + if (result.negated) { + fc->sb_flags &= ~SB_POSIXACL; + } else { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + btrfs_err(NULL, "support for ACL not compiled in"); + return -EINVAL; +#endif + } + /* + * VFS limits the ability to toggle ACL on and off via remount, + * despite every file system allowing this. This seems to be + * an oversight since we all do, but it'll fail if we're + * remounting. So don't set the mask here, we'll check it in + * btrfs_reconfigure and do the toggling ourselves. + */ + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + fc->sb_flags_mask |= SB_POSIXACL; + break; + case Opt_treelog: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOTREELOG); + else + btrfs_clear_opt(ctx->mount_opt, NOTREELOG); + break; + case Opt_recovery: + /* + * -o recovery used to be an alias for usebackuproot, and then + * norecovery was an alias for nologreplay, hence the different + * behaviors for negated and not. + */ + if (result.negated) { + btrfs_warn(NULL, + "'norecovery' is deprecated, use 'rescue=nologreplay' instead"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + } else { + btrfs_warn(NULL, + "'recovery' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + } + break; + case Opt_nologreplay: + btrfs_warn(NULL, + "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); + else + btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + ctx->metadata_ratio = result.uint_32; + break; + case Opt_discard: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, NODISCARD); + } else { + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + } + break; + case Opt_discard_mode: + switch (result.uint_32) { + case Opt_discard_sync: + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + break; + case Opt_discard_async: + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); + break; + default: + btrfs_err(NULL, "unrecognized discard mode value %s", + param->key); + return -EINVAL; + } + btrfs_clear_opt(ctx->mount_opt, NODISCARD); + break; + case Opt_space_cache: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + } else { + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + } + break; + case Opt_space_cache_version: + switch (result.uint_32) { + case Opt_space_cache_v1: + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + break; + case Opt_space_cache_v2: + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); + break; + default: + btrfs_err(NULL, "unrecognized space_cache value %s", + param->key); + return -EINVAL; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_inode_cache: + btrfs_warn(NULL, + "the 'inode_cache' option is deprecated and has no effect since 5.11"); + break; + case Opt_clear_cache: + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); + else + btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); + else + btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); + break; + case Opt_usebackuproot: + btrfs_warn(NULL, + "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + break; + case Opt_skip_balance: + btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); + break; + case Opt_fatal_errors: + switch (result.uint_32) { + case Opt_fatal_errors_panic: + btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); + break; + case Opt_fatal_errors_bug: + btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); + break; + default: + btrfs_err(NULL, "unrecognized fatal_errors value %s", + param->key); + return -EINVAL; + } + break; + case Opt_commit_interval: + ctx->commit_interval = result.uint_32; + if (ctx->commit_interval == 0) + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + break; + case Opt_rescue: + switch (result.uint_32) { + case Opt_rescue_usebackuproot: + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + break; + case Opt_rescue_nologreplay: + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_rescue_ignorebadroots: + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + break; + case Opt_rescue_ignoredatacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + break; + case Opt_rescue_parameter_all: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + default: + btrfs_info(NULL, "unrecognized rescue option '%s'", + param->key); + return -EINVAL; + } + break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment: + switch (result.uint_32) { + case Opt_fragment_parameter_all: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_parameter_metadata: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_parameter_data: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + break; + default: + btrfs_info(NULL, "unrecognized fragment option '%s'", + param->key); + return -EINVAL; + } + break; +#endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_set_opt(ctx->mount_opt, REF_VERIFY); + break; +#endif + default: + btrfs_err(NULL, "unrecognized mount option '%s'", param->key); + return -EINVAL; + } + + return 0; +} + static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, const char *opt_name) { @@ -2266,6 +2642,10 @@ static void btrfs_kill_super(struct super_block *sb) btrfs_free_fs_info(fs_info); } +static const struct fs_context_operations btrfs_fs_context_ops __maybe_unused = { + .parse_param = btrfs_parse_param, +}; + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", From 0f85e244dfc5c22cb5e115ccad651df65e6fd68a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:46 -0500 Subject: [PATCH 0858/1562] btrfs: add fs context handling functions We are going to use the fs context to hold the mount options, so allocate the btrfs_fs_context when we're asked to init the fs context, and free it in the free callback. Reviewed-by: Christian Brauner Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2f981fb87520..78e6e4c30124 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2642,10 +2642,44 @@ static void btrfs_kill_super(struct super_block *sb) btrfs_free_fs_info(fs_info); } -static const struct fs_context_operations btrfs_fs_context_ops __maybe_unused = { +static void btrfs_free_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + + if (!ctx) + return; + + kfree(ctx->subvol_name); + kfree(ctx); +} + +static const struct fs_context_operations btrfs_fs_context_ops = { .parse_param = btrfs_parse_param, + .free = btrfs_free_fs_context, }; +static int __maybe_unused btrfs_init_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; +#ifndef CONFIG_BTRFS_FS_POSIX_ACL + ctx->noacl = true; +#endif + + fc->fs_private = ctx; + fc->ops = &btrfs_fs_context_ops; + + return 0; +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", From eddb1a433f2631ef211b3253ba7e7aba20310ebc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:47 -0500 Subject: [PATCH 0859/1562] btrfs: add reconfigure callback for fs_context This is what is used to remount the file system with the new mount API. Because the mount options are parsed separately and one at a time I've added a helper to emit the mount options after the fact once the mount is configured, this matches the dmesg output for what happens with the old mount API. Reviewed-by: Johannes Thumshirn Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 206 ++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/zoned.c | 15 ++-- fs/btrfs/zoned.h | 5 +- 3 files changed, 197 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 78e6e4c30124..7c46acb4b2e4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -725,10 +725,11 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, +static bool check_ro_option(struct btrfs_fs_info *fs_info, + unsigned long mount_opt, unsigned long opt, const char *opt_name) { - if (fs_info->mount_opt & opt) { + if (mount_opt & opt) { btrfs_err(fs_info, "%s must be used with ro mount option", opt_name); return true; @@ -736,35 +737,36 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, return false; } -static bool check_options(struct btrfs_fs_info *info, unsigned long flags) +static bool check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags) { bool ret = true; if (!(flags & SB_RDONLY) && - (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) + (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) ret = false; if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, CLEAR_CACHE)) { + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { btrfs_err(info, "cannot disable free-space-tree"); ret = false; } if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE)) { + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); ret = false; } - if (btrfs_check_mountopts_zoned(info)) + if (btrfs_check_mountopts_zoned(info, mount_opt)) ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { - if (btrfs_test_opt(info, SPACE_CACHE)) + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) btrfs_info(info, "disk space caching is enabled"); - if (btrfs_test_opt(info, FREE_SPACE_TREE)) + if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) btrfs_info(info, "using free-space-tree"); } @@ -1342,7 +1344,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } } out: - if (!ret && !check_options(info, new_flags)) + if (!ret && !check_options(info, &info->mount_opt, new_flags)) ret = -EINVAL; return ret; } @@ -2378,6 +2380,166 @@ restore: return ret; } +static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + fs_info->max_inline = ctx->max_inline; + fs_info->commit_interval = ctx->commit_interval; + fs_info->metadata_ratio = ctx->metadata_ratio; + fs_info->thread_pool_size = ctx->thread_pool_size; + fs_info->mount_opt = ctx->mount_opt; + fs_info->compress_type = ctx->compress_type; + fs_info->compress_level = ctx->compress_level; +} + +static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + ctx->max_inline = fs_info->max_inline; + ctx->commit_interval = fs_info->commit_interval; + ctx->metadata_ratio = fs_info->metadata_ratio; + ctx->thread_pool_size = fs_info->thread_pool_size; + ctx->mount_opt = fs_info->mount_opt; + ctx->compress_type = fs_info->compress_type; + ctx->compress_level = fs_info->compress_level; +} + +#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old) +{ + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); + btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); + btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log"); + btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time"); + btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit"); + btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard"); + btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard"); + btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree"); + btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching"); + btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache"); + btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag"); + btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data"); + btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata"); + btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification"); + btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); + btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); + btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); + + btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); + btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); + btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); + btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); + btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); + btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag"); + btrfs_info_if_unset(info, old, COMPRESS, "use no compression"); + + /* Did the compression settings change? */ + if (btrfs_test_opt(info, COMPRESS) && + (!old || + old->compress_type != info->compress_type || + old->compress_level != info->compress_level || + (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) && + btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) { + const char *compress_type = btrfs_compress_type2str(info->compress_type); + + btrfs_info(info, "%s %s compression, level %d", + btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use", + compress_type, info->compress_level); + } + + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + btrfs_info(info, "max_inline set to %llu", info->max_inline); +} + +static int btrfs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_context old_ctx; + int ret = 0; + + btrfs_info_to_ctx(fs_info, &old_ctx); + + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (!check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); + if (ret < 0) + return ret; + + btrfs_ctx_to_info(fs_info, ctx); + btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); + btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, + old_ctx.thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from RO to RW"); + /* Make sure free space cache options match the state on disk. */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } + } + + ret = 0; + if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; + + /* + * If we set the mask during the parameter parsing VFS would reject the + * remount. Here we can set the mask and the value will be updated + * appropriately. + */ + if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) + fc->sb_flags_mask |= SB_POSIXACL; + + btrfs_emit_options(fs_info, &old_ctx); + wake_up_process(fs_info->transaction_kthread); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); + btrfs_clear_oneshot_options(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + return 0; +restore: + btrfs_ctx_to_info(fs_info, &old_ctx); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + return ret; +} + /* Used to sort the devices by max_avail(descending sort) */ static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { @@ -2655,6 +2817,7 @@ static void btrfs_free_fs_context(struct fs_context *fc) static const struct fs_context_operations btrfs_fs_context_ops = { .parse_param = btrfs_parse_param, + .reconfigure = btrfs_reconfigure, .free = btrfs_free_fs_context, }; @@ -2666,17 +2829,18 @@ static int __maybe_unused btrfs_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; - ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; -#ifndef CONFIG_BTRFS_FS_POSIX_ACL - ctx->noacl = true; -#endif - fc->fs_private = ctx; fc->ops = &btrfs_fs_context_ops; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); + } else { + ctx->thread_pool_size = + min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + return 0; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 910841b6b0a8..12066afc235c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -781,7 +781,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. */ - ret = btrfs_check_mountopts_zoned(fs_info); + ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; @@ -789,7 +789,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; @@ -798,18 +798,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. */ - if (btrfs_test_opt(info, SPACE_CACHE)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } - if (btrfs_test_opt(info, NODATACOW)) { + if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } - btrfs_clear_and_info(info, DISCARD_ASYNC, - "zoned: async discard ignored and disabled for zoned mode"); + if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { + btrfs_info(info, + "zoned: async discard ignored and disabled for zoned mode"); + btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); + } return 0; } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7bfe1d677310..74e660eec20e 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, @@ -121,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) return -EOPNOTSUPP; } -static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, + unsigned long *mount_opt) { return 0; } From 3bb17a25bcb09abbd667c6ac86c7c9109ae82bcd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:48 -0500 Subject: [PATCH 0860/1562] btrfs: add get_tree callback for new mount API This is the actual mounting callback for the new mount API. Implement this using our current fill super as a guideline, making the appropriate adjustments for the new mount API. Our old mount operation had two fs_types, one to handle the actual opening, and the one that we called to handle the actual opening and then did the subvol lookup for returning the actual root dentry. This is mirrored here, but simply with different behaviors for ->get_tree. We use the existence of ->s_fs_info to tell which part we're in. The initial call allocates the fs_info, then call mount_fc() with a duplicated fc to do the actual open_ctree part. Then we take that vfsmount and use it to look up our subvolume that we're mounting and return that as our s_root. This idea was taken from Christians attempt to convert us to the new mount API [1]. In btrfs_get_tree_super() the mount device is scanned and opened in one go under uuid_mutex we expect that all related devices have been already scanned, either by mount or from the outside. A device forget can be called on some of the devices as the whole context is not protected but it's an unlikely event, though it's a minor behaviour change. References: https://lore.kernel.org/all/20230626-fs-btrfs-mount-api-v1-2-045e9735a00b@kernel.org/ Reviewed-by: Christian Brauner Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note about device scanning ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7c46acb4b2e4..9672d7b26e54 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -98,6 +98,7 @@ struct btrfs_fs_context { unsigned long mount_opt; unsigned long compress_type:4; unsigned int compress_level; + refcount_t refs; }; enum { @@ -2797,6 +2798,180 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct btrfs_fs_info *p = fc->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_get_tree_super(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = fc->s_fs_info; + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_devices *fs_devices = NULL; + struct block_device *bdev; + struct btrfs_device *device; + struct super_block *sb; + blk_mode_t mode = sb_open_mode(fc->sb_flags); + int ret; + + btrfs_ctx_to_info(fs_info, ctx); + mutex_lock(&uuid_mutex); + + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(fc->source, mode, true); + ASSERT(device != NULL); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + return PTR_ERR(device); + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + + ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); + mutex_unlock(&uuid_mutex); + if (ret) + return ret; + + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + ret = -EACCES; + goto error; + } + + bdev = fs_devices->latest_dev->bdev; + + /* + * From now on the error handling is not straightforward. + * + * If successful, this will transfer the fs_info into the super block, + * and fc->s_fs_info will be NULL. However if there's an existing + * super, we'll still have fc->s_fs_info populated. If we error + * completely out it'll be cleaned up when we drop the fs_context, + * otherwise it's tied to the lifetime of the super_block. + */ + sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto error; + } + + if (sb->s_root) { + btrfs_close_devices(fs_devices); + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) + ret = -EBUSY; + } else { + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); + btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; + ret = btrfs_fill_super(sb, fs_devices, NULL); + } + + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + fc->root = dget(sb->s_root); + return 0; + +error: + btrfs_close_devices(fs_devices); + return ret; +} + +static int btrfs_get_tree_subvol(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_context *dup_fc; + struct dentry *dentry; + struct vfsmount *mnt; + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. + */ + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + btrfs_free_fs_info(fs_info); + return -ENOMEM; + } + btrfs_init_fs_info(fs_info); + + dup_fc = vfs_dup_fs_context(fc); + if (IS_ERR(dup_fc)) { + btrfs_free_fs_info(fs_info); + return PTR_ERR(dup_fc); + } + + /* + * When we do the sget_fc this gets transferred to the sb, so we only + * need to set it on the dup_fc as this is what creates the super block. + */ + dup_fc->s_fs_info = fs_info; + + /* + * We'll do the security settings in our btrfs_get_tree_super() mount + * loop, they were duplicated into dup_fc, we can drop the originals + * here. + */ + security_free_mnt_opts(&fc->security); + fc->security = NULL; + + mnt = fc_mount(dup_fc); + put_fs_context(dup_fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + /* + * This free's ->subvol_name, because if it isn't set we have to + * allocate a buffer to hold the subvol_name, so we just drop our + * reference to it here. + */ + dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt); + ctx->subvol_name = NULL; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + fc->root = dentry; + return 0; +} + +static int btrfs_get_tree(struct fs_context *fc) +{ + /* + * Since we use mount_subtree to mount the default/specified subvol, we + * have to do mounts in two steps. + * + * First pass through we call btrfs_get_tree_subvol(), this is just a + * wrapper around fc_mount() to call back into here again, and this time + * we'll call btrfs_get_tree_super(). This will do the open_ctree() and + * everything to open the devices and file system. Then we return back + * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and + * from there we can do our mount_subvol() call, which will lookup + * whichever subvol we're mounting and setup this fc with the + * appropriate dentry for the subvol. + */ + if (fc->s_fs_info) + return btrfs_get_tree_super(fc); + return btrfs_get_tree_subvol(fc); +} + static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -2807,17 +2982,41 @@ static void btrfs_kill_super(struct super_block *sb) static void btrfs_free_fs_context(struct fs_context *fc) { struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_info *fs_info = fc->s_fs_info; - if (!ctx) - return; + if (fs_info) + btrfs_free_fs_info(fs_info); - kfree(ctx->subvol_name); - kfree(ctx); + if (ctx && refcount_dec_and_test(&ctx->refs)) { + kfree(ctx->subvol_name); + kfree(ctx); + } +} + +static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) +{ + struct btrfs_fs_context *ctx = src_fc->fs_private; + + /* + * Give a ref to our ctx to this dup, as we want to keep it around for + * our original fc so we can have the subvolume name or objectid. + * + * We unset ->source in the original fc because the dup needs it for + * mounting, and then once we free the dup it'll free ->source, so we + * need to make sure we're only pointing to it in one fc. + */ + refcount_inc(&ctx->refs); + fc->fs_private = ctx; + fc->source = src_fc->source; + src_fc->source = NULL; + return 0; } static const struct fs_context_operations btrfs_fs_context_ops = { .parse_param = btrfs_parse_param, .reconfigure = btrfs_reconfigure, + .get_tree = btrfs_get_tree, + .dup = btrfs_dup_fs_context, .free = btrfs_free_fs_context, }; @@ -2829,6 +3028,7 @@ static int __maybe_unused btrfs_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; + refcount_set(&ctx->refs, 1); fc->fs_private = ctx; fc->ops = &btrfs_fs_context_ops; From f044b318675f0347ecfb88377542651ba4eb9e1f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:49 -0500 Subject: [PATCH 0861/1562] btrfs: handle the ro->rw transition for mounting different subvolumes This is a special case that we've carried around since 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes with different ro/rw options") where we'll under the covers flip the file system to RW if you're mixing and matching ro/rw options with different subvol mounts. The first mount is what the super gets setup as, so we'd handle this by remount the super as rw under the covers to facilitate this behavior. With the new mount API we can't really allow this, because user space has the ability to specify the super block settings, and the mount settings. So if the user explicitly sets the super block as read only, and then tried to mount a rw mount with the super block we'll reject this. However the old API was less descriptive and thus we allowed this kind of behavior. This patch preserves this behavior for the old API calls. This is inspired by Christians work [1], and includes his comment in btrfs_get_tree_super() explaining the history and how it all works in the old and new APIs. Link: https://lore.kernel.org/all/20230626-fs-btrfs-mount-api-v1-2-045e9735a00b@kernel.org/ Reviewed-by: Christian Brauner Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9672d7b26e54..66c109c85104 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2477,13 +2477,15 @@ static int btrfs_reconfigure(struct fs_context *fc) struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_context old_ctx; int ret = 0; + bool mount_reconfigure = (fc->s_fs_info != NULL); btrfs_info_to_ctx(fs_info, &old_ctx); sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - if (!check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + if (!mount_reconfigure && + !check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); @@ -2885,6 +2887,129 @@ error: return ret; } +/* + * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes + * with different ro/rw options") the following works: + * + * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo + * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar + * + * which looks nice and innocent but is actually pretty intricate and deserves + * a long comment. + * + * On another filesystem a subvolume mount is close to something like: + * + * (iii) # create rw superblock + initial mount + * mount -t xfs /dev/sdb /opt/ + * + * # create ro bind mount + * mount --bind -o ro /opt/foo /mnt/foo + * + * # unmount initial mount + * umount /opt + * + * Of course, there's some special subvolume sauce and there's the fact that the + * sb->s_root dentry is really swapped after mount_subtree(). But conceptually + * it's very close and will help us understand the issue. + * + * The old mount API didn't cleanly distinguish between a mount being made ro + * and a superblock being made ro. The only way to change the ro state of + * either object was by passing ms_rdonly. If a new mount was created via + * mount(2) such as: + * + * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null); + * + * the MS_RDONLY flag being specified had two effects: + * + * (1) MNT_READONLY was raised -> the resulting mount got + * @mnt->mnt_flags |= MNT_READONLY raised. + * + * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems + * made the superblock ro. Note, how SB_RDONLY has the same value as + * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2). + * + * Creating a subtree mount via (iii) ends up leaving a rw superblock with a + * subtree mounted ro. + * + * But consider the effect on the old mount API on btrfs subvolume mounting + * which combines the distinct step in (iii) into a single step. + * + * By issuing (i) both the mount and the superblock are turned ro. Now when (ii) + * is issued the superblock is ro and thus even if the mount created for (ii) is + * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro + * to rw for (ii) which it did using an internal remount call. + * + * IOW, subvolume mounting was inherently complicated due to the ambiguity of + * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate + * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when + * passed by mount(8) to mount(2). + * + * Enter the new mount API. The new mount API disambiguates making a mount ro + * and making a superblock ro. + * + * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either + * fsmount() or mount_setattr() this is a pure VFS level change for a + * specific mount or mount tree that is never seen by the filesystem itself. + * + * (4) To turn a superblock ro the "ro" flag must be used with + * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem + * in fc->sb_flags. + * + * This disambiguation has rather positive consequences. Mounting a subvolume + * ro will not also turn the superblock ro. Only the mount for the subvolume + * will become ro. + * + * So, if the superblock creation request comes from the new mount API the + * caller must have explicitly done: + * + * fsconfig(FSCONFIG_SET_FLAG, "ro") + * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) + * + * IOW, at some point the caller must have explicitly turned the whole + * superblock ro and we shouldn't just undo it like we did for the old mount + * API. In any case, it lets us avoid the hack in the new mount API. + * + * Consequently, the remounting hack must only be used for requests originating + * from the old mount API and should be marked for full deprecation so it can be + * turned off in a couple of years. + * + * The new mount API has no reason to support this hack. + */ +static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +{ + struct vfsmount *mnt; + int ret; + const bool ro2rw = !(fc->sb_flags & SB_RDONLY); + + /* + * We got an EBUSY because our SB_RDONLY flag didn't match the existing + * super block, so invert our setting here and retry the mount so we + * can get our vfsmount. + */ + if (ro2rw) + fc->sb_flags |= SB_RDONLY; + else + fc->sb_flags &= ~SB_RDONLY; + + mnt = fc_mount(fc); + if (IS_ERR(mnt)) + return mnt; + + if (!fc->oldapi || !ro2rw) + return mnt; + + /* We need to convert to rw, call reconfigure. */ + fc->sb_flags &= ~SB_RDONLY; + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_reconfigure(fc); + up_write(&mnt->mnt_sb->s_umount); + if (ret) { + mntput(mnt); + return ERR_PTR(ret); + } + return mnt; +} + static int btrfs_get_tree_subvol(struct fs_context *fc) { struct btrfs_fs_info *fs_info = NULL; @@ -2934,6 +3059,8 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->security = NULL; mnt = fc_mount(dup_fc); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) + mnt = btrfs_reconfigure_for_mount(dup_fc); put_fs_context(dup_fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); From ad21f15b0f795daf8723dddbcb61797d4f1c2aed Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:50 -0500 Subject: [PATCH 0862/1562] btrfs: switch to the new mount API Now that we have all of the parts in place to use the new mount API, switch our fs_type to use the new callbacks. There are a few things that have to be done at the same time because of the order of operations changes that come along with the new mount API. These must be done in the same patch otherwise things will go wrong. 1. Export and use btrfs_check_options in open_ctree(). This is because the options are done ahead of time, and we need to check them once we have the feature flags loaded. 2. Update the free space cache settings. Since we're coming in with the options already set we need to make sure we don't undo what the user has asked for. 3. Set our sb_flags at init_fs_context time, the fs_context stuff is trying to manage the sb_flagss itself, so move that into init_fs_context and out of the fill super part. Additionally I've marked the unused functions with __maybe_unused and will remove them in a future patch. Reviewed-by: Johannes Thumshirn Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 ++++-- fs/btrfs/super.c | 88 ++++++++++++++++++++++++++-------------------- fs/btrfs/super.h | 2 ++ 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4bac16d74179..beae0dbbc039 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3316,14 +3316,21 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ btrfs_set_free_space_cache_settings(fs_info); - ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) + if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) { + ret = -EINVAL; goto fail_alloc; + } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) goto fail_alloc; + /* + * At this point our mount options are validated, if we set ->max_inline + * to something non-standard make sure we truncate it to sectorsize. + */ + fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66c109c85104..4ff26c00eebe 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -307,7 +307,7 @@ static const struct constant_table btrfs_parameter_fragment[] = { }; #endif -static const struct fs_parameter_spec btrfs_fs_parameters[] __maybe_unused = { +static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("acl", Opt_acl), fsparam_flag_no("autodefrag", Opt_defrag), fsparam_flag_no("barrier", Opt_barrier), @@ -738,8 +738,8 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info, return false; } -static bool check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, - unsigned long flags) +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags) { bool ret = true; @@ -788,18 +788,6 @@ static bool check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, */ void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) { - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); - else if (btrfs_free_space_cache_v1_active(fs_info)) { - if (btrfs_is_zoned(fs_info)) { - btrfs_info(fs_info, - "zoned: clearing existing space cache"); - btrfs_set_super_cache_generation(fs_info->super_copy, 0); - } else { - btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); - } - } - if (fs_info->sectorsize < PAGE_SIZE) { btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { @@ -809,6 +797,35 @@ void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); } } + + /* + * At this point our mount options are populated, so we only mess with + * these settings if we don't have any settings already. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + return; + + if (btrfs_is_zoned(fs_info) && + btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_info(fs_info, "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(fs_info->super_copy, 0); + return; + } + + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + return; + + if (btrfs_test_opt(fs_info, NOSPACECACHE)) + return; + + /* + * At this point we don't have explicit options set by the user, set + * them ourselves based on the state of the file system. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); } static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) @@ -1345,7 +1362,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } } out: - if (!ret && !check_options(info, &info->mount_opt, new_flags)) + if (!ret && !btrfs_check_options(info, &info->mount_opt, new_flags)) ret = -EINVAL; return ret; } @@ -1646,10 +1663,6 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; -#endif - sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1929,7 +1942,7 @@ out: * Note: This is based on mount_bdev from fs/super.c with a few additions * for multiple device setup. Make sure to keep it in sync. */ -static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, +static __maybe_unused struct dentry *btrfs_mount_root(struct file_system_type *fs_type, int flags, const char *device_name, void *data) { struct block_device *bdev = NULL; @@ -2062,7 +2075,7 @@ error_sec_opts: * 3. Call mount_subvol() to get the dentry of subvolume. Since there is * "btrfs subvolume set-default", mount_subvol() is called always. */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, +static __maybe_unused struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, const char *device_name, void *data) { struct vfsmount *mnt_root; @@ -2485,7 +2498,7 @@ static int btrfs_reconfigure(struct fs_context *fc) set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); if (!mount_reconfigure && - !check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); @@ -3147,7 +3160,7 @@ static const struct fs_context_operations btrfs_fs_context_ops = { .free = btrfs_free_fs_context, }; -static int __maybe_unused btrfs_init_fs_context(struct fs_context *fc) +static int btrfs_init_fs_context(struct fs_context *fc) { struct btrfs_fs_context *ctx; @@ -3168,24 +3181,22 @@ static int __maybe_unused btrfs_init_fs_context(struct fs_context *fc) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->sb_flags |= SB_I_VERSION; + return 0; } static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, -}; - -static struct file_system_type btrfs_root_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount_root, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, -}; + .owner = THIS_MODULE, + .name = "btrfs", + .init_fs_context = btrfs_init_fs_context, + .parameters = btrfs_fs_parameters, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + }; MODULE_ALIAS_FS("btrfs"); @@ -3398,7 +3409,6 @@ static const struct super_operations btrfs_super_ops = { .destroy_inode = btrfs_destroy_inode, .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, }; diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 7c1cd7527e76..7f6577d69902 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags); int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); From 41d46b290ef9b5563ae5b3c46cf86e0ae1e4bf95 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:51 -0500 Subject: [PATCH 0863/1562] btrfs: move the device specific mount options to super.c We add these mount options based on the fs_devices settings, which can be set once we've opened the fs_devices. Move these into their own helper and call it from get_tree_super. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 23 ----------------------- fs/btrfs/super.c | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index beae0dbbc039..42e6d818a5a8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3510,29 +3510,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_cleaner; } - if (!btrfs_test_opt(fs_info, NOSSD) && - !fs_info->fs_devices->rotating) { - btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); - } - - /* - * For devices supporting discard turn on discard=async automatically, - * unless it's already set or disabled. This could be turned off by - * nodiscard for the same mount. - * - * The zoned mode piggy backs on the discard functionality for - * resetting a zone. There is no reason to delay the zone reset as it is - * fast enough. So, do not enable async discard for zoned mode. - */ - if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC) || - btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable && - !btrfs_is_zoned(fs_info)) { - btrfs_set_and_info(fs_info, DISCARD_ASYNC, - "auto enabling async discard"); - } - ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4ff26c00eebe..4cb9d35d71f8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -828,6 +828,29 @@ void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); } +static void set_device_specific_options(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) + btrfs_set_opt(fs_info->mount_opt, SSD); + + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. + */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) + btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -2876,6 +2899,8 @@ static int btrfs_get_tree_super(struct fs_context *fc) goto error; } + set_device_specific_options(fs_info); + if (sb->s_root) { btrfs_close_devices(fs_devices); if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) From 6941823cc87812dba4d02c67f46768cba372970b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:52 -0500 Subject: [PATCH 0864/1562] btrfs: remove old mount API code Now that we've switched to the new mount API, remove the old stuff. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 14 - fs/btrfs/super.c | 1078 +--------------------------------------------- fs/btrfs/super.h | 2 - 3 files changed, 13 insertions(+), 1081 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e6f7ee85032e..f8bb73d6ab68 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -962,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4cb9d35d71f8..f9488161bf83 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -65,19 +65,7 @@ #include static const struct super_operations btrfs_super_ops; - -/* - * Types for mounting the default subvolume and a subvolume explicitly - * requested by subvol=/path. That way the callchain is straightforward and we - * don't have to play tricks with the mount options and recursive calls to - * btrfs_mount. - * - * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. - */ static struct file_system_type btrfs_fs_type; -static struct file_system_type btrfs_root_fs_type; - -static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { @@ -102,7 +90,7 @@ struct btrfs_fs_context { }; enum { - Opt_acl, Opt_noacl, + Opt_acl, Opt_clear_cache, Opt_commit_interval, Opt_compress, @@ -112,27 +100,26 @@ enum { Opt_degraded, Opt_device, Opt_fatal_errors, - Opt_flushoncommit, Opt_noflushoncommit, + Opt_flushoncommit, Opt_max_inline, - Opt_barrier, Opt_nobarrier, - Opt_datacow, Opt_nodatacow, - Opt_datasum, Opt_nodatasum, - Opt_defrag, Opt_nodefrag, - Opt_discard, Opt_nodiscard, + Opt_barrier, + Opt_datacow, + Opt_datasum, + Opt_defrag, + Opt_discard, Opt_discard_mode, - Opt_norecovery, Opt_ratio, Opt_rescan_uuid_tree, Opt_skip_balance, - Opt_space_cache, Opt_no_space_cache, + Opt_space_cache, Opt_space_cache_version, - Opt_ssd, Opt_nossd, - Opt_ssd_spread, Opt_nossd_spread, + Opt_ssd, + Opt_ssd_spread, Opt_subvol, Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, - Opt_treelog, Opt_notreelog, + Opt_treelog, Opt_user_subvol_rm_allowed, /* Rescue options */ @@ -145,10 +132,10 @@ enum { /* Deprecated options */ Opt_recovery, - Opt_inode_cache, Opt_noinode_cache, + Opt_inode_cache, /* Debugging options */ - Opt_enospc_debug, Opt_noenospc_debug, + Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif @@ -158,88 +145,6 @@ enum { Opt_err, }; -static const match_table_t tokens = { - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_commit_interval, "commit=%u"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_degraded, "degraded"}, - {Opt_device, "device=%s"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_noinode_cache, "noinode_cache"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_datacow, "datacow"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datasum, "datasum"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, - {Opt_discard, "discard"}, - {Opt_discard_mode, "discard=%s"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_norecovery, "norecovery"}, - {Opt_ratio, "metadata_ratio=%u"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_space_cache, "space_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_space_cache_version, "space_cache=%s"}, - {Opt_ssd, "ssd"}, - {Opt_nossd, "nossd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd_spread, "nossd_spread"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvol_empty, "subvol="}, - {Opt_subvolid, "subvolid=%s"}, - {Opt_thread_pool, "thread_pool=%u"}, - {Opt_treelog, "treelog"}, - {Opt_notreelog, "notreelog"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, - - /* Rescue options */ - {Opt_rescue, "rescue=%s"}, - /* Deprecated, with alias rescue=nologreplay */ - {Opt_nologreplay, "nologreplay"}, - /* Deprecated, with alias rescue=usebackuproot */ - {Opt_usebackuproot, "usebackuproot"}, - - /* Deprecated options */ - {Opt_recovery, "recovery"}, - - /* Debugging options */ - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, -#ifdef CONFIG_BTRFS_DEBUG - {Opt_fragment_data, "fragment=data"}, - {Opt_fragment_metadata, "fragment=metadata"}, - {Opt_fragment_all, "fragment=all"}, -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - {Opt_ref_verify, "ref_verify"}, -#endif - {Opt_err, NULL}, -}; - -static const match_table_t rescue_tokens = { - {Opt_usebackuproot, "usebackuproot"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_ignorebadroots, "ignorebadroots"}, - {Opt_ignorebadroots, "ibadroots"}, - {Opt_ignoredatacsums, "ignoredatacsums"}, - {Opt_ignoredatacsums, "idatacsums"}, - {Opt_rescue_all, "all"}, - {Opt_err, NULL}, -}; - enum { Opt_fatal_errors_panic, Opt_fatal_errors_bug, @@ -851,660 +756,6 @@ static void set_device_specific_options(struct btrfs_fs_info *fs_info) btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); } -static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) -{ - char *opts; - char *orig; - char *p; - substring_t args[MAX_OPT_ARGS]; - int ret = 0; - - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; - - while ((p = strsep(&opts, ":")) != NULL) { - int token; - - if (!*p) - continue; - token = match_token(p, rescue_tokens, args); - switch (token){ - case Opt_usebackuproot: - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); - break; - case Opt_nologreplay: - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_ignorebadroots: - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - break; - case Opt_ignoredatacsums: - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - break; - case Opt_rescue_all: - btrfs_info(info, "enabling all of the rescue options"); - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_err: - btrfs_info(info, "unrecognized rescue option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } - - } -out: - kfree(orig); - return ret; -} - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. - */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags) -{ - substring_t args[MAX_OPT_ARGS]; - char *p, *num; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - enum btrfs_compression_type saved_compress_type; - int saved_compress_level; - bool saved_compress_force; - int no_compress = 0; - - /* - * Even the options are empty, we still need to do extra check - * against new flags - */ - if (!options) - goto out; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - btrfs_info(info, "allowing degraded mounts"); - btrfs_set_opt(info->mount_opt, DEGRADED); - break; - case Opt_subvol: - case Opt_subvol_empty: - case Opt_subvolid: - case Opt_device: - /* - * These are parsed by btrfs_parse_subvol_options or - * btrfs_parse_device_options and can be ignored here. - */ - break; - case Opt_nodatasum: - btrfs_set_and_info(info, NODATASUM, - "setting nodatasum"); - break; - case Opt_datasum: - if (btrfs_test_opt(info, NODATASUM)) { - if (btrfs_test_opt(info, NODATACOW)) - btrfs_info(info, - "setting datasum, datacow enabled"); - else - btrfs_info(info, "setting datasum"); - } - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - break; - case Opt_nodatacow: - if (!btrfs_test_opt(info, NODATACOW)) { - if (!btrfs_test_opt(info, COMPRESS) || - !btrfs_test_opt(info, FORCE_COMPRESS)) { - btrfs_info(info, - "setting nodatacow, compression disabled"); - } else { - btrfs_info(info, "setting nodatacow"); - } - } - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_datacow: - btrfs_clear_and_info(info, NODATACOW, - "setting datacow"); - break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - fallthrough; - case Opt_compress: - case Opt_compress_type: - saved_compress_type = btrfs_test_opt(info, - COMPRESS) ? - info->compress_type : BTRFS_COMPRESS_NONE; - saved_compress_force = - btrfs_test_opt(info, FORCE_COMPRESS); - saved_compress_level = info->compress_level; - if (token == Opt_compress || - token == Opt_compress_force || - strncmp(args[0].from, "zlib", 4) == 0) { - compress_type = "zlib"; - - info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - /* - * args[0] contains uninitialized data since - * for these tokens we don't expect any - * parameter. - */ - if (token != Opt_compress && - token != Opt_compress_force) - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZLIB, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - no_compress = 0; - } else if (strncmp(args[0].from, "lzo", 3) == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - info->compress_level = 0; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_LZO); - no_compress = 0; - } else if (strncmp(args[0].from, "zstd", 4) == 0) { - compress_type = "zstd"; - info->compress_type = BTRFS_COMPRESS_ZSTD; - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZSTD, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_ZSTD); - no_compress = 0; - } else if (strncmp(args[0].from, "no", 2) == 0) { - compress_type = "no"; - info->compress_level = 0; - info->compress_type = 0; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - compress_force = false; - no_compress++; - } else { - btrfs_err(info, "unrecognized compression value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - } else { - /* - * If we remount from compress-force=xxx to - * compress=xxx, we need clear FORCE_COMPRESS - * flag, otherwise, there is no way for users - * to disable forcible compression separately. - */ - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - } - if (no_compress == 1) { - btrfs_info(info, "use no compression"); - } else if ((info->compress_type != saved_compress_type) || - (compress_force != saved_compress_force) || - (info->compress_level != saved_compress_level)) { - btrfs_info(info, "%s %s compression, level %d", - (compress_force) ? "force" : "use", - compress_type, info->compress_level); - } - compress_force = false; - break; - case Opt_ssd: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_ssd_spread: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_set_and_info(info, SSD_SPREAD, - "using spread ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_nossd: - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_and_info(info, SSD, - "not using ssd optimizations"); - fallthrough; - case Opt_nossd_spread: - btrfs_clear_and_info(info, SSD_SPREAD, - "not using spread ssd allocation scheme"); - break; - case Opt_barrier: - btrfs_clear_and_info(info, NOBARRIER, - "turning on barriers"); - break; - case Opt_nobarrier: - btrfs_set_and_info(info, NOBARRIER, - "turning off barriers"); - break; - case Opt_thread_pool: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized thread_pool value %s", - args[0].from); - goto out; - } else if (intarg == 0) { - btrfs_err(info, "invalid value 0 for thread_pool"); - ret = -EINVAL; - goto out; - } - info->thread_pool_size = intarg; - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = min_t(u64, - info->max_inline, - info->sectorsize); - } - btrfs_info(info, "max_inline at %llu", - info->max_inline); - } else { - ret = -ENOMEM; - goto out; - } - break; - case Opt_acl: -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= SB_POSIXACL; - break; -#else - btrfs_err(info, "support for ACL not compiled in!"); - ret = -EINVAL; - goto out; -#endif - case Opt_noacl: - info->sb->s_flags &= ~SB_POSIXACL; - break; - case Opt_notreelog: - btrfs_set_and_info(info, NOTREELOG, - "disabling tree log"); - break; - case Opt_treelog: - btrfs_clear_and_info(info, NOTREELOG, - "enabling tree log"); - break; - case Opt_norecovery: - case Opt_nologreplay: - btrfs_warn(info, - "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_flushoncommit: - btrfs_set_and_info(info, FLUSHONCOMMIT, - "turning on flush-on-commit"); - break; - case Opt_noflushoncommit: - btrfs_clear_and_info(info, FLUSHONCOMMIT, - "turning off flush-on-commit"); - break; - case Opt_ratio: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized metadata_ratio value %s", - args[0].from); - goto out; - } - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %u", - info->metadata_ratio); - break; - case Opt_discard: - case Opt_discard_mode: - if (token == Opt_discard || - strcmp(args[0].from, "sync") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); - btrfs_set_and_info(info, DISCARD_SYNC, - "turning on sync discard"); - } else if (strcmp(args[0].from, "async") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); - btrfs_set_and_info(info, DISCARD_ASYNC, - "turning on async discard"); - } else { - btrfs_err(info, "unrecognized discard mode value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - btrfs_clear_opt(info->mount_opt, NODISCARD); - break; - case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD_SYNC, - "turning off discard"); - btrfs_clear_and_info(info, DISCARD_ASYNC, - "turning off async discard"); - btrfs_set_opt(info->mount_opt, NODISCARD); - break; - case Opt_space_cache: - case Opt_space_cache_version: - /* - * We already set FREE_SPACE_TREE above because we have - * compat_ro(FREE_SPACE_TREE) set, and we aren't going - * to allow v1 to be set for extent tree v2, simply - * ignore this setting if we're extent tree v2. - * - * For subpage blocksize we don't allow space cache v1, - * and we'll turn on v2, so we can skip the settings - * here as well. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2) || - info->sectorsize < PAGE_SIZE) - break; - if (token == Opt_space_cache || - strcmp(args[0].from, "v1") == 0) { - btrfs_clear_opt(info->mount_opt, - FREE_SPACE_TREE); - btrfs_set_and_info(info, SPACE_CACHE, - "enabling disk space caching"); - } else if (strcmp(args[0].from, "v2") == 0) { - btrfs_clear_opt(info->mount_opt, - SPACE_CACHE); - btrfs_set_and_info(info, FREE_SPACE_TREE, - "enabling free space tree"); - } else { - btrfs_err(info, "unrecognized space_cache value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - break; - case Opt_rescan_uuid_tree: - btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); - break; - case Opt_no_space_cache: - /* - * We cannot operate without the free space tree with - * extent tree v2, ignore this option. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (btrfs_test_opt(info, SPACE_CACHE)) { - btrfs_clear_and_info(info, SPACE_CACHE, - "disabling disk space caching"); - } - if (btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_clear_and_info(info, FREE_SPACE_TREE, - "disabling free space tree"); - } - break; - case Opt_inode_cache: - case Opt_noinode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and has no effect since 5.11"); - break; - case Opt_clear_cache: - /* - * We cannot clear the free space tree with extent tree - * v2, ignore this option. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - btrfs_set_and_info(info, CLEAR_CACHE, - "force clearing of disk cache"); - break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); - break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); - break; - case Opt_noenospc_debug: - btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); - break; - case Opt_defrag: - btrfs_set_and_info(info, AUTO_DEFRAG, - "enabling auto defrag"); - break; - case Opt_nodefrag: - btrfs_clear_and_info(info, AUTO_DEFRAG, - "disabling auto defrag"); - break; - case Opt_recovery: - case Opt_usebackuproot: - btrfs_warn(info, - "'%s' is deprecated, use 'rescue=usebackuproot' instead", - token == Opt_recovery ? "recovery" : - "usebackuproot"); - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); - break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); - break; - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) { - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else if (strcmp(args[0].from, "bug") == 0) { - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else { - btrfs_err(info, "unrecognized fatal_errors value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - break; - case Opt_commit_interval: - intarg = 0; - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized commit_interval value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - if (intarg == 0) { - btrfs_info(info, - "using default commit interval %us", - BTRFS_DEFAULT_COMMIT_INTERVAL); - intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; - } else if (intarg > 300) { - btrfs_warn(info, "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; - break; - case Opt_rescue: - ret = parse_rescue_options(info, args[0].from); - if (ret < 0) { - btrfs_err(info, "unrecognized rescue value %s", - args[0].from); - goto out; - } - break; -#ifdef CONFIG_BTRFS_DEBUG - case Opt_fragment_all: - btrfs_info(info, "fragmenting all space"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); - btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); - break; - case Opt_fragment_metadata: - btrfs_info(info, "fragmenting metadata"); - btrfs_set_opt(info->mount_opt, - FRAGMENT_METADATA); - break; - case Opt_fragment_data: - btrfs_info(info, "fragmenting data"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); - break; -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - case Opt_ref_verify: - btrfs_info(info, "doing ref verification"); - btrfs_set_opt(info->mount_opt, REF_VERIFY); - break; -#endif - case Opt_err: - btrfs_err(info, "unrecognized mount option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } - } -out: - if (!ret && !btrfs_check_options(info, &info->mount_opt, new_flags)) - ret = -EINVAL; - return ret; -} - -/* - * Parse mount options that are required early in the mount process. - * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. - */ -static int btrfs_parse_device_options(const char *options, blk_mode_t flags) -{ - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - struct btrfs_device *device = NULL; - int error = 0; - - lockdep_assert_held(&uuid_mutex); - - if (!options) - return 0; - - /* - * strsep changes the string, duplicate it because btrfs_parse_options - * gets called later - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; - - while ((p = strsep(&opts, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - if (token == Opt_device) { - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - device = btrfs_scan_one_device(device_name, flags, false); - kfree(device_name); - if (IS_ERR(device)) { - error = PTR_ERR(device); - goto out; - } - } - } - -out: - kfree(orig); - return error; -} - -/* - * Parse mount options that are related to subvolume id - * - * The value is later passed to mount_subvol() - */ -static int btrfs_parse_subvol_options(const char *options, char **subvol_name, - u64 *subvol_objectid) -{ - substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; - int error = 0; - u64 subvolid; - - if (!options) - return 0; - - /* - * strsep changes the string, duplicate it because - * btrfs_parse_device_options gets called later - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; - - while ((p = strsep(&opts, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_subvol: - kfree(*subvol_name); - *subvol_name = match_strdup(&args[0]); - if (!*subvol_name) { - error = -ENOMEM; - goto out; - } - break; - case Opt_subvolid: - error = match_u64(&args[0], &subvolid); - if (error) - goto out; - - /* we want the original fs_tree */ - if (subvolid == 0) - subvolid = BTRFS_FS_TREE_OBJECTID; - - *subvol_objectid = subvolid; - break; - default: - break; - } - } - -out: - kfree(orig); - return error; -} - char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid) { @@ -1868,22 +1119,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) return 0; } -static int btrfs_test_super(struct super_block *s, void *data) -{ - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); - - return fs_info->fs_devices == p->fs_devices; -} - -static int btrfs_set_super(struct super_block *s, void *data) -{ - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; -} - /* * subvolumes are identified by ino 256 */ @@ -1959,200 +1194,6 @@ out: return root; } -/* - * Find a superblock for the given device / mount point. - * - * Note: This is based on mount_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. - */ -static __maybe_unused struct dentry *btrfs_mount_root(struct file_system_type *fs_type, - int flags, const char *device_name, void *data) -{ - struct block_device *bdev = NULL; - struct super_block *s; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_fs_info *fs_info = NULL; - void *new_sec_opts = NULL; - blk_mode_t mode = sb_open_mode(flags); - int error = 0; - - if (data) { - error = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (error) - return ERR_PTR(error); - } - - /* - * Setup a dummy root and fs_info for test/set super. This is because - * we don't actually fill this stuff out until open_ctree, but we need - * then open_ctree will properly initialize the file system specific - * settings later. btrfs_init_fs_info initializes the static elements - * of the fs_info (locks and such) to make cleanup easier if we find a - * superblock with our given fs_devices later on at sget() time. - */ - fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); - if (!fs_info) { - error = -ENOMEM; - goto error_sec_opts; - } - btrfs_init_fs_info(fs_info); - - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; - } - - mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode); - if (error) { - mutex_unlock(&uuid_mutex); - goto error_fs_info; - } - - /* - * With 'true' passed to btrfs_scan_one_device() (mount time) we expect - * either a valid device or an error. - */ - device = btrfs_scan_one_device(device_name, mode, true); - ASSERT(device != NULL); - if (IS_ERR(device)) { - mutex_unlock(&uuid_mutex); - error = PTR_ERR(device); - goto error_fs_info; - } - - fs_devices = device->fs_devices; - fs_info->fs_devices = fs_devices; - - error = btrfs_open_devices(fs_devices, mode, fs_type); - mutex_unlock(&uuid_mutex); - if (error) - goto error_fs_info; - - if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - - bdev = fs_devices->latest_dev->bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, - fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; - } - - if (s->s_root) { - btrfs_close_devices(fs_devices); - btrfs_free_fs_info(fs_info); - if ((flags ^ s->s_flags) & SB_RDONLY) - error = -EBUSY; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, - s->s_id); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data); - } - if (!error) - error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); - security_free_mnt_opts(&new_sec_opts); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - return dget(s->s_root); - -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - btrfs_free_fs_info(fs_info); -error_sec_opts: - security_free_mnt_opts(&new_sec_opts); - return ERR_PTR(error); -} - -/* - * Mount function which is called by VFS layer. - * - * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() - * which needs vfsmount* of device's root (/). This means device's root has to - * be mounted internally in any case. - * - * Operation flow: - * 1. Parse subvol id related options for later use in mount_subvol(). - * - * 2. Mount device's root (/) by calling vfs_kern_mount(). - * - * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the - * first place. In order to avoid calling btrfs_mount() again, we use - * different file_system_type which is not registered to VFS by - * register_filesystem() (btrfs_root_fs_type). As a result, - * btrfs_mount_root() is called. The return value will be used by - * mount_subtree() in mount_subvol(). - * - * 3. Call mount_subvol() to get the dentry of subvolume. Since there is - * "btrfs subvolume set-default", mount_subvol() is called always. - */ -static __maybe_unused struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) -{ - struct vfsmount *mnt_root; - struct dentry *root; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - int error = 0; - - error = btrfs_parse_subvol_options(data, &subvol_name, - &subvol_objectid); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - /* mount device's root (/) */ - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); - if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags & ~SB_RDONLY, device_name, data); - } else { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags | SB_RDONLY, device_name, data); - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - down_write(&mnt_root->mnt_sb->s_umount); - error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); - up_write(&mnt_root->mnt_sb->s_umount); - if (error < 0) { - root = ERR_PTR(error); - mntput(mnt_root); - kfree(subvol_name); - goto out; - } - } - } - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, mnt_root); - -out: - return root; -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, u32 new_pool_size, u32 old_pool_size) { @@ -2324,99 +1365,6 @@ static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) return btrfs_commit_super(fs_info); } -static int btrfs_remount(struct super_block *sb, int *flags, char *data) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u32 old_thread_pool_size = fs_info->thread_pool_size; - u32 old_metadata_ratio = fs_info->metadata_ratio; - int ret; - - sync_filesystem(sb); - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - - if (data) { - void *new_sec_opts = NULL; - - ret = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (!ret) - ret = security_sb_remount(sb, new_sec_opts); - security_free_mnt_opts(&new_sec_opts); - if (ret) - goto restore; - } - - ret = btrfs_parse_options(fs_info, data, *flags); - if (ret) - goto restore; - - ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); - if (ret < 0) - goto restore; - - btrfs_remount_begin(fs_info, old_opts, *flags); - btrfs_resize_thread_pool(fs_info, - fs_info->thread_pool_size, old_thread_pool_size); - - if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != - (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { - btrfs_warn(fs_info, - "remount supports changing free space tree only from ro to rw"); - /* Make sure free space cache options match the state on disk */ - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - } - if (btrfs_free_space_cache_v1_active(fs_info)) { - btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); - } - } - - ret = 0; - if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) - ret = btrfs_remount_ro(fs_info); - else if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) - ret = btrfs_remount_rw(fs_info); - if (ret) - goto restore; - - /* - * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, - * since the absence of the flag means it can be toggled off by remount. - */ - *flags |= SB_I_VERSION; - - wake_up_process(fs_info->transaction_kthread); - btrfs_remount_cleanup(fs_info, old_opts); - btrfs_clear_oneshot_options(fs_info); - clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - - return 0; - -restore: - /* We've hit an error - don't reset SB_RDONLY */ - if (sb_rdonly(sb)) - old_flags |= SB_RDONLY; - if (!(old_flags & SB_RDONLY)) - clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - btrfs_resize_thread_pool(fs_info, - old_thread_pool_size, fs_info->thread_pool_size); - fs_info->metadata_ratio = old_metadata_ratio; - btrfs_remount_cleanup(fs_info, old_opts); - clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - - return ret; -} - static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) { fs_info->max_inline = ctx->max_inline; diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 7f6577d69902..f18253ca280d 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -5,8 +5,6 @@ bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, unsigned long flags); -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); From 83e3a40a69f8dd57048089af31a1430c1808d924 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:53 -0500 Subject: [PATCH 0865/1562] btrfs: move one shot mount option clearing to super.c There's no reason this has to happen in open_ctree, and in fact in the old mount API we had to call this from remount. Move this to super.c, unexport it, and call it from both mount and reconfigure. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 +--------------- fs/btrfs/disk-io.h | 1 - fs/btrfs/super.c | 14 ++++++++++++++ 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42e6d818a5a8..12daa8a36ba1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2946,18 +2946,6 @@ out: return err; } -/* - * Some options only have meaning at mount time and shouldn't persist across - * remounts, or be displayed. Clear these at the end of mount and remount - * code paths. - */ -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) -{ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); - btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); - btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); -} - /* * Mounting logic specific to read-write file systems. Shared by open_ctree * and btrfs_remount when remounting from read-only to read-write. @@ -3535,7 +3523,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - goto clear_oneshot; + return 0; ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { @@ -3563,8 +3551,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); -clear_oneshot: - btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e589359e6a68..9413726b329b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -37,7 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f9488161bf83..95d6392a1acf 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -631,6 +631,18 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount code + * paths. + */ +static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); +} + static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long mount_opt, unsigned long opt, const char *opt_name) @@ -1865,6 +1877,8 @@ static int btrfs_get_tree_super(struct fs_context *fc) return ret; } + btrfs_clear_oneshot_options(fs_info); + fc->root = dget(sb->s_root); return 0; From 9fb3b1a7fed796510a5c34f5d492840dfd0eb96c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:54 -0500 Subject: [PATCH 0866/1562] btrfs: set clear_cache if we use usebackuproot We're currently setting this when we try to load the roots and we see that usebackuproot is set. Instead set this at mount option parsing time. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/super.c | 9 +++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 12daa8a36ba1..00c9181fd356 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2630,9 +2630,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) */ btrfs_set_super_log_root(sb, 0); - /* We can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 95d6392a1acf..04352d3861a2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -452,6 +452,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) btrfs_warn(NULL, "'recovery' is deprecated, use 'rescue=usebackuproot' instead"); btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + + /* + * If we're loading the backup roots we can't trust the + * space cache. + */ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); } break; case Opt_nologreplay: @@ -550,6 +556,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) btrfs_warn(NULL, "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + + /* If we're loading the backup roots we can't trust the space cache. */ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); break; case Opt_skip_balance: btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); From a1912f712188291f9d7d434fba155461f1ebef66 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 22 Nov 2023 12:17:55 -0500 Subject: [PATCH 0867/1562] btrfs: remove code for inode_cache and recovery mount options We've deprecated these a while ago in 5.11, go ahead and remove the code for them. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Acked-by: Christian Brauner Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 04352d3861a2..3a677b808f0f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -130,10 +130,6 @@ enum { Opt_ignoredatacsums, Opt_rescue_all, - /* Deprecated options */ - Opt_recovery, - Opt_inode_cache, - /* Debugging options */ Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG @@ -230,7 +226,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), fsparam_flag_no("flushoncommit", Opt_flushoncommit), - fsparam_flag_no("inode_cache", Opt_inode_cache), fsparam_string("max_inline", Opt_max_inline), fsparam_u32("metadata_ratio", Opt_ratio), fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), @@ -253,10 +248,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), - /* Deprecated options. */ - __fsparam(NULL, "recovery", Opt_recovery, - fs_param_neg_with_no | fs_param_deprecated, NULL), - /* Debugging options. */ fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG @@ -438,28 +429,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; - case Opt_recovery: - /* - * -o recovery used to be an alias for usebackuproot, and then - * norecovery was an alias for nologreplay, hence the different - * behaviors for negated and not. - */ - if (result.negated) { - btrfs_warn(NULL, - "'norecovery' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); - } else { - btrfs_warn(NULL, - "'recovery' is deprecated, use 'rescue=usebackuproot' instead"); - btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); - - /* - * If we're loading the backup roots we can't trust the - * space cache. - */ - btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); - } - break; case Opt_nologreplay: btrfs_warn(NULL, "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); @@ -530,10 +499,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_rescan_uuid_tree: btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); break; - case Opt_inode_cache: - btrfs_warn(NULL, - "the 'inode_cache' option is deprecated and has no effect since 5.11"); - break; case Opt_clear_cache: btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); break; From ed9b50a13edf442f5493603cc54f73bfc6eca1e9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Nov 2023 13:10:31 -0500 Subject: [PATCH 0868/1562] btrfs: cache that we don't have security.capability set When profiling a workload I noticed we were constantly calling getxattr. These were mostly coming from __remove_privs, which will lookup if security.capability exists to remove it. However instrumenting getxattr showed we get called nearly constantly on an idle machine on a lot of accesses. These are wasteful and not free. Other security LSMs have a way to cache their results, but capability doesn't have this, so it's asking us all the time for the xattr. Fix this by setting a flag in our inode that it doesn't have a security.capability xattr. We set this on new inodes and after a failed lookup of security.capability. If we set this xattr at all we'll clear the flag. I haven't found a test in fsperf that this makes a visible difference on, but I assume fs_mark related tests would show it clearly. This is a perf report output of the smallfiles100k run where it shows 20% of our time spent in __remove_privs because we're looking up the non-existent xattr. --21.86%--btrfs_write_check.constprop.0 --21.62%--__file_remove_privs --21.55%--security_inode_need_killpriv --21.54%--cap_inode_need_killpriv --21.53%--__vfs_getxattr --20.89%--btrfs_getxattr Obviously this is just CPU time in a mostly IO bound test, so the actual effect of removing this callchain is minimal. However in just normal testing of an idle system tracing showed around 100 getxattr calls per minute, and with this patch there are 0. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 ++ fs/btrfs/inode.c | 7 ++++++ fs/btrfs/xattr.c | 55 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5572ae52444e..74a2b02669e7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -69,6 +69,8 @@ enum { BTRFS_INODE_VERITY_IN_PROGRESS, /* Set when this inode is a free space inode. */ BTRFS_INODE_FREE_SPACE_INODE, + /* Set when there are no capabilities in XATTs for the inode. */ + BTRFS_INODE_NO_CAP_XATTR, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7320c1ea7926..e79a047aa5d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6225,6 +6225,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; + /* + * We don't have any capability xattrs set here yet, shortcut any + * queries for the xattrs here. If we add them later via the inode + * security init path or any other path this flag will be cleared. + */ + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3cf236fb40a4..6287763fdccc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, return btrfs_setxattr_trans(inode, name, buffer, size, flags); } +static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, + struct dentry *unused, + struct inode *inode, + const char *name, void *buffer, + size_t size) +{ + int ret; + bool is_cap = false; + + name = xattr_full_name(handler, name); + + /* + * security.capability doesn't cache the results, so calls into us + * constantly to see if there's a capability xattr. Cache the result + * here in order to avoid wasting time doing lookups for xattrs we know + * don't exist. + */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) { + is_cap = true; + if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) + return -ENODATA; + } + + ret = btrfs_getxattr(inode, name, buffer, size); + if (ret == -ENODATA && is_cap) + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + return ret; +} + +static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, + struct inode *inode, + const char *name, + const void *buffer, + size_t size, int flags) +{ + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + + name = xattr_full_name(handler, name); + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + return btrfs_setxattr_trans(inode, name, buffer, size, flags); +} + static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = btrfs_xattr_handler_get, - .set = btrfs_xattr_handler_set, + .get = btrfs_xattr_handler_get_security, + .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { @@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + err = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); From 637e6e0f50d20dcf2f37d62b3f9edf9567b69503 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 30 Nov 2023 23:42:01 +0100 Subject: [PATCH 0869/1562] btrfs: allocate btrfs_inode::file_extent_tree only without NO_HOLES The file_extent_tree was added in 41a2ee75aab0 ("btrfs: introduce per-inode file extent tree") so we have an explicit mapping of the file extents to know where it is safe to update i_size. When the feature NO_HOLES is enabled, and it's been a mkfs default since 5.15, the tree is not necessary. To save some space in the inode, allocate the tree only when necessary. This reduces size by 16 bytes from 1096 to 1080 on a x86_64 release config. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 6 ++++-- fs/btrfs/extent-io-tree.c | 2 ++ fs/btrfs/file-item.c | 6 +++--- fs/btrfs/inode.c | 25 ++++++++++++++++++++----- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 74a2b02669e7..bd629d011fdc 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,9 +109,11 @@ struct btrfs_inode { /* * Keep track of where the inode has extent items mapped in order to - * make sure the i_size adjustments are accurate + * make sure the i_size adjustments are accurate. Not required when the + * filesystem is NO_HOLES, the status can't be set while mounted as + * it's a mkfs-time feature. */ - struct extent_io_tree file_extent_tree; + struct extent_io_tree *file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index dbd201a99693..e3ee5449cc4a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -962,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state *state; int ret = 1; + ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); if (state) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 45cae356e89b..1f0110f48353 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, &end, EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); @@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + return set_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return clear_extent_bit(&inode->file_extent_tree, start, + return clear_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e79a047aa5d1..38adab92a015 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8479,10 +8479,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; + struct extent_io_tree *file_extent_tree = NULL; + + /* Self tests may pass a NULL fs_info. */ + if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { + file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!file_extent_tree) + return NULL; + } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) + if (!ei) { + kfree(file_extent_tree); return NULL; + } ei->root = NULL; ei->generation = 0; @@ -8523,10 +8533,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); - /* Lockdep class is set only for the file extent tree. */ - lockdep_set_class(&ei->file_extent_tree.lock, &file_extent_tree_class); + ei->file_extent_tree = file_extent_tree; + if (file_extent_tree) { + extent_io_tree_init(fs_info, ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. */ + lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); + } mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; @@ -8543,12 +8556,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } From 32d53f6f7b01f572dac6f0c2f4dbfc03ebe38112 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:23 +0000 Subject: [PATCH 0870/1562] btrfs: assert extent map is not in a list when setting it up When setting up a new extent map, at setup_extent_mapping(), we're doing a list move operation to add the extent map the tree's list of modified extents. This is confusing because at this point the extent map can not be in any list, because it's a new extent map. So replace the list move with a list add and add an assertion that checks that the extent map is not currently in any list. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 80f86503a5cd..d29097a8550a 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -345,8 +345,10 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, em->mod_start = em->start; em->mod_len = em->len; + ASSERT(list_empty(&em->list)); + if (modified) - list_move(&em->list, &tree->modified_extents); + list_add(&em->list, &tree->modified_extents); else try_merge_map(tree, em); } From b30aa1c176ba86a035d40c2d2f12dc0e0f687e0e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:24 +0000 Subject: [PATCH 0871/1562] btrfs: tests: fix error messages for test case 4 of extent map tests In test case 4 for extent maps, if we error out we are supposed to print in interval but instead of printing a non-inclusive end offset, we are printing the length of the interval, which makes it confusing. So fix that to print the exclusive end offset instead. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 8602f94cc29d..ac64eafad703 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -388,13 +388,13 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, write_unlock(&em_tree->lock); if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", - start, len, ret); + start, start + len, ret); goto out; } if (em && (start < em->start || start + len > extent_map_end(em))) { test_err( "case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", - start, len, ret, em->start, em->len, em->block_start, + start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; } From eca3aaec0de1e7059340f906a0741a68c1cf9e2b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:25 +0000 Subject: [PATCH 0872/1562] btrfs: tests: do not ignore NULL extent maps for extent maps tests Several of the extent map tests call btrfs_add_extent_mapping() which is supposed to succeed and return an extent map through the pointer to pointer argument. However the tests are deliberately ignoring a NULL extent map, which is not expected to happen. So change the tests to error out if a NULL extent map is found. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 40 +++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ac64eafad703..024588d02551 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info, test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) { + if (!em) { + test_err("case1 [%llu %llu]: no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, @@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info, test_err("case2 [0 1K]: ret %d", ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { + if (!em) { + test_err("case2 [0 1K]: no extent map returned"); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, @@ -272,13 +281,18 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, start, start + len, ret); goto out; } + if (!em) { + test_err("case3 [0x%llx 0x%llx): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } /* * Since bytes within em are contiguous, em->block_start is identical to * em->start. */ - if (em && - (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len)) { + if (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len) { test_err( "case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, @@ -391,7 +405,13 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, start, start + len, ret); goto out; } - if (em && (start < em->start || start + len > extent_map_end(em))) { + if (!em) { + test_err("case4 [0x%llx 0x%llx): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (start < em->start || start + len > extent_map_end(em)) { test_err( "case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, em->block_start, From c9201b4fec0d8ebac1399825353c7a266665cccd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:26 +0000 Subject: [PATCH 0873/1562] btrfs: tests: print all values as decimal in messages for extent map tests Some error messages of the extent map tests print decimal values of start offsets and lengths, while other are oddly printing in hexadecimal, which is far less human friendly, specially taking into consideration that all the values are small and multiples of 4K, so it's a lot easier to read them as decimal values. Change the format specifiers to print as decimal instead. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 024588d02551..1eb442ea89a5 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { test_err( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", +"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d", em->start, em->len, em->block_start, em->block_len, refcount_read(&em->refs)); @@ -277,12 +277,12 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case3 [0x%llx 0x%llx): ret %d", + test_err("case3 [%llu %llu): ret %d", start, start + len, ret); goto out; } if (!em) { - test_err("case3 [0x%llx 0x%llx): no extent map returned", + test_err("case3 [%llu %llu): no extent map returned", start, start + len); ret = -ENOENT; goto out; @@ -294,7 +294,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, if (start < em->start || start + len > extent_map_end(em) || em->start != em->block_start || em->len != em->block_len) { test_err( -"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", +"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; @@ -401,19 +401,19 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case4 [0x%llx 0x%llx): ret %d", + test_err("case4 [%llu %llu): ret %d", start, start + len, ret); goto out; } if (!em) { - test_err("case4 [0x%llx 0x%llx): no extent map returned", + test_err("case4 [%llu %llu): no extent map returned", start, start + len); ret = -ENOENT; goto out; } if (start < em->start || start + len > extent_map_end(em)) { test_err( -"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", +"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; From db9d94464a7acb149d014de1b0aa982b8c3856a6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:27 +0000 Subject: [PATCH 0874/1562] btrfs: unexport add_extent_mapping() There's no need to export add_extent_mapping(), as it's only used inside extent_map.c and in the self tests. For the tests we can use instead btrfs_add_extent_mapping(), which will accomplish exactly the same as we don't expect collisions in any of them. So unexport it and make the tests use btrfs_add_extent_mapping() instead of add_extent_mapping(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 4 +-- fs/btrfs/extent_map.h | 2 -- fs/btrfs/tests/extent-map-tests.c | 45 ++++++++++++++++--------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d29097a8550a..18a5c4332ed6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -366,8 +366,8 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, * into the tree directly, with an additional reference taken, or a * reference dropped if the merge attempt was successful. */ -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified) +static int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified) { int ret = 0; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 66f8dd26487b..5663137471fe 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -74,8 +74,6 @@ static inline u64 extent_map_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 1eb442ea89a5..59bbf714225c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -94,7 +94,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -166,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -187,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -253,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -357,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_8K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); @@ -378,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); @@ -463,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } -static int add_compressed_extent(struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start, u64 len, u64 block_start) { struct extent_map *em; @@ -481,7 +482,7 @@ static int add_compressed_extent(struct extent_map_tree *em_tree, em->block_len = SZ_4K; set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -587,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. */ -static int test_case_5(void) +static int test_case_5(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct inode *inode; @@ -605,35 +606,35 @@ static int test_case_5(void) em_tree = &BTRFS_I(inode)->extent_tree; /* [0, 12k) */ - ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0); if (ret) { test_err("cannot add extent range [0, 12K)"); goto out; } /* [12k, 24k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [24k, 36k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [36k, 40k) */ - ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [40k, 64k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; @@ -685,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em struct extent_map *em = NULL; int ret; - ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0); if (ret) goto out; @@ -733,7 +734,7 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. */ -static int test_case_7(void) +static int test_case_7(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -764,7 +765,7 @@ static int test_case_7(void) em->block_len = SZ_4K; set_bit(EXTENT_FLAG_PINNED, &em->flags); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -785,7 +786,7 @@ static int test_case_7(void) em->block_start = SZ_32K; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void) ret = test_case_4(fs_info, em_tree); if (ret) goto out; - ret = test_case_5(); + ret = test_case_5(fs_info); if (ret) goto out; ret = test_case_6(fs_info, em_tree); if (ret) goto out; - ret = test_case_7(); + ret = test_case_7(fs_info); if (ret) goto out; From d224d2ef959a768fc88242224d8527e5f88789b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:28 +0000 Subject: [PATCH 0875/1562] btrfs: remove redundant value assignment at btrfs_add_extent_mapping() At btrfs_add_extent_mapping(), in case add_extent_mapping() returned -EEXIST, it's pointless to assign 0 to 'ret' since we will assign a value to it shortly after, without 'ret' being used before that. So remove that pointless assignment. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 18a5c4332ed6..a3d69c943eec 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -586,8 +586,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - ret = 0; - existing = search_extent_mapping(em_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); From 00deaf04df35536d192544ea57b6da9a88519422 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:29 +0000 Subject: [PATCH 0876/1562] btrfs: log messages at unpin_extent_range() during unexpected cases At unpin_extent_range() we trigger a WARN_ON() when we don't find an extent map or we find one with a start offset not matching the start offset of the target range. This however isn't very useful for debugging because: 1) We don't know which condition was triggered, as they are both in the same WARN_ON() call; 2) We don't know which inode was affected, from which root, for which range, what's the start offset of the extent map, and so on. So trigger a separate warning for each case and log a message for each case providing information about the inode, its root, the target range, the generation and the start offset of the extent map we found. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 22 ++++++++++++++++------ fs/btrfs/extent_map.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a3d69c943eec..48230a1179b0 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -280,7 +280,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) /* * Unpin an extent from the cache. * - * @tree: tree to unpin the extent in + * @inode: the inode from which we are unpinning an extent range * @start: logical offset in the file * @len: length of the extent * @gen: generation that this extent has been modified in @@ -289,9 +289,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). */ -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, - u64 gen) +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; bool prealloc = false; @@ -299,10 +300,19 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(!em || em->start != start); - - if (!em) + if (WARN_ON(!em)) { + btrfs_warn(fs_info, +"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + start, len, gen); goto out; + } + + if (WARN_ON(em->start != start)) + btrfs_warn(fs_info, +"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + em->start, start, len, gen); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5663137471fe..cd1a9115908d 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -82,7 +82,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38adab92a015..88614bb87b95 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3127,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, + unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); From 1a9fb16c60526253ecf9913b6ea48cfcdcb4c023 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:30 +0000 Subject: [PATCH 0877/1562] btrfs: avoid useless rbtree iterations when attempting to merge extent map When trying to merge an extent map that was just inserted or unpinned, we will try to merge it with any adjacent extent map that is suitable. However we will only check if our extent map is mergeable after searching for the previous and next extent maps in the rbtree, meaning that we are doing unnecessary calls to rb_prev() and rb_next() in case our extent map is not mergeable (it's compressed, in the list of modifed extents, being logged or pinned), wasting CPU time chasing rbtree pointers and pulling in unnecessary cache lines. So change the logic to check first if an extent map is mergeable before searching for the next and previous extent maps in the rbtree. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 48230a1179b0..72df548a4c86 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -187,31 +187,32 @@ static inline u64 extent_map_block_end(const struct extent_map *em) return em->block_start + em->block_len; } -/* Check to see if two extent_map structs are adjacent and safe to merge. */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) +static bool can_merge_extent_map(const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) - return 0; + if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) + return false; - /* - * don't merge compressed extents, we need to know their - * actual size - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) - return 0; + /* Don't merge compressed extents, we need to know their actual size. */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return false; - if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || - test_bit(EXTENT_FLAG_LOGGING, &next->flags)) - return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + return false; /* * We don't want to merge stuff that hasn't been written to the log yet * since it may not reflect exactly what is on disk, and that would be * bad. */ - if (!list_empty(&prev->list) || !list_empty(&next->list)) - return 0; + if (!list_empty(&em->list)) + return false; + return true; +} + +/* Check to see if two extent_map structs are adjacent and safe to merge. */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ if (extent_map_end(prev) == next->start && prev->flags == next->flags && ((next->block_start == EXTENT_MAP_HOLE && @@ -241,11 +242,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (refcount_read(&em->refs) > 2) return; + if (!can_merge_extent_map(em)) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { + if (rb && can_merge_extent_map(merge) && mergable_maps(merge, em)) { em->start = merge->start; em->orig_start = merge->orig_start; em->len += merge->len; @@ -265,7 +269,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb = rb_next(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { + if (rb && can_merge_extent_map(merge) && mergable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); From b144cc0415e76b29bde86a969a0e1e8b4c8dbce2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:31 +0000 Subject: [PATCH 0878/1562] btrfs: make extent_map_end() argument const The extent map pointer argument for extent_map_end() can be const as we are not modifyng anything in the extent map. So make it const, as it will allow further changes to callers that have a const extent map pointer. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index cd1a9115908d..44dc0cb310ea 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -64,7 +64,7 @@ static inline int extent_map_in_tree(const struct extent_map *em) return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_end(struct extent_map *em) +static inline u64 extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; From 27f0d9c98d1554a3c0021116aef1a250088d35a0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:32 +0000 Subject: [PATCH 0879/1562] btrfs: refactor mergable_maps() for more readability At mergable_maps() instead of having a single if statement with many ORed and ANDed conditions, refactor it with multiple if statements that check a single condition and return immediately once a requirement fails. This makes it easier to read. Also change the return type from int to bool, make the arguments const and rename the function from mergable_maps() to mergeable_maps(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 72df548a4c86..0d1167f454d2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -211,19 +211,19 @@ static bool can_merge_extent_map(const struct extent_map *em) } /* Check to see if two extent_map structs are adjacent and safe to merge. */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) +static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) { - if (extent_map_end(prev) == next->start && - prev->flags == next->flags && - ((next->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (next->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && - next->block_start == extent_map_block_end(prev)))) { - return 1; - } - return 0; + if (extent_map_end(prev) != next->start) + return false; + + if (prev->flags != next->flags) + return false; + + if (next->block_start < EXTENT_MAP_LAST_BYTE - 1) + return next->block_start == extent_map_block_end(prev); + + /* HOLES and INLINE extents. */ + return next->block_start == prev->block_start; } static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) @@ -249,7 +249,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb = rb_prev(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && can_merge_extent_map(merge) && mergable_maps(merge, em)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->orig_start = merge->orig_start; em->len += merge->len; @@ -269,7 +269,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb = rb_next(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && can_merge_extent_map(merge) && mergable_maps(em, merge)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); From f86f7a75e2fb5fd7d31d00eab8a392f97ba42ce9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Dec 2023 16:20:33 +0000 Subject: [PATCH 0880/1562] btrfs: use the flags of an extent map to identify the compression type Currently, in struct extent_map, we use an unsigned int (32 bits) to identify the compression type of an extent and an unsigned long (64 bits on a 64 bits platform, 32 bits otherwise) for flags. We are only using 6 different flags, so an unsigned long is excessive and we can use flags to identify the compression type instead of using a dedicated 32 bits field. We can easily have tens or hundreds of thousands (or more) of extent maps on busy and large filesystems, specially with compression enabled or many or large files with tons of small extents. So it's convenient to have the extent_map structure as small as possible in order to use less memory. So remove the compression type field from struct extent_map, use flags to identify the compression type and shorten the flags field from an unsigned long to a u32. This saves 8 bytes (on 64 bits platforms) and reduces the size of the structure from 136 bytes down to 128 bytes, using now only two cache lines, and increases the number of extent maps we can have per 4K page from 30 to 32. By using a u32 for the flags instead of an unsigned long, we no longer use test_bit(), set_bit() and clear_bit(), but that level of atomicity is not needed as most flags are never cleared once set (before adding an extent map to the tree), and the ones that can be cleared or set after an extent map is added to the tree, are always performed while holding the write lock on the extent map tree, while the reader holds a lock on the tree or tests for a flag that never changes once the extent map is in the tree (such as compression flags). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 +-- fs/btrfs/defrag.c | 8 ++--- fs/btrfs/extent_io.c | 13 ++++--- fs/btrfs/extent_map.c | 51 ++++++++++++-------------- fs/btrfs/extent_map.h | 58 +++++++++++++++++++++++++----- fs/btrfs/file-item.c | 9 ++--- fs/btrfs/file.c | 10 +++--- fs/btrfs/inode.c | 33 ++++++++--------- fs/btrfs/relocation.c | 2 +- fs/btrfs/tests/extent-map-tests.c | 4 +-- fs/btrfs/tests/inode-tests.c | 60 +++++++++++++++---------------- fs/btrfs/tree-log.c | 16 +++++---- include/trace/events/btrfs.h | 21 ++++++----- 13 files changed, 158 insertions(+), 131 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 05595d113ff8..2d9974c283c6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -584,7 +584,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) goto out; } - ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); + ASSERT(extent_map_is_compressed(em)); compressed_len = em->block_len; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, @@ -596,7 +596,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = em->compress_type; + cb->compress_type = extent_map_compression(em); cb->orig_bbio = bbio; free_extent_map(em); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 9bcb60c68c58..a9a068af8d6e 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * this em, as either we don't care about the generation, or the * merged extent map will be rejected anyway. */ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + if (em && (em->flags & EXTENT_FLAG_MERGED) && newer_than && em->generation >= newer_than) { free_extent_map(em); em = NULL; @@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + if (next->flags & EXTENT_FLAG_PREALLOC) goto out; /* * If the next extent is at its max capacity, defragging current extent @@ -998,7 +998,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, /* Skip holes and preallocated extents. */ if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + (em->flags & EXTENT_FLAG_PREALLOC)) goto next; /* Skip older extent */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7f7ecee9e048..bf13aebe2384 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1032,8 +1032,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - compress_type = em->compress_type; + compress_type = extent_map_compression(em); iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); @@ -1042,7 +1041,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, else disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; /* @@ -1079,7 +1078,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent). */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + if (compress_type != BTRFS_COMPRESS_NONE && prev_em_start && *prev_em_start != (u64)-1 && *prev_em_start != em->start) force_bio_submit = true; @@ -1358,7 +1357,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, block_start = em->block_start; disk_bytenr = em->block_start + extent_offset; - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); @@ -2360,7 +2359,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) write_unlock(&map->lock); break; } - if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&map->lock); free_extent_map(em); @@ -2377,7 +2376,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * extra reference on the em. */ if (list_empty(&em->list) || - test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + (em->flags & EXTENT_FLAG_LOGGING)) goto remove_em; /* * If it's in the list of modified extents, remove it diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0d1167f454d2..b61099bf97a8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->compress_type = BTRFS_COMPRESS_NONE; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -189,14 +188,14 @@ static inline u64 extent_map_block_end(const struct extent_map *em) static bool can_merge_extent_map(const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) + if (em->flags & EXTENT_FLAG_PINNED) return false; /* Don't merge compressed extents, we need to know their actual size. */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (extent_map_is_compressed(em)) return false; - if (test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + if (em->flags & EXTENT_FLAG_LOGGING) return false; /* @@ -258,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -276,7 +275,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); } } @@ -319,13 +318,13 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->start, start, len, gen); em->generation = gen; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + if (em->flags & EXTENT_FLAG_FILLING) { prealloc = true; - clear_bit(EXTENT_FLAG_FILLING, &em->flags); + em->flags &= ~EXTENT_FLAG_FILLING; } try_merge_map(tree, em); @@ -346,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) try_merge_map(tree, em); } @@ -471,9 +470,9 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + WARN_ON(em->flags & EXTENT_FLAG_PINNED); rb_erase_cached(&em->rb_node, &tree->map); - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); } @@ -485,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree, { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(extent_map_in_tree(cur)); - if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); @@ -550,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, em->start = start; em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + !extent_map_is_compressed(em)) { em->block_start += start_diff; em->block_len = em->len; } @@ -653,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree) node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); remove_extent_mapping(tree, em); free_extent_map(em); cond_resched_rwlock_write(&tree->lock); @@ -730,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } } - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { start = em_end; goto next; } flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); /* * In case we split the extent map, we want to preserve the * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want * it on the new extent maps. */ - clear_bit(EXTENT_FLAG_LOGGING, &flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); modified = !list_empty(&em->list); /* @@ -753,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto remove_em; gen = em->generation; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + compressed = extent_map_is_compressed(em); if (em->start < start) { if (!split) { @@ -786,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; - split->compress_type = em->compress_type; replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; @@ -803,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->len = em_end - end; split->block_start = em->block_start; split->flags = flags; - split->compress_type = em->compress_type; split->generation = gen; if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -969,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, } ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(em->flags & EXTENT_FLAG_PINNED); + ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; @@ -987,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_pre->orig_block_len = split_pre->block_len; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; - split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; replace_extent_mapping(em_tree, em, split_pre, 1); @@ -1006,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_mid->orig_block_len = split_mid->block_len; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; - split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; add_extent_mapping(em_tree, split_mid, 1); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 44dc0cb310ea..e380fc08bbe4 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -5,6 +5,7 @@ #include #include +#include "compression.h" #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -13,18 +14,24 @@ /* bits for the extent_map::flags field */ enum { /* this entry not yet on disk, don't free it */ - EXTENT_FLAG_PINNED, - EXTENT_FLAG_COMPRESSED, + ENUM_BIT(EXTENT_FLAG_PINNED), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB), + ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD), /* pre-allocated extent */ - EXTENT_FLAG_PREALLOC, + ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ - EXTENT_FLAG_LOGGING, + ENUM_BIT(EXTENT_FLAG_LOGGING), /* Filling in a preallocated extent */ - EXTENT_FLAG_FILLING, + ENUM_BIT(EXTENT_FLAG_FILLING), /* This em is merged from two or more physically adjacent ems */ - EXTENT_FLAG_MERGED, + ENUM_BIT(EXTENT_FLAG_MERGED), }; +/* + * Keep this structure as compact as possible, as we can have really large + * amounts of allocated extent maps at any time. + */ struct extent_map { struct rb_node rb_node; @@ -45,9 +52,8 @@ struct extent_map { * For non-merged extents, it's from btrfs_file_extent_item::generation. */ u64 generation; - unsigned long flags; + u32 flags; refcount_t refs; - unsigned int compress_type; struct list_head list; }; @@ -59,6 +65,42 @@ struct extent_map_tree { struct btrfs_inode; +static inline void extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) +{ + if (type == BTRFS_COMPRESS_ZLIB) + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + else if (type == BTRFS_COMPRESS_LZO) + em->flags |= EXTENT_FLAG_COMPRESS_LZO; + else if (type == BTRFS_COMPRESS_ZSTD) + em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; +} + +static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) + return BTRFS_COMPRESS_ZLIB; + + if (em->flags & EXTENT_FLAG_COMPRESS_LZO) + return BTRFS_COMPRESS_LZO; + + if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD) + return BTRFS_COMPRESS_ZSTD; + + return BTRFS_COMPRESS_NONE; +} + +/* + * More efficient way to determine if extent is compressed, instead of using + * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. + */ +static inline bool extent_map_is_compressed(const struct extent_map *em) +{ + return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | + EXTENT_FLAG_COMPRESS_LZO | + EXTENT_FLAG_COMPRESS_ZSTD)) != 0; +} + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1f0110f48353..81ac1d474bf1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, return; } if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; + extent_map_set_compression(em, compress_type); em->block_start = bytenr; em->block_len = em->orig_block_len; } else { @@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->block_start = bytenr; em->block_len = em->len; if (type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { em->block_start = EXTENT_MAP_INLINE; @@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - em->compress_type = compress_type; - if (compress_type != BTRFS_COMPRESS_NONE) - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7a71720aaed2..98ef859f8938 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2150,7 +2150,6 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -2839,7 +2838,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, if (em->block_start == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; - else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; @@ -2879,8 +2878,7 @@ static int btrfs_zero_range(struct inode *inode, * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ - if (em->start <= alloc_start && - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { @@ -2915,7 +2913,7 @@ static int btrfs_zero_range(struct inode *inode, goto out; } - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->flags & EXTENT_FLAG_PREALLOC) { free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); @@ -3136,7 +3134,7 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88614bb87b95..a1f6e8d3b546 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4898,7 +4898,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; err = maybe_insert_hole(inode, cur_offset, hole_size); @@ -4926,7 +4926,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -7274,13 +7273,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - if (type == BTRFS_ORDERED_PREALLOC) { - set_bit(EXTENT_FLAG_FILLING, &em->flags); - } else if (type == BTRFS_ORDERED_COMPRESSED) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } + em->flags |= EXTENT_FLAG_PINNED; + if (type == BTRFS_ORDERED_PREALLOC) + em->flags |= EXTENT_FLAG_FILLING; + else if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, compress_type); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7320,10 +7317,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * just use the extent. * */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if ((em->flags & EXTENT_FLAG_PREALLOC) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; @@ -7558,7 +7555,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + if (extent_map_is_compressed(em) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); /* @@ -7654,7 +7651,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * that, since we have locked only the parts we are performing I/O in. */ if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { @@ -9669,7 +9666,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); @@ -10150,12 +10147,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + (em->flags & EXTENT_FLAG_PREALLOC)) { disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + } else if (extent_map_is_compressed(em)) { disk_bytenr = em->block_start; /* * Bail if the buffer isn't large enough to return the whole @@ -10170,7 +10167,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, - em->compress_type); + extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; @@ -10718,7 +10715,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + if (extent_map_is_compressed(em)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5d9e5f74a52..78c2770eb52f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 59bbf714225c..253cce7ffecf 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -480,7 +480,7 @@ static int add_compressed_extent(struct btrfs_fs_info *fs_info, em->len = len; em->block_start = block_start; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); @@ -763,7 +763,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info) em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 492d69d2fa73..9957de9f7806 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } -static unsigned long prealloc_only = 0; -static unsigned long compressed_only = 0; -static unsigned long vacancy_only = 0; +static u32 prealloc_only = 0; +static u32 compressed_only = 0; +static u32 vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } /* @@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } disk_bytenr = em->block_start; @@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", vacancy_only, em->flags); goto out; } @@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("wrong flags, wanted %lu, have %lu", vacancy_only, + test_err("wrong flags, wanted %u, have %u", vacancy_only, em->flags); goto out; } @@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, wanted 0 got %lu", + test_err("unexpected flags set, wanted 0 got %u", em->flags); goto out; } @@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) test_msg("running inode tests"); - set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB; + prealloc_only |= EXTENT_FLAG_PREALLOC; ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bee065851185..331fc7429952 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4519,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, int ret = 0; if (inode->flags & BTRFS_INODE_NODATASUM || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + (em->flags & EXTENT_FLAG_PREALLOC) || em->block_start == EXTENT_MAP_HOLE) return 0; @@ -4582,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (em->compress_type) { + if (extent_map_is_compressed(em)) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { @@ -4622,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; struct btrfs_key key; + enum btrfs_compression_type compress_type; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; btrfs_set_stack_file_extent_generation(&fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); else btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { + compress_type = extent_map_compression(em); + if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -4645,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_offset(&fi, extent_offset); btrfs_set_stack_file_extent_num_bytes(&fi, em->len); btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); - btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + btrfs_set_stack_file_extent_compression(&fi, compress_type); ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) @@ -4858,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, continue; /* We log prealloc extents beyond eof later. */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + if ((em->flags & EXTENT_FLAG_PREALLOC) && em->start >= i_size_read(&inode->vfs_inode)) continue; /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags |= EXTENT_FLAG_LOGGING; list_add_tail(&em->list, &extents); num++; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 31da1456f953..90b0222390e5 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -272,11 +272,13 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, #define show_map_flags(flag) \ __print_flags(flag, "|", \ - { (1 << EXTENT_FLAG_PINNED), "PINNED" },\ - { (1 << EXTENT_FLAG_COMPRESSED), "COMPRESSED" },\ - { (1 << EXTENT_FLAG_PREALLOC), "PREALLOC" },\ - { (1 << EXTENT_FLAG_LOGGING), "LOGGING" },\ - { (1 << EXTENT_FLAG_FILLING), "FILLING" }) + { EXTENT_FLAG_PINNED, "PINNED" },\ + { EXTENT_FLAG_COMPRESS_ZLIB, "COMPRESS_ZLIB" },\ + { EXTENT_FLAG_COMPRESS_LZO, "COMPRESS_LZO" },\ + { EXTENT_FLAG_COMPRESS_ZSTD, "COMPRESS_ZSTD" },\ + { EXTENT_FLAG_PREALLOC, "PREALLOC" },\ + { EXTENT_FLAG_LOGGING, "LOGGING" },\ + { EXTENT_FLAG_FILLING, "FILLING" }) TRACE_EVENT_CONDITION(btrfs_get_extent, @@ -295,9 +297,8 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __field( u64, orig_start ) __field( u64, block_start ) __field( u64, block_len ) - __field( unsigned long, flags ) + __field( u32, flags ) __field( int, refs ) - __field( unsigned int, compress_type ) ), TP_fast_assign_btrfs(root->fs_info, @@ -310,13 +311,11 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->block_len = map->block_len; __entry->flags = map->flags; __entry->refs = refcount_read(&map->refs); - __entry->compress_type = map->compress_type; ), TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu " "orig_start=%llu block_start=%llu(%s) " - "block_len=%llu flags=%s refs=%u " - "compress_type=%u", + "block_len=%llu flags=%s refs=%u", show_root_type(__entry->root_objectid), __entry->ino, __entry->start, @@ -325,7 +324,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, show_map_type(__entry->block_start), __entry->block_len, show_map_flags(__entry->flags), - __entry->refs, __entry->compress_type) + __entry->refs) ); TRACE_EVENT(btrfs_handle_em_exist, From 4618d0a66b505a81cc39b17935118227a7fc24f8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 5 Dec 2023 18:21:29 +1030 Subject: [PATCH 0881/1562] btrfs: fix mismatching parameter names for btrfs_get_extent() The definition for btrfs_get_extent() is using "u64 end" as the last parameter, but in implementation we go "u64 len", and all call sites follows the implementation. This can be very confusing during development, as most developers including me, would just use the snippet returned by LSP (clangd in my case), which would only check the definition. Unfortunately this mismatch is introduced from the very beginning of btrfs. Fix it to prevent further confusion. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bd629d011fdc..7f7c5a92d2b8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -491,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end); + u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, From eefaf0a1a6f10726faa4d1b7800fdf307e97ef55 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 5 Dec 2023 19:26:39 +0100 Subject: [PATCH 0882/1562] btrfs: fix typos found by codespell Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/block-group.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/lru_cache.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/scrub.c | 8 ++++---- fs/btrfs/tree-checker.h | 2 +- fs/btrfs/volumes.h | 2 +- fs/btrfs/zoned.h | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 4f3b693a16b1..67a885d3f9a8 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -626,7 +626,7 @@ static bool should_async_write(struct btrfs_bio *bbio) /* * Submit bio to an async queue. * - * Return true if the work has been succesfuly submitted, else false. + * Return true if the work has been successfully submitted, else false. */ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4365f7b6b94d..a9be9ac99222 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2882,7 +2882,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, goto unlock_out; /* - * Skip chunk alloction if the bg is SYSTEM, this is to avoid system + * Skip chunk allocation if the bg is SYSTEM, this is to avoid system * chunk allocation storm to exhaust the system chunk array. Otherwise * we still want to try our best to mark the block group read-only. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf13aebe2384..921e9e17fc65 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4041,7 +4041,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, if (check_eb_range(eb, start, len)) { /* * Invalid range hit, reset the memory, so callers won't get - * some random garbage for their uninitialzed memory. + * some random garbage for their uninitialized memory. */ memset(dstv, 0, len); return; diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c index 0fe0ae54ac67..fd88af17d8d9 100644 --- a/fs/btrfs/lru_cache.c +++ b/fs/btrfs/lru_cache.c @@ -9,7 +9,7 @@ * * @cache: The cache. * @max_size: Maximum size (number of entries) for the cache. - * Use 0 for unlimited size, it's the user's responsability to + * Use 0 for unlimited size, it's the user's responsibility to * trim the cache in that case. */ void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e46774e8f49f..63b426cc7798 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, * * Must be called with qgroup_lock held and @prealloc preallocated. * - * The control on the lifespan of @prealloc would be transfered to this + * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 061d54148568..c0269e126f43 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -43,7 +43,7 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This detemines how many stripes would be submitted in one go, + * This determines how many stripes would be submitted in one go, * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ #define SCRUB_STRIPES_PER_GROUP 8 @@ -709,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) /* Metadata, verify the full tree block. */ if (sector->is_metadata) { /* - * Check if the tree block crosses the stripe boudary. If + * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a * warning. * @@ -883,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* * Init needed infos for error reporting. * - * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { @@ -1812,7 +1812,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) if (sctx->is_dev_replace) { /* * For dev-replace, if we know there is something wrong with - * metadata, we should immedately abort. + * metadata, we should immediately abort. */ for (int i = 0; i < nr_stripes; i++) { if (stripe_has_metadata_error(&sctx->stripes[i])) { diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3c2a02a72f64..14b9fbe82da4 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -22,7 +22,7 @@ struct btrfs_tree_parent_check { /* * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. + * should only be utilized for backref walk related code. */ u64 transid; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6c6faed2468a..53f87f398da7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -613,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * Do the type safe converstion from stripe_nr to offset inside the chunk. + * Do the type safe conversion from stripe_nr to offset inside the chunk. * * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger * than 4G. This does the proper type cast to avoid overflow. diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 74e660eec20e..f24a5ffb7807 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -319,7 +319,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } - /* Do not allow Host Manged zoned device */ + /* Do not allow Host Managed zoned device. */ return bdev_zoned_model(bdev) != BLK_ZONED_HM; } From 6140ba8a0a1460986ee98b4062df7d4876b88295 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 6 Dec 2023 15:16:03 +0100 Subject: [PATCH 0883/1562] btrfs: switch btrfs_root::delayed_nodes_tree to xarray from radix-tree The radix-tree has been superseded by the xarray (https://lwn.net/Articles/745073), this patch converts the btrfs_root::delayed_nodes, the APIs are used in a simple way. First idea is to do xa_insert() but this would require GFP_ATOMIC allocation which we want to avoid if possible. The preload mechanism of radix-tree can be emulated within the xarray API. - xa_reserve() with GFP_NOFS outside of the lock, the reserved entry is inserted atomically at most once - xa_store() under a lock, in case something races in we can detect that and xa_load() returns a valid pointer All uses of xa_load() must check for a valid pointer in case they manage to get between the xa_reserve() and xa_store(), this is handled in btrfs_get_delayed_node(). Otherwise the functionality is equivalent, xarray implements the radix-tree and there should be no performance difference. The patch continues the efforts started in 253bf57555e451 ("btrfs: turn delayed_nodes_tree into an XArray") and fixes the problems with locking and GFP flags 088aea3b97e0ae ("Revert "btrfs: turn delayed_nodes_tree into an XArray""). Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++-- fs/btrfs/delayed-inode.c | 64 ++++++++++++++++++++++------------------ fs/btrfs/disk-io.c | 3 +- fs/btrfs/inode.c | 2 +- 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54fd4eb92745..70e828d33177 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -227,10 +227,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by @inode_lock. */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 91159dd7355b..08102883f560 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); int ret; + void *ptr; again: node = btrfs_get_delayed_node(btrfs_inode); @@ -131,26 +132,30 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ + /* Cached in the inode and can be accessed. */ refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { + /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ + ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); + if (ret == -ENOMEM) { kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { + ptr = xa_load(&root->delayed_nodes, ino); + if (ptr) { + /* Somebody inserted it, go back and read it. */ spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); + node = NULL; goto again; } + ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); + ASSERT(xa_err(ptr) != -EINVAL); + ASSERT(xa_err(ptr) != -ENOMEM); + ASSERT(ptr == NULL); btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); - radix_tree_preload_end(); return node; } @@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -2038,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { + struct btrfs_delayed_node *node; + int count; + spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { + if (xa_empty(&root->delayed_nodes)) { spin_unlock(&root->inode_lock); - break; + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + count = 0; + xa_for_each_start(&root->delayed_nodes, index, node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&node->refs)) { + delayed_nodes[count] = node; + count++; + } + if (count >= ARRAY_SIZE(delayed_nodes)) + break; } spin_unlock(&root->inode_lock); + index++; - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 00c9181fd356..1b6afff66c32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -655,7 +655,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + /* GFP flags are compatible with XA_FLAGS_*. */ + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); btrfs_init_root_block_rsv(root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a1f6e8d3b546..17cfba9ee273 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3805,7 +3805,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, From 2b0122aaa800b021e36027d7f29e206f87c761d6 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 8 Dec 2023 11:41:56 +1100 Subject: [PATCH 0884/1562] btrfs: sysfs: validate scrub_speed_max value The value set as scrub_speed_max accepts size with suffixes (k/m/g/t/p/e) but we should still validate it for trailing characters, similar to what we do with chunk_size_store. CC: stable@vger.kernel.org # 5.15+ Signed-off-by: David Disseldorp Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e6b51fb3ddc1..84c05246ffd8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, unsigned long long limit; limit = memparse(buf, &endptr); + /* There could be trailing '\n', also catch any typos after the value. */ + endptr = skip_spaces(endptr); + if (*endptr != 0) + return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } From 09e6cef19c9fc0e10547135476865b5272aa0406 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 30 Nov 2023 09:02:08 +1030 Subject: [PATCH 0885/1562] btrfs: refactor alloc_extent_buffer() to allocate-then-attach method Currently alloc_extent_buffer() utilizes find_or_create_page() to allocate one page a time for an extent buffer. This method has the following disadvantages: - find_or_create_page() is the legacy way of allocating new pages With the new folio infrastructure, find_or_create_page() is just redirected to filemap_get_folio(). - Lacks the way to support higher order (order >= 1) folios As we can not yet let filemap give us a higher order folio. This patch would change the workflow by the following way: Old | new -----------------------------------+------------------------------------- | ret = btrfs_alloc_page_array(); for (i = 0; i < num_pages; i++) { | for (i = 0; i < num_pages; i++) { p = find_or_create_page(); | ret = filemap_add_folio(); /* Attach page private */ | /* Reuse page cache if needed */ /* Reused eb if needed */ | | /* Attach page private and | reuse eb if needed */ | } By this we split the page allocation and private attaching into two parts, allowing future updates to each part more easily, and migrate to folio interfaces (especially for possible higher order folios). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 156 ++++++++++++++++++++++++++++++----------- fs/btrfs/extent_io.h | 3 +- fs/btrfs/inode.c | 2 +- fs/btrfs/raid56.c | 6 +- fs/btrfs/scrub.c | 2 +- 6 files changed, 124 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2d9974c283c6..5678e73f1509 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -608,7 +608,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 921e9e17fc65..cecd8939e99a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -674,19 +674,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation. * * Return: 0 if all pages were able to be allocated; * -ENOMEM otherwise, the partially allocated pages would be freed and * the array slots zeroed */ -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp) { unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, + nr_pages, page_array); if (allocated == nr_pages) return 0; @@ -3219,7 +3222,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = btrfs_alloc_page_array(num_pages, new->pages); + ret = btrfs_alloc_page_array(num_pages, new->pages, 0); if (ret) { btrfs_release_extent_buffer(new); return NULL; @@ -3255,7 +3258,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; num_pages = num_extent_pages(eb); - ret = btrfs_alloc_page_array(num_pages, eb->pages); + ret = btrfs_alloc_page_array(num_pages, eb->pages, 0); if (ret) goto err; @@ -3475,16 +3478,75 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return 0; } + +/* + * Return 0 if eb->pages[i] is attached to btree inode successfully. + * Return >0 if there is already annother extent buffer for the range, + * and @found_eb_ret would be updated. + */ +static int attach_eb_page_to_filemap(struct extent_buffer *eb, int i, + struct extent_buffer **found_eb_ret) +{ + + struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + const unsigned long index = eb->start >> PAGE_SHIFT; + struct folio *existing_folio; + int ret; + + ASSERT(found_eb_ret); + + /* Caller should ensure the page exists. */ + ASSERT(eb->pages[i]); + +retry: + ret = filemap_add_folio(mapping, page_folio(eb->pages[i]), index + i, + GFP_NOFS | __GFP_NOFAIL); + if (!ret) + return 0; + + existing_folio = filemap_lock_folio(mapping, index + i); + /* The page cache only exists for a very short time, just retry. */ + if (IS_ERR(existing_folio)) + goto retry; + + /* For now, we should only have single-page folios for btree inode. */ + ASSERT(folio_nr_pages(existing_folio) == 1); + + if (fs_info->nodesize < PAGE_SIZE) { + /* + * We're going to reuse the existing page, can drop our page + * and subpage structure now. + */ + __free_page(eb->pages[i]); + eb->pages[i] = folio_page(existing_folio, 0); + } else { + struct extent_buffer *existing_eb; + + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* The extent buffer still exists, we can use it directly. */ + *found_eb_ret = existing_eb; + folio_unlock(existing_folio); + folio_put(existing_folio); + return 1; + } + /* The extent buffer no longer exists, we can reuse the folio. */ + __free_page(eb->pages[i]); + eb->pages[i] = folio_page(existing_folio, 0); + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; int num_pages; - int i; - unsigned long index = start >> PAGE_SHIFT; + int attached = 0; struct extent_buffer *eb; - struct extent_buffer *exists = NULL; - struct page *p; + struct extent_buffer *existing_eb = NULL; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; @@ -3535,29 +3597,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { - exists = ERR_CAST(prealloc); - goto free_eb; + ret = PTR_ERR(prealloc); + goto out; } } - for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) { - exists = ERR_PTR(-ENOMEM); - btrfs_free_subpage(prealloc); - goto free_eb; - } + /* Allocate all pages first. */ + ret = btrfs_alloc_page_array(num_pages, eb->pages, __GFP_NOFAIL); + if (ret < 0) { + btrfs_free_subpage(prealloc); + goto out; + } - spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(fs_info, p); - if (exists) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - btrfs_free_subpage(prealloc); - goto free_eb; + /* Attach all pages to the filemap. */ + for (int i = 0; i < num_pages; i++) { + struct page *p; + + ret = attach_eb_page_to_filemap(eb, i, &existing_eb); + if (ret > 0) { + ASSERT(existing_eb); + goto out; } + attached++; + + /* + * Only after attach_eb_page_to_filemap(), eb->pages[] is + * reliable, as we may choose to reuse the existing page cache + * and free the allocated page. + */ + p = eb->pages[i]; + spin_lock(&mapping->private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_page(eb, p, prealloc); ASSERT(!ret); @@ -3574,7 +3643,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, spin_unlock(&mapping->private_lock); WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); - eb->pages[i] = p; /* * Check if the current page is physically contiguous with previous eb @@ -3601,10 +3669,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb->addr = page_address(eb->pages[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } + if (ret) + goto out; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -3612,9 +3678,10 @@ again: spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; + ret = 0; + existing_eb = find_extent_buffer(fs_info, start); + if (existing_eb) + goto out; else goto again; } @@ -3627,19 +3694,28 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (i = 0; i < num_pages; i++) + for (int i = 0; i < num_pages; i++) unlock_page(eb->pages[i]); return eb; -free_eb: +out: WARN_ON(!atomic_dec_and_test(&eb->refs)); - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - unlock_page(eb->pages[i]); + for (int i = 0; i < attached; i++) { + ASSERT(eb->pages[i]); + detach_extent_buffer_page(eb, eb->pages[i]); + unlock_page(eb->pages[i]); } + /* + * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, + * so it can be cleaned up without utlizing page->mapping. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); btrfs_release_extent_buffer(eb); - return exists; + if (ret < 0) + return ERR_PTR(ret); + ASSERT(existing_eb); + return existing_eb; } static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c2c6bfba63c0..c73d53c22ec5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -302,7 +302,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree, void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17cfba9ee273..4e8c82e5d7a6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10028,7 +10028,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages); + ret = btrfs_alloc_page_array(nr_pages, pages, 0); if (ret) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 90f12c0e88a1..792c8e17c31d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; /* Mapping all sectors */ @@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages); + rbio->stripe_pages + data_pages, 0); if (ret < 0) return ret; @@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c0269e126f43..a01807cbd4d4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -261,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0); if (ret < 0) goto error; From 082d5bb9b336d533b7b968f4f8712e7755a9876a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 7 Dec 2023 09:39:27 +1030 Subject: [PATCH 0886/1562] btrfs: migrate extent_buffer::pages[] to folio For now extent_buffer::pages[] are still only accepting single page pointer, thus we can migrate to folios pretty easily. As for single page, page and folio are 1:1 mapped, including their page flags. This patch would just do the conversion from struct page to struct folio, providing the first step to higher order folio in the future. This conversion is pretty simple: - extent_buffer::pages[] -> extent_buffer::folios[] - page_address(eb->pages[i]) -> folio_address(eb->pages[i]) - eb->pages[i] -> folio_page(eb->folios[i], 0) There would be more specific cleanups preparing for the incoming higher order folio support. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 20 ++--- fs/btrfs/accessors.h | 4 +- fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 19 ++--- fs/btrfs/extent_io.c | 125 ++++++++++++++++++------------- fs/btrfs/extent_io.h | 7 +- fs/btrfs/tests/extent-io-tests.c | 4 +- 7 files changed, 104 insertions(+), 77 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 206cf1612c1d..8f7cbb7154d4 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb, void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) { token->eb = eb; - token->kaddr = page_address(eb->pages[0]); + token->kaddr = folio_address(eb->folios[0]); token->offset = 0; } @@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * an offset into the extent buffer page array, cast to a specific type. This * gives us all the type checking. * - * The extent buffer pages stored in the array pages do not form a contiguous + * The extent buffer pages stored in the array folios may not form a contiguous * phyusical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. */ @@ -74,13 +74,13 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ member_offset + size <= token->offset + PAGE_SIZE) { \ return get_unaligned_le##bits(token->kaddr + oip); \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ token->offset = idx << PAGE_SHIFT; \ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ return get_unaligned_le##bits(token->kaddr + oip); \ \ memcpy(lebytes, token->kaddr + oip, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ token->offset = (idx + 1) << PAGE_SHIFT; \ memcpy(lebytes + part, token->kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ @@ -91,7 +91,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ u8 lebytes[sizeof(u##bits)]; \ @@ -101,7 +101,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ return get_unaligned_le##bits(kaddr + oip); \ \ memcpy(lebytes, kaddr + oip, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(lebytes + part, kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -125,7 +125,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ put_unaligned_le##bits(val, token->kaddr + oip); \ return; \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ token->offset = idx << PAGE_SHIFT; \ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ put_unaligned_le##bits(val, token->kaddr + oip); \ @@ -133,7 +133,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ } \ put_unaligned_le##bits(val, lebytes); \ memcpy(token->kaddr + oip, lebytes, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ token->offset = (idx + 1) << PAGE_SHIFT; \ memcpy(token->kaddr, lebytes + part, size - part); \ } \ @@ -143,7 +143,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ u8 lebytes[sizeof(u##bits)]; \ @@ -156,7 +156,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ \ put_unaligned_le##bits(val, lebytes); \ memcpy(kaddr + oip, lebytes, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(kaddr, lebytes + part, size - part); \ } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index aa0844535644..ed7aa32972ad 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - const type *p = page_address(eb->pages[0]) + \ + const type *p = folio_address(eb->folios[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 137c4eb24c28..e6c535cf3749 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -832,7 +832,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, if (oip + key_size <= PAGE_SIZE) { const unsigned long idx = get_eb_page_index(offset); - char *kaddr = page_address(eb->pages[idx]); + char *kaddr = folio_address(eb->folios[idx]); oip = get_eb_offset_in_page(eb, offset); tmp = (struct btrfs_disk_key *)(kaddr + oip); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b6afff66c32..74ccf43d47bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -89,7 +89,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) first_page_part = fs_info->nodesize; num_pages = 1; } else { - kaddr = page_address(buf->pages[0]); + kaddr = folio_address(buf->folios[0]); first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); num_pages = num_extent_pages(buf); } @@ -98,7 +98,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) first_page_part - BTRFS_CSUM_SIZE); for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { - kaddr = page_address(buf->pages[i]); + kaddr = folio_address(buf->folios[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); @@ -184,13 +184,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, return -EROFS; for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - u64 start = max_t(u64, eb->start, page_offset(p)); - u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u64 start = max_t(u64, eb->start, folio_pos(eb->folios[i])); + u64 end = min_t(u64, eb->start + eb->len, + folio_pos(eb->folios[i]) + PAGE_SIZE); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, p, offset_in_page(start), mirror_num); + start, folio_page(eb->folios[i], 0), + offset_in_page(start), mirror_num); if (ret) break; } @@ -277,8 +278,8 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, - eb->len))) + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, folio_page(eb->folios[0], 0), + eb->start, eb->len))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -387,7 +388,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, } csum_tree_block(eb, result); - header_csum = page_address(eb->pages[0]) + + header_csum = folio_address(eb->folios[0]) + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cecd8939e99a..557b9c65840e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -712,6 +712,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, return 0; } +/* + * Populate needed folios for the extent buffer. + * + * For now, the folios populated are always in order 0 (aka, single page). + */ +static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +{ + struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; + int num_pages = num_extent_pages(eb); + int ret; + + ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); + if (ret < 0) + return ret; + + for (int i = 0; i < num_pages; i++) + eb->folios[i] = page_folio(page_array[i]); + return 0; +} + static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int pg_offset) @@ -1688,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; if (fs_info->nodesize < PAGE_SIZE) { - struct page *p = eb->pages[0]; + struct page *p = folio_page(eb->folios[0], 0); lock_page(p); btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); @@ -1702,7 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, unlock_page(p); } else { for (int i = 0; i < num_extent_pages(eb); i++) { - struct page *p = eb->pages[i]; + struct page *p = folio_page(eb->folios[i], 0); lock_page(p); clear_page_dirty_for_io(p); @@ -3160,7 +3180,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + struct page *page = folio_page(eb->folios[i], 0); if (!page) continue; @@ -3222,7 +3242,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = btrfs_alloc_page_array(num_pages, new->pages, 0); + ret = alloc_eb_folio_array(new, 0); if (ret) { btrfs_release_extent_buffer(new); return NULL; @@ -3230,7 +3250,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) for (i = 0; i < num_pages; i++) { int ret; - struct page *p = new->pages[i]; + struct page *p = folio_page(new->folios[i], 0); ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { @@ -3258,12 +3278,12 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; num_pages = num_extent_pages(eb); - ret = btrfs_alloc_page_array(num_pages, eb->pages, 0); + ret = alloc_eb_folio_array(eb, 0); if (ret) goto err; for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; + struct page *p = folio_page(eb->folios[i], 0); ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) @@ -3277,9 +3297,9 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) { - detach_extent_buffer_page(eb, eb->pages[i]); - __free_page(eb->pages[i]); + if (eb->folios[i]) { + detach_extent_buffer_page(eb, folio_page(eb->folios[i], 0)); + __free_page(folio_page(eb->folios[i], 0)); } } __free_extent_buffer(eb); @@ -3337,7 +3357,7 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; + struct page *p = folio_page(eb->folios[i], 0); if (p != accessed) mark_page_accessed(p); @@ -3480,8 +3500,8 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) /* - * Return 0 if eb->pages[i] is attached to btree inode successfully. - * Return >0 if there is already annother extent buffer for the range, + * Return 0 if eb->folios[i] is attached to btree inode successfully. + * Return >0 if there is already another extent buffer for the range, * and @found_eb_ret would be updated. */ static int attach_eb_page_to_filemap(struct extent_buffer *eb, int i, @@ -3496,11 +3516,11 @@ static int attach_eb_page_to_filemap(struct extent_buffer *eb, int i, ASSERT(found_eb_ret); - /* Caller should ensure the page exists. */ - ASSERT(eb->pages[i]); + /* Caller should ensure the folio exists. */ + ASSERT(eb->folios[i]); retry: - ret = filemap_add_folio(mapping, page_folio(eb->pages[i]), index + i, + ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) return 0; @@ -3518,8 +3538,8 @@ retry: * We're going to reuse the existing page, can drop our page * and subpage structure now. */ - __free_page(eb->pages[i]); - eb->pages[i] = folio_page(existing_folio, 0); + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; } else { struct extent_buffer *existing_eb; @@ -3533,8 +3553,8 @@ retry: return 1; } /* The extent buffer no longer exists, we can reuse the folio. */ - __free_page(eb->pages[i]); - eb->pages[i] = folio_page(existing_folio, 0); + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; } return 0; } @@ -3603,7 +3623,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } /* Allocate all pages first. */ - ret = btrfs_alloc_page_array(num_pages, eb->pages, __GFP_NOFAIL); + ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); if (ret < 0) { btrfs_free_subpage(prealloc); goto out; @@ -3621,11 +3641,11 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, attached++; /* - * Only after attach_eb_page_to_filemap(), eb->pages[] is + * Only after attach_eb_page_to_filemap(), eb->folios[] is * reliable, as we may choose to reuse the existing page cache * and free the allocated page. */ - p = eb->pages[i]; + p = folio_page(eb->folios[i], 0); spin_lock(&mapping->private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_page(eb, p, prealloc); @@ -3648,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * Check if the current page is physically contiguous with previous eb * page. */ - if (i && eb->pages[i - 1] + 1 != p) + if (i && folio_page(eb->folios[i - 1], 0) + 1 != p) page_contig = false; if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) @@ -3666,7 +3686,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); /* All pages are physically contiguous, can skip cross page handling. */ if (page_contig) - eb->addr = page_address(eb->pages[0]) + offset_in_page(eb->start); + eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); if (ret) @@ -3695,15 +3715,15 @@ again: * live buffer and won't free them prematurely. */ for (int i = 0; i < num_pages; i++) - unlock_page(eb->pages[i]); + unlock_page(folio_page(eb->folios[i], 0)); return eb; out: WARN_ON(!atomic_dec_and_test(&eb->refs)); for (int i = 0; i < attached; i++) { - ASSERT(eb->pages[i]); - detach_extent_buffer_page(eb, eb->pages[i]); - unlock_page(eb->pages[i]); + ASSERT(eb->folios[i]); + detach_extent_buffer_page(eb, folio_page(eb->folios[i], 0)); + unlock_page(folio_page(eb->folios[i], 0)); } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, @@ -3822,7 +3842,7 @@ static void btree_clear_page_dirty(struct page *page) static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; + struct page *page = folio_page(eb->folios[0], 0); bool last; /* btree_clear_page_dirty() needs page locked */ @@ -3874,7 +3894,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); if (!PageDirty(page)) continue; lock_page(page); @@ -3913,19 +3933,19 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. */ if (subpage) - lock_page(eb->pages[0]); + lock_page(folio_page(eb->folios[0], 0)); for (i = 0; i < num_pages; i++) - btrfs_page_set_dirty(eb->fs_info, eb->pages[i], + btrfs_page_set_dirty(eb->fs_info, folio_page(eb->folios[i], 0), eb->start, eb->len); if (subpage) - unlock_page(eb->pages[0]); + unlock_page(folio_page(eb->folios[0], 0)); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) - ASSERT(PageDirty(eb->pages[i])); + ASSERT(PageDirty(folio_page(eb->folios[i], 0))); #endif } @@ -3939,7 +3959,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); if (!page) continue; @@ -3965,7 +3985,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); /* * This is special handling for metadata subpage, as regular @@ -4056,11 +4076,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); if (eb->fs_info->nodesize < PAGE_SIZE) { - __bio_add_page(&bbio->bio, eb->pages[0], eb->len, - eb->start - page_offset(eb->pages[0])); + __bio_add_page(&bbio->bio, folio_page(eb->folios[0], 0), eb->len, + eb->start - folio_pos(eb->folios[0])); } else { for (i = 0; i < num_pages; i++) - __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + __bio_add_page(&bbio->bio, folio_page(eb->folios[i], 0), + PAGE_SIZE, 0); } btrfs_submit_bio(bbio, mirror_num); @@ -4131,7 +4152,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, offset = get_eb_offset_in_page(eb, start); while (len > 0) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); @@ -4168,7 +4189,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, offset = get_eb_offset_in_page(eb, start); while (len > 0) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); @@ -4206,7 +4227,7 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, offset = get_eb_offset_in_page(eb, start); while (len > 0) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); cur = min(len, (PAGE_SIZE - offset)); @@ -4281,7 +4302,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, offset = get_eb_offset_in_page(eb, start); while (len > 0) { - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); if (check_uptodate) assert_eb_page_uptodate(eb, page); @@ -4319,7 +4340,7 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long index = get_eb_page_index(cur); unsigned int offset = get_eb_offset_in_page(eb, cur); unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); - struct page *page = eb->pages[index]; + struct page *page = folio_page(eb->folios[index], 0); assert_eb_page_uptodate(eb, page); memset_page(page, offset, c, cur_len); @@ -4347,7 +4368,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, unsigned long index = get_eb_page_index(cur); unsigned long offset = get_eb_offset_in_page(src, cur); unsigned long cur_len = min(src->len, PAGE_SIZE - offset); - void *addr = page_address(src->pages[index]) + offset; + void *addr = folio_address(src->folios[index]) + offset; write_extent_buffer(dst, addr, cur, cur_len); @@ -4376,7 +4397,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, offset = get_eb_offset_in_page(dst, dst_offset); while (len > 0) { - page = dst->pages[i]; + page = folio_page(dst->folios[i], 0); assert_eb_page_uptodate(dst, page); cur = min(len, (unsigned long)(PAGE_SIZE - offset)); @@ -4439,7 +4460,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, size_t offset; eb_bitmap_offset(eb, start, nr, &i, &offset); - page = eb->pages[i]; + page = folio_page(eb->folios[i], 0); assert_eb_page_uptodate(eb, page); kaddr = page_address(page); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); @@ -4451,7 +4472,7 @@ static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long if (check_eb_range(eb, bytenr, 1)) return NULL; - return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); + return folio_address(eb->folios[index]) + get_eb_offset_in_page(eb, bytenr); } /* @@ -4558,7 +4579,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); unsigned long cur_len = min(src_offset + len - cur_src, PAGE_SIZE - pg_off); - void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + void *src_addr = folio_address(dst->folios[pg_index]) + pg_off; const bool use_memmove = areas_overlap(src_offset + cur_off, dst_offset + cur_off, cur_len); @@ -4605,8 +4626,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - src_addr = page_address(dst->pages[src_i]) + src_off_in_page - - cur + 1; + src_addr = folio_address(dst->folios[src_i]) + src_off_in_page - + cur + 1; use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, cur); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c73d53c22ec5..66c2e214b141 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -94,7 +94,12 @@ struct extent_buffer { struct rw_semaphore lock; - struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + /* + * Pointers to all the folios of the extent buffer. + * + * For now the folio is always order 0 (aka, a single page). + */ + struct folio *folios[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; pid_t lock_owner; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 1cc86af97dc6..25b3349595e0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < eb->len; i++) { - struct page *page = eb->pages[i >> PAGE_SHIFT]; + struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0); void *addr = page_address(page) + offset_in_page(i); if (memcmp(addr, memory + i, 1) != 0) { @@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { - void *eb_addr = page_address(eb->pages[i]); + void *eb_addr = folio_address(eb->folios[i]); if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { dump_eb_and_memory_contents(eb, memory, test_name); From 13df3775efcaf412980c45aba2c321479bfc209a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 7 Dec 2023 09:39:28 +1030 Subject: [PATCH 0887/1562] btrfs: cleanup metadata page pointer usage Although we have migrated extent_buffer::pages[] to folios[], we're still mostly using the folio_page() help to grab the page. This patch would do the following cleanups for metadata: - Introduce num_extent_folios() helper This is to replace most num_extent_pages() callers. - Use num_extent_folios() to iterate future large folios This allows us to use things like bio_add_folio()/bio_add_folio_nofail(), and only set the needed flags for the folio (aka the leading/tailing page), which reduces the loop iteration to 1 for large folios. - Change metadata related functions to use folio pointers Including their function name, involving: * attach_extent_buffer_page() * detach_extent_buffer_page() * page_range_has_eb() * btrfs_release_extent_buffer_pages() * btree_clear_page_dirty() * btrfs_page_inc_eb_refs() * btrfs_page_dec_eb_refs() - Change btrfs_is_subpage() to accept an address_space pointer This is to allow both page->mapping and folio->mapping to be utilized. As data is still using the old per-page code, and may keep so for a while. - Special corner case place holder for future order mismatches between extent buffer and inode filemap For now it's just a block of comments and a dead ASSERT(), no real handling yet. The subpage code would still go page, just because subpage and large folio are conflicting conditions, thus we don't need to bother subpage with higher order folios at all. Just folio_page(folio, 0) would be enough. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ minor styling tweaks ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 + fs/btrfs/extent_io.c | 317 +++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 14 ++ fs/btrfs/inode.c | 2 +- fs/btrfs/subpage.c | 55 ++++---- fs/btrfs/subpage.h | 8 +- 6 files changed, 223 insertions(+), 179 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 74ccf43d47bc..93702782fe76 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -97,6 +97,12 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); + /* + * Multiple single-page folios case would reach here. + * + * nodesize <= PAGE_SIZE and large folio all handled by above + * crypto_shash_update() already. + */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = folio_address(buf->folios[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 557b9c65840e..a5c2acd5c8ae 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -441,7 +441,7 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) else btrfs_page_clear_uptodate(fs_info, page, start, len); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); @@ -565,7 +565,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) struct folio *folio = page_folio(page); ASSERT(PageLocked(page)); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return; ASSERT(folio_test_private(folio)); @@ -886,11 +886,10 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, } while (size); } -static int attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page, - struct btrfs_subpage *prealloc) +static int attach_extent_buffer_folio(struct extent_buffer *eb, + struct folio *folio, + struct btrfs_subpage *prealloc) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -900,8 +899,8 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * For cloned or dummy extent buffers, their pages are not mapped and * will not race with any other ebs. */ - if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + if (folio->mapping) + lockdep_assert_held(&folio->mapping->private_lock); if (fs_info->nodesize >= PAGE_SIZE) { if (!folio_test_private(folio)) @@ -922,7 +921,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, page, + ret = btrfs_attach_subpage(fs_info, folio_page(folio, 0), BTRFS_SUBPAGE_METADATA); return ret; } @@ -939,7 +938,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) + if (btrfs_is_subpage(fs_info, page->mapping)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -957,7 +956,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) + if (btrfs_is_subpage(fs_info, page->mapping)) return btrfs_detach_subpage(fs_info, page); folio_detach_private(folio); @@ -1280,7 +1279,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (!btrfs_is_subpage(fs_info, page)) { + if (!btrfs_is_subpage(fs_info, page->mapping)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -1721,16 +1720,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_account_cgroup_owner(wbc, p, eb->len); unlock_page(p); } else { - for (int i = 0; i < num_extent_pages(eb); i++) { - struct page *p = folio_page(eb->folios[i], 0); + int num_folios = num_extent_folios(eb); - lock_page(p); - clear_page_dirty_for_io(p); - set_page_writeback(p); - __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); - wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); - wbc->nr_to_write--; - unlock_page(p); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + bool ret; + + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), + folio_size(folio)); + wbc->nr_to_write -= folio_nr_pages(folio); + folio_unlock(folio); } } btrfs_submit_bio(bbio, 0); @@ -3088,12 +3092,11 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) +static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&folio->mapping->private_lock); if (folio_test_private(folio)) { subpage = folio_get_private(folio); @@ -3109,22 +3112,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) return false; } -static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - struct folio *folio = page_folio(page); /* * For mapped eb, we're going to change the folio private, which should * be done under the private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&folio->mapping->private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->private_lock); return; } @@ -3138,13 +3140,13 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag */ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(folio_test_dirty(folio)); + BUG_ON(folio_test_writeback(folio)); /* We need to make sure we haven't be attached to a new eb. */ folio_detach_private(folio); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->private_lock); return; } @@ -3154,41 +3156,39 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, page); + btrfs_detach_subpage(fs_info, folio_page(folio, 0)); return; } - btrfs_page_dec_eb_refs(fs_info, page); + btrfs_folio_dec_eb_refs(fs_info, folio); /* * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!page_range_has_eb(fs_info, page)) - btrfs_detach_subpage(fs_info, page); + if (!folio_range_has_eb(fs_info, folio)) + btrfs_detach_subpage(fs_info, folio_page(folio, 0)); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->private_lock); } /* Release all pages attached to the extent buffer */ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) { - int i; - int num_pages; + int num_folios = num_extent_folios(eb); ASSERT(!extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = folio_page(eb->folios[i], 0); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; - if (!page) + if (!folio) continue; - detach_extent_buffer_page(eb, page); + detach_extent_buffer_folio(eb, folio); - /* One for when we allocated the page */ - put_page(page); + /* One for when we allocated the folio. */ + folio_put(folio); } } @@ -3226,9 +3226,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { - int i; struct extent_buffer *new; - int num_pages = num_extent_pages(src); + int num_folios = num_extent_folios(src); int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); @@ -3248,16 +3247,16 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } - for (i = 0; i < num_pages; i++) { + for (int i = 0; i < num_folios; i++) { + struct folio *folio = new->folios[i]; int ret; - struct page *p = folio_page(new->folios[i], 0); - ret = attach_extent_buffer_page(new, p, NULL); + ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) { btrfs_release_extent_buffer(new); return NULL; } - WARN_ON(PageDirty(p)); + WARN_ON(folio_test_dirty(folio)); } copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); @@ -3269,23 +3268,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { struct extent_buffer *eb; - int num_pages; - int i; + int num_folios = 0; int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; - num_pages = num_extent_pages(eb); ret = alloc_eb_folio_array(eb, 0); if (ret) goto err; - for (i = 0; i < num_pages; i++) { - struct page *p = folio_page(eb->folios[i], 0); - - ret = attach_extent_buffer_page(eb, p, NULL); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) goto err; } @@ -3296,10 +3292,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (i = 0; i < num_pages; i++) { + for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { - detach_extent_buffer_page(eb, folio_page(eb->folios[i], 0)); - __free_page(folio_page(eb->folios[i], 0)); + detach_extent_buffer_folio(eb, eb->folios[i]); + __folio_put(eb->folios[i]); } } __free_extent_buffer(eb); @@ -3348,20 +3344,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb, - struct page *accessed) +static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_pages, i; + int num_folios= num_extent_folios(eb); check_buffer_tree_ref(eb); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = folio_page(eb->folios[i], 0); - - if (p != accessed) - mark_page_accessed(p); - } + for (int i = 0; i < num_folios; i++) + folio_mark_accessed(eb->folios[i]); } struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, @@ -3389,7 +3379,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, spin_lock(&eb->refs_lock); spin_unlock(&eb->refs_lock); } - mark_extent_buffer_accessed(eb, NULL); + mark_extent_buffer_accessed(eb); return eb; } @@ -3503,9 +3493,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, * and @found_eb_ret would be updated. + * Return -EAGAIN if the filemap has an existing folio but with different size + * than @eb. + * The caller needs to free the existing folios and retry using the same order. */ -static int attach_eb_page_to_filemap(struct extent_buffer *eb, int i, - struct extent_buffer **found_eb_ret) +static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -3533,6 +3526,12 @@ retry: /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); + if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; + } + if (fs_info->nodesize < PAGE_SIZE) { /* * We're going to reuse the existing page, can drop our page @@ -3563,7 +3562,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; - int num_pages; + int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3605,8 +3604,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); - num_pages = num_extent_pages(eb); - /* * Preallocate folio private for subpage case, so that we won't * allocate memory with private_lock nor page lock hold. @@ -3622,6 +3619,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } } +reallocate: /* Allocate all pages first. */ ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); if (ret < 0) { @@ -3629,26 +3627,51 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, goto out; } + num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_pages; i++) { - struct page *p; + for (int i = 0; i < num_folios; i++) { + struct folio *folio; - ret = attach_eb_page_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; } + + /* + * TODO: Special handling for a corner case where the order of + * folios mismatch between the new eb and filemap. + * + * This happens when: + * + * - the new eb is using higher order folio + * + * - the filemap is still using 0-order folios for the range + * This can happen at the previous eb allocation, and we don't + * have higher order folio for the call. + * + * - the existing eb has already been freed + * + * In this case, we have to free the existing folios first, and + * re-allocate using the same order. + * Thankfully this is not going to happen yet, as we're still + * using 0-order folios. + */ + if (unlikely(ret == -EAGAIN)) { + ASSERT(0); + goto reallocate; + } attached++; /* - * Only after attach_eb_page_to_filemap(), eb->folios[] is + * Only after attach_eb_folio_to_filemap(), eb->folios[] is * reliable, as we may choose to reuse the existing page cache * and free the allocated page. */ - p = folio_page(eb->folios[i], 0); + folio = eb->folios[i]; spin_lock(&mapping->private_lock); /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_page(eb, p, prealloc); + ret = attach_extent_buffer_folio(eb, folio, prealloc); ASSERT(!ret); /* * To inform we have extra eb under allocation, so that @@ -3659,19 +3682,23 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * detach_extent_buffer_page(). * Thus needs no special handling in error path. */ - btrfs_page_inc_eb_refs(fs_info, p); + btrfs_folio_inc_eb_refs(fs_info, folio); spin_unlock(&mapping->private_lock); - WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); + WARN_ON(btrfs_page_test_dirty(fs_info, folio_page(folio, 0), + eb->start, eb->len)); /* * Check if the current page is physically contiguous with previous eb * page. + * At this stage, either we allocated a large folio, thus @i + * would only be 0, or we fall back to per-page allocation. */ - if (i && folio_page(eb->folios[i - 1], 0) + 1 != p) + if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) + if (!btrfs_page_test_uptodate(fs_info, folio_page(folio, 0), + eb->start, eb->len)) uptodate = 0; /* @@ -3714,7 +3741,7 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_pages; i++) + for (int i = 0; i < num_folios; i++) unlock_page(folio_page(eb->folios[i], 0)); return eb; @@ -3722,7 +3749,7 @@ out: WARN_ON(!atomic_dec_and_test(&eb->refs)); for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); - detach_extent_buffer_page(eb, folio_page(eb->folios[i], 0)); + detach_extent_buffer_folio(eb, eb->folios[i]); unlock_page(folio_page(eb->folios[i], 0)); } /* @@ -3827,31 +3854,31 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_page_dirty(struct page *page) +static void btree_clear_folio_dirty(struct folio *folio) { - ASSERT(PageDirty(page)); - ASSERT(PageLocked(page)); - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + ASSERT(folio_test_dirty(folio)); + ASSERT(folio_test_locked(folio)); + folio_clear_dirty_for_io(folio); + xa_lock_irq(&folio->mapping->i_pages); + if (!folio_test_dirty(folio)) + __xa_clear_mark(&folio->mapping->i_pages, + folio_index(folio), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&folio->mapping->i_pages); } static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = folio_page(eb->folios[0], 0); + struct folio *folio = eb->folios[0]; bool last; - /* btree_clear_page_dirty() needs page locked */ - lock_page(page); - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, - eb->len); + /* btree_clear_folio_dirty() needs page locked. */ + folio_lock(folio); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio_page(folio, 0), + eb->start, eb->len); if (last) - btree_clear_page_dirty(page); - unlock_page(page); + btree_clear_folio_dirty(folio); + folio_unlock(folio); WARN_ON(atomic_read(&eb->refs) == 0); } @@ -3859,9 +3886,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; + int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3891,30 +3916,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; - for (i = 0; i < num_pages; i++) { - page = folio_page(eb->folios[i], 0); - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) continue; - lock_page(page); - btree_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + btree_clear_folio_dirty(folio); + folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) { - int i; - int num_pages; + int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); @@ -3934,7 +3958,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) */ if (subpage) lock_page(folio_page(eb->folios[0], 0)); - for (i = 0; i < num_pages; i++) + for (int i = 0; i < num_folios; i++) btrfs_page_set_dirty(eb->fs_info, folio_page(eb->folios[i], 0), eb->start, eb->len); if (subpage) @@ -3944,23 +3968,21 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (i = 0; i < num_pages; i++) - ASSERT(PageDirty(folio_page(eb->folios[i], 0))); + for (int i = 0; i < num_folios; i++) + ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = folio_page(eb->folios[i], 0); - if (!page) + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + if (!folio) continue; /* @@ -3968,34 +3990,31 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - btrfs_subpage_clear_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_clear_uptodate(fs_info, folio_page(folio, 0), + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = folio_page(eb->folios[i], 0); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; /* * This is special handling for metadata subpage, as regular * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); else - btrfs_subpage_set_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_set_uptodate(fs_info, folio_page(folio, 0), + eb->start, eb->len); } } @@ -4045,8 +4064,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) { - int num_pages = num_extent_pages(eb), i; struct btrfs_bio *bbio; + bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -4076,12 +4095,18 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); if (eb->fs_info->nodesize < PAGE_SIZE) { - __bio_add_page(&bbio->bio, folio_page(eb->folios[0], 0), eb->len, - eb->start - folio_pos(eb->folios[0])); + ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, + eb->start - folio_pos(eb->folios[0])); + ASSERT(ret); } else { - for (i = 0; i < num_pages; i++) - __bio_add_page(&bbio->bio, folio_page(eb->folios[i], 0), - PAGE_SIZE, 0); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + } } btrfs_submit_bio(bbio, mirror_num); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 66c2e214b141..a5fd5cb20a3c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -243,6 +243,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb) return (eb->len >> PAGE_SHIFT) ?: 1; } +/* + * This can only be determined at runtime by checking eb::folios[0]. + * + * As we can have either one large folio covering the whole eb + * (either nodesize <= PAGE_SIZE, or high order folio), or multiple + * single-paged folios. + */ +static inline int num_extent_folios(const struct extent_buffer *eb) +{ + if (folio_order(eb->folios[0])) + return 1; + return num_extent_pages(eb); +} + static inline int extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4e8c82e5d7a6..ac01f49161ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7867,7 +7867,7 @@ static void wait_subpage_spinlock(struct page *page) struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index caf0013f2545..f11ebaf87def 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,7 +64,7 @@ * This means a slightly higher tree locking latency. */ -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) return false; @@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) * mapping. And if page->mapping->host is data inode, it's subpage. * As we have ruled our sectorsize >= PAGE_SIZE case already. */ - if (!page->mapping || !page->mapping->host || - is_data_inode(page->mapping->host)) + if (!mapping || !mapping->host || is_data_inode(mapping->host)) return true; /* @@ -129,7 +128,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(PageLocked(page)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, page) || folio_test_private(folio)) + if (!btrfs_is_subpage(fs_info, page->mapping) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -147,7 +146,7 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, page) || !folio_test_private(folio)) + if (!btrfs_is_subpage(fs_info, page->mapping) || !folio_test_private(folio)) return; subpage = folio_detach_private(folio); @@ -193,33 +192,29 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(folio_test_private(folio) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->private_lock); subpage = folio_get_private(folio); atomic_inc(&subpage->eb_refs); } -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(folio_test_private(folio) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->private_lock); subpage = folio_get_private(folio); ASSERT(atomic_read(&subpage->eb_refs)); @@ -352,7 +347,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, { struct folio *folio = page_folio(page); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page->mapping)) { lock_page(page); return 0; } @@ -369,7 +364,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page->mapping)) return unlock_page(page); btrfs_subpage_clamp_range(page, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) @@ -612,7 +607,8 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) { \ set_page_func(page); \ return; \ } \ @@ -621,7 +617,8 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) { \ clear_page_func(page); \ return; \ } \ @@ -630,14 +627,16 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ } \ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) { \ set_page_func(page); \ return; \ } \ @@ -647,7 +646,8 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) { \ clear_page_func(page); \ return; \ } \ @@ -657,7 +657,8 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, page->mapping)) \ return test_page_func(page); \ btrfs_subpage_clamp_range(page, &start, &len); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ @@ -686,7 +687,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(!PageDirty(page)); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -716,7 +717,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, ASSERT(PageLocked(page)); /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return unlock_page(page); ASSERT(folio_test_private(folio) && folio_get_private(folio)); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5cbf67ccbdeb..facd5c808e6f 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -73,7 +73,7 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, @@ -86,10 +86,8 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); From 4a565c8069b7578a79d193d277e9c760aacf3e75 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Dec 2023 17:39:38 -0500 Subject: [PATCH 0888/1562] btrfs: don't double put our subpage reference in alloc_extent_buffer This fixes as case in "btrfs: refactor alloc_extent_buffer() to allocate-then-attach method". We have been seeing panics in the CI for the subpage stuff recently, it happens on btrfs/187 but could potentially happen anywhere. In the subpage case, if we race with somebody else inserting the same extent buffer, the error case will end up calling detach_extent_buffer_page() on the page twice. This is done first in the bit for (int i = 0; i < attached; i++) detach_extent_buffer_page(eb, eb->pages[i]; and then again in btrfs_release_extent_buffer(). This works fine for !subpage because we're the only person who ever has ourselves on the private, and so when we do the initial detach_extent_buffer_page() we know we've completely removed it. However for subpage we could be using this page private elsewhere, so this results in a double put on the subpage, which can result in an early freeing. The fix here is to clear eb->pages[i] for everything we detach. Then anything still attached to the eb is freed in btrfs_release_extent_buffer(). Because of this change we must update btrfs_release_extent_buffer_pages() to not use num_extent_folios, because it assumes eb->folio[0] is set properly. Since this is only interested in freeing any pages we have on the extent buffer we can simply use INLINE_EXTENT_BUFFER_PAGES. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a5c2acd5c8ae..2c69e1f0fa10 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3175,11 +3175,9 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f /* Release all pages attached to the extent buffer */ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) { - int num_folios = num_extent_folios(eb); - ASSERT(!extent_buffer_under_io(eb)); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { struct folio *folio = eb->folios[i]; if (!folio) @@ -3747,10 +3745,28 @@ again: out: WARN_ON(!atomic_dec_and_test(&eb->refs)); + + /* + * Any attached folios need to be detached before we unlock them. This + * is because when we're inserting our new folios into the mapping, and + * then attaching our eb to that folio. If we fail to insert our folio + * we'll lookup the folio for that index, and grab that EB. We do not + * want that to grab this eb, as we're getting ready to free it. So we + * have to detach it first and then unlock it. + * + * We have to drop our reference and NULL it out here because in the + * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. + * Below when we call btrfs_release_extent_buffer() we will call + * detach_extent_buffer_folio() on our remaining pages in the !subpage + * case. If we left eb->folios[i] populated in the subpage case we'd + * double put our reference and be super sad. + */ for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); unlock_page(folio_page(eb->folios[i], 0)); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, From 8d993618350c86da11cb408ba529c13e83d09527 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2023 12:58:36 +1030 Subject: [PATCH 0889/1562] btrfs: migrate get_eb_page_index() and get_eb_offset_in_page() to folios These two functions are still using the old page based code, which is not going to handle larger folios at all. The migration itself is going to involve the following changes: - PAGE_SIZE -> folio_size() - PAGE_SHIFT -> folio_shift() - get_eb_page_index() -> get_eb_folio_index() - get_eb_offset_in_page() -> get_eb_offset_in_folio() And since we're going to support larger folios, although above straight conversion is good enough, this patch would add extra comments in the involved functions to explain why the same single line code can now cover 3 cases: - folio_size == PAGE_SIZE, sectorsize == PAGE_SIZE, nodesize >= PAGE_SIZE The common, non-subpage case with per-page folio. - folio_size > PAGE_SIZE, sectorsize == PAGE_SIZE, nodesize >= PAGE_SIZE The incoming larger folio, non-subpage case. - folio_size == PAGE_SIZE, sectorsize < PAGE_SIZE, nodesize < PAGE_SIZE The existing subpage case, we won't larger folio anyway. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 78 +++++++++++++++------------ fs/btrfs/ctree.c | 13 ++--- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 125 +++++++++++++++++++++---------------------- fs/btrfs/extent_io.h | 40 +++++++++----- 5 files changed, 141 insertions(+), 117 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 8f7cbb7154d4..1925a0919ca6 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = folio_size(token->eb->folios[0]); \ + const int unit_shift = folio_shift(token->eb->folios[0]); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - return get_unaligned_le##bits(token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + return get_unaligned_le##bits(token->kaddr + oil); \ } \ token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ - return get_unaligned_le##bits(token->kaddr + oip); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(token->kaddr + oil); \ \ - memcpy(lebytes, token->kaddr + oip, part); \ + memcpy(lebytes, token->kaddr + oil, part); \ token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + token->offset = (idx + 1) << unit_shift; \ memcpy(lebytes + part, token->kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -89,18 +91,20 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = folio_size(eb->folios[0]); \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ - return get_unaligned_le##bits(kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(kaddr + oil); \ \ - memcpy(lebytes, kaddr + oip, part); \ + memcpy(lebytes, kaddr + oil, part); \ kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(lebytes + part, kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ @@ -110,52 +114,58 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = folio_size(token->eb->folios[0]); \ + const int unit_shift = folio_shift(token->eb->folios[0]); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oip, lebytes, part); \ + memcpy(token->kaddr + oil, lebytes, part); \ token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + token->offset = (idx + 1) << unit_shift; \ memcpy(token->kaddr, lebytes + part, size - part); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = folio_size(eb->folios[0]); \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, kaddr + oil); \ return; \ } \ \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oip, lebytes, part); \ + memcpy(kaddr + oil, lebytes, part); \ kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(kaddr, lebytes + part, size - part); \ } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6c535cf3749..e65e012bac55 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -820,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - unsigned long oip; + const int unit_size = folio_size(eb->folios[0]); + unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; struct btrfs_disk_key unaligned; @@ -828,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, mid = (low + high) / 2; offset = p + mid * item_size; - oip = offset_in_page(offset); + oil = get_eb_offset_in_folio(eb, offset); - if (oip + key_size <= PAGE_SIZE) { - const unsigned long idx = get_eb_page_index(offset); + if (oil + key_size <= unit_size) { + const unsigned long idx = get_eb_folio_index(eb, offset); char *kaddr = folio_address(eb->folios[idx]); - oip = get_eb_offset_in_page(eb, offset); - tmp = (struct btrfs_disk_key *)(kaddr + oip); + oil = get_eb_offset_in_folio(eb, offset); + tmp = (struct btrfs_disk_key *)(kaddr + oil); } else { read_extent_buffer(eb, &unaligned, offset, key_size); tmp = &unaligned; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 93702782fe76..c23b655fb8f8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -395,7 +395,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, csum_tree_block(eb, result); header_csum = folio_address(eb->folios[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); + get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2c69e1f0fa10..7d8c1c14c9dd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4169,12 +4169,11 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char *dst = (char *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); if (check_eb_range(eb, start, len)) { /* @@ -4190,13 +4189,13 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = get_eb_offset_in_page(eb, start); + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = folio_page(eb->folios[i], 0); + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); memcpy(dst, kaddr + offset, cur); dst += cur; @@ -4210,12 +4209,11 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; WARN_ON(start > eb->len); @@ -4227,13 +4225,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, return ret; } - offset = get_eb_offset_in_page(eb, start); + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = folio_page(eb->folios[i], 0); + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; @@ -4251,12 +4249,12 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; if (check_eb_range(eb, start, len)) @@ -4265,14 +4263,11 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, if (eb->addr) return memcmp(ptrv, eb->addr + start, len); - offset = get_eb_offset_in_page(eb, start); + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = folio_page(eb->folios[i], 0); - - cur = min(len, (PAGE_SIZE - offset)); - - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); ret = memcmp(ptr, kaddr + offset, cur); if (ret) break; @@ -4291,10 +4286,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, * For regular sector size == PAGE_SIZE case, check if @page is uptodate. * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. */ -static void assert_eb_page_uptodate(const struct extent_buffer *eb, - struct page *page) +static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct folio *folio = eb->folios[i]; + + ASSERT(folio); /* * If we are using the commit root we could potentially clear a page @@ -4308,11 +4305,13 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, return; if (fs_info->nodesize < PAGE_SIZE) { + struct page *page = folio_page(folio, 0); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len))) btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!folio_test_uptodate(folio)); } } @@ -4320,12 +4319,12 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4340,15 +4339,14 @@ static void __write_extent_buffer(const struct extent_buffer *eb, return; } - offset = get_eb_offset_in_page(eb, start); + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = folio_page(eb->folios[i], 0); if (check_uptodate) - assert_eb_page_uptodate(eb, page); + assert_eb_folio_uptodate(eb, i); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (use_memmove) memmove(kaddr + offset, src, cur); else @@ -4370,6 +4368,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); unsigned long cur = start; if (eb->addr) { @@ -4378,13 +4377,12 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c, } while (cur < start + len) { - unsigned long index = get_eb_page_index(cur); - unsigned int offset = get_eb_offset_in_page(eb, cur); - unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); - struct page *page = folio_page(eb->folios[index], 0); + unsigned long index = get_eb_folio_index(eb, cur); + unsigned int offset = get_eb_offset_in_folio(eb, cur); + unsigned int cur_len = min(start + len - cur, unit_size - offset); - assert_eb_page_uptodate(eb, page); - memset_page(page, offset, c, cur_len); + assert_eb_folio_uptodate(eb, index); + memset(folio_address(eb->folios[index]) + offset, c, cur_len); cur += cur_len; } @@ -4401,14 +4399,15 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { + const int unit_size = folio_size(src->folios[0]); unsigned long cur = 0; ASSERT(dst->len == src->len); while (cur < src->len) { - unsigned long index = get_eb_page_index(cur); - unsigned long offset = get_eb_offset_in_page(src, cur); - unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + unsigned long index = get_eb_folio_index(src, cur); + unsigned long offset = get_eb_offset_in_folio(src, cur); + unsigned long cur_len = min(src->len, unit_size - offset); void *addr = folio_address(src->folios[index]) + offset; write_extent_buffer(dst, addr, cur, cur_len); @@ -4422,12 +4421,12 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); u64 dst_len = dst->len; size_t cur; size_t offset; - struct page *page; char *kaddr; - unsigned long i = get_eb_page_index(dst_offset); + unsigned long i = get_eb_folio_index(dst, dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -4435,15 +4434,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = get_eb_offset_in_page(dst, dst_offset); + offset = get_eb_offset_in_folio(dst, dst_offset); while (len > 0) { - page = folio_page(dst->folios[i], 0); - assert_eb_page_uptodate(dst, page); + assert_eb_folio_uptodate(dst, i); - cur = min(len, (unsigned long)(PAGE_SIZE - offset)); + cur = min(len, (unsigned long)(unit_size - offset)); - kaddr = page_address(page); + kaddr = folio_address(dst->folios[i]); read_extent_buffer(src, kaddr + offset, src_offset, cur); src_offset += cur; @@ -4502,18 +4500,18 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, eb_bitmap_offset(eb, start, nr, &i, &offset); page = folio_page(eb->folios[i], 0); - assert_eb_page_uptodate(eb, page); + assert_eb_folio_uptodate(eb, i); kaddr = page_address(page); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) { - unsigned long index = get_eb_page_index(bytenr); + unsigned long index = get_eb_folio_index(eb, bytenr); if (check_eb_range(eb, bytenr, 1)) return NULL; - return folio_address(eb->folios[index]) + get_eb_offset_in_page(eb, bytenr); + return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); } /* @@ -4598,6 +4596,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || @@ -4616,11 +4615,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, while (cur_off < len) { unsigned long cur_src = cur_off + src_offset; - unsigned long pg_index = get_eb_page_index(cur_src); - unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long folio_index = get_eb_folio_index(dst, cur_src); + unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); unsigned long cur_len = min(src_offset + len - cur_src, - PAGE_SIZE - pg_off); - void *src_addr = folio_address(dst->folios[pg_index]) + pg_off; + unit_size - folio_off); + void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; const bool use_memmove = areas_overlap(src_offset + cur_off, dst_offset + cur_off, cur_len); @@ -4654,20 +4653,20 @@ void memmove_extent_buffer(const struct extent_buffer *dst, while (len > 0) { unsigned long src_i; size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; + size_t dst_off_in_folio; + size_t src_off_in_folio; void *src_addr; bool use_memmove; - src_i = get_eb_page_index(src_end); + src_i = get_eb_folio_index(dst, src_end); - dst_off_in_page = get_eb_offset_in_page(dst, dst_end); - src_off_in_page = get_eb_offset_in_page(dst, src_end); + dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); + src_off_in_folio = get_eb_offset_in_folio(dst, src_end); - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); + cur = min_t(unsigned long, len, src_off_in_folio + 1); + cur = min(cur, dst_off_in_folio + 1); - src_addr = folio_address(dst->folios[src_i]) + src_off_in_page - + src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - cur + 1; use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, cur); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a5fd5cb20a3c..46050500529b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -121,29 +121,43 @@ struct btrfs_eb_write_context { * * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) +static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb + * The eb->start is aligned to folio size, thus adding it + * won't cause any difference. + * 1.2) Several page sized folios + * The eb->start is aligned to folio (page) size, thus + * adding it won't cause any difference. * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * In this case there would only be one page sized folio, and there + * may be several different extent buffers in the page/folio. + * We need to add eb->start to properly access the offset inside + * that eb. */ - return offset_in_page(offset + eb->start); + return offset_in_folio(eb->folios[0], offset + eb->start); } -static inline unsigned long get_eb_page_index(unsigned long offset) +static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb. + * the folio_shift would be large enough to always make us + * return 0 as index. + * 1.2) Several page sized folios + * The folio_shift() would be PAGE_SHIFT, giving us the correct + * index. * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * The folio would only be page sized, and always give us 0 as index. */ - return offset >> PAGE_SHIFT; + return offset >> folio_shift(eb->folios[0]); } /* From 55151ea9ec1b40170dad5766c2d7f36105be42cd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2023 12:58:37 +1030 Subject: [PATCH 0890/1562] btrfs: migrate subpage code to folio interfaces Although subpage itself is conflicting with higher folio, since subpage (sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE) means we will never need higher order folio, there is a hidden pitfall: - btrfs_page_*() helpers Those helpers are an abstraction to handle both subpage and non-subpage cases, which means we're going to pass pages pointers to those helpers. And since those helpers are shared between data and metadata paths, it's unavoidable to let them to handle folios, including higher order folios). Meanwhile for true subpage case, we should only have a single page backed folios anyway, thus add a new ASSERT() for btrfs_subpage_assert() to ensure that. Also since those helpers are shared between both data and metadata, add some extra ASSERT()s for data path to make sure we only get single page backed folio for now. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 7 +- fs/btrfs/defrag.c | 2 +- fs/btrfs/disk-io.c | 4 +- fs/btrfs/extent_io.c | 105 +++++++------ fs/btrfs/file.c | 13 +- fs/btrfs/free-space-cache.c | 4 +- fs/btrfs/inode.c | 34 ++-- fs/btrfs/ordered-data.c | 5 +- fs/btrfs/reflink.c | 6 +- fs/btrfs/relocation.c | 5 +- fs/btrfs/subpage.c | 304 +++++++++++++++++------------------- fs/btrfs/subpage.h | 74 +++++---- 12 files changed, 280 insertions(+), 283 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5678e73f1509..5422a00214b0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -306,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - btrfs_page_clamp_clear_writeback(fs_info, &folio->page, - cb->start, cb->len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, + cb->start, cb->len); } folio_batch_release(&fbatch); } @@ -541,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page, cur, add_size); + btrfs_subpage_start_reader(fs_info, page_folio(page), + cur, add_size); put_page(page); cur += add_size; } diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index a9a068af8d6e..c276b136ab63 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1189,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c23b655fb8f8..2c83da36a9c7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -284,8 +284,8 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_page_test_uptodate(fs_info, folio_page(eb->folios[0], 0), - eb->start, eb->len))) + if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], + eb->start, eb->len))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7d8c1c14c9dd..4c8e14ce21a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info, struct page *page, struct page *locked_page, unsigned long page_ops, u64 start, u64 end) { + struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); len = end + 1 - start; if (page_ops & PAGE_SET_ORDERED) - btrfs_page_clamp_set_ordered(fs_info, page, start, len); + btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); if (page_ops & PAGE_START_WRITEBACK) { - btrfs_page_clamp_clear_dirty(fs_info, page, start, len); - btrfs_page_clamp_set_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); } if (page_ops & PAGE_END_WRITEBACK) - btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (page != locked_page && (page_ops & PAGE_UNLOCK)) - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } static void __process_pages_contig(struct address_space *mapping, @@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode, goto out; for (i = 0; i < found_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; if (page == locked_page) continue; - if (btrfs_page_start_writer_lock(fs_info, page, start, - len)) + if (btrfs_folio_start_writer_lock(fs_info, folio, start, + len)) goto out; if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, - len); + btrfs_folio_end_writer_lock(fs_info, folio, start, + len); goto out; } @@ -432,19 +434,20 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); if (uptodate && btrfs_verify_page(page, start)) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); if (!btrfs_is_subpage(fs_info, page->mapping)) unlock_page(page); else - btrfs_subpage_end_reader(fs_info, page, start, len); + btrfs_subpage_end_reader(fs_info, folio, start, len); } /* @@ -485,7 +488,7 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); if (error) mapping_set_error(page->mapping, error); - btrfs_page_clear_writeback(fs_info, page, start, len); + btrfs_folio_clear_writeback(fs_info, page_folio(page), start, len); } bio_put(bio); @@ -564,12 +567,12 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { struct folio *folio = page_folio(page); - ASSERT(PageLocked(page)); - if (!btrfs_is_subpage(fs_info, page->mapping)) + ASSERT(folio_test_locked(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); + btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); } /* @@ -921,8 +924,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, folio_page(folio, 0), - BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } @@ -939,7 +941,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); if (btrfs_is_subpage(fs_info, page->mapping)) - return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; @@ -957,7 +959,7 @@ void clear_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); if (btrfs_is_subpage(fs_info, page->mapping)) - return btrfs_detach_subpage(fs_info, page); + return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } @@ -1352,7 +1354,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, len); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); break; } @@ -1404,7 +1406,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * So clear subpage dirty bit here so next time we won't submit * page for range already written to disk. */ - btrfs_page_clear_dirty(fs_info, page, cur, iosize); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page)); @@ -1412,7 +1414,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; } - btrfs_page_assert_not_dirty(fs_info, page); + btrfs_folio_assert_not_dirty(fs_info, page_folio(page)); *nr_ret = nr; return 0; @@ -1651,7 +1653,7 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - btrfs_page_clear_writeback(fs_info, page, start, len); + btrfs_folio_clear_writeback(fs_info, page_folio(page), start, len); bio_offset += len; } @@ -1707,18 +1709,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; if (fs_info->nodesize < PAGE_SIZE) { - struct page *p = folio_page(eb->folios[0], 0); + struct folio *folio = eb->folios[0]; + bool ret; - lock_page(p); - btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + folio_lock(folio); + btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len)) { - clear_page_dirty_for_io(p); + folio_clear_dirty_for_io(folio); wbc->nr_to_write--; } - __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); - wbc_account_cgroup_owner(wbc, p, eb->len); - unlock_page(p); + ret = bio_add_folio(&bbio->bio, folio, eb->len, + eb->start - folio_pos(folio)); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -2235,7 +2240,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, cur, cur_len, !ret); mapping_set_error(page->mapping, ret); } - btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -3156,7 +3161,7 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio_page(folio, 0)); + btrfs_detach_subpage(fs_info, folio); return; } @@ -3167,7 +3172,7 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f * page range and no unfinished IO. */ if (!folio_range_has_eb(fs_info, folio)) - btrfs_detach_subpage(fs_info, folio_page(folio, 0)); + btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->private_lock); } @@ -3683,8 +3688,7 @@ reallocate: btrfs_folio_inc_eb_refs(fs_info, folio); spin_unlock(&mapping->private_lock); - WARN_ON(btrfs_page_test_dirty(fs_info, folio_page(folio, 0), - eb->start, eb->len)); + WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* * Check if the current page is physically contiguous with previous eb @@ -3695,8 +3699,7 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_page_test_uptodate(fs_info, folio_page(folio, 0), - eb->start, eb->len)) + if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) uptodate = 0; /* @@ -3890,8 +3893,7 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) /* btree_clear_folio_dirty() needs page locked. */ folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio_page(folio, 0), - eb->start, eb->len); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); if (last) btree_clear_folio_dirty(folio); folio_unlock(folio); @@ -3975,8 +3977,8 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) if (subpage) lock_page(folio_page(eb->folios[0], 0)); for (int i = 0; i < num_folios; i++) - btrfs_page_set_dirty(eb->fs_info, folio_page(eb->folios[i], 0), - eb->start, eb->len); + btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], + eb->start, eb->len); if (subpage) unlock_page(folio_page(eb->folios[0], 0)); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -4008,7 +4010,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) if (fs_info->nodesize >= PAGE_SIZE) folio_clear_uptodate(folio); else - btrfs_subpage_clear_uptodate(fs_info, folio_page(folio, 0), + btrfs_subpage_clear_uptodate(fs_info, folio, eb->start, eb->len); } } @@ -4029,7 +4031,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) if (fs_info->nodesize >= PAGE_SIZE) folio_mark_uptodate(folio); else - btrfs_subpage_set_uptodate(fs_info, folio_page(folio, 0), + btrfs_subpage_set_uptodate(fs_info, folio, eb->start, eb->len); } } @@ -4062,9 +4064,9 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) u32 len = bvec->bv_len; if (uptodate) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, page_folio(page), start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, page_folio(page), start, len); bio_offset += len; } @@ -4305,11 +4307,12 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) return; if (fs_info->nodesize < PAGE_SIZE) { - struct page *page = folio_page(folio, 0); + struct folio *folio = eb->folios[0]; - if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + ASSERT(i == 0); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, eb->start, eb->len))) - btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); + btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); } else { WARN_ON(!folio_test_uptodate(folio)); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98ef859f8938..38dfcac47609 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, - block_len); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), + block_start, block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); + btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), + start_pos, num_bytes); } /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6f93c9a2c3e3..d372c7ce0e6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - btrfs_page_clear_checked(io_ctl->fs_info, - io_ctl->pages[i], + btrfs_folio_clear_checked(io_ctl->fs_info, + page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ac01f49161ff..b3e39610cc95 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -456,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, - offset, bytes); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, + page_folio(page), offset, bytes); put_page(page); } @@ -2802,7 +2802,7 @@ out_page: PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2857,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock. */ ihold(inode); - btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); + btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; @@ -4776,9 +4776,10 @@ again: memzero_page(page, (block_start - page_offset(page)) + offset, len); } - btrfs_page_clear_checked(fs_info, page, block_start, - block_end + 1 - block_start); - btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); + btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); + btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -8009,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { + if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8018,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, */ goto next; } - btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); + btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8088,7 +8089,7 @@ next: * did something wrong. */ ASSERT(!folio_test_ordered(folio)); - btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); + btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); @@ -8112,6 +8113,7 @@ next: vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -8128,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) u64 page_end; u64 end; + ASSERT(folio_order(folio) == 0); + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); @@ -8231,9 +8235,9 @@ again: if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); - btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); - btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -9819,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - btrfs_page_set_writeback(fs_info, page, start, len); + /* This is for data, which doesn't yet support larger folio. */ + ASSERT(folio_order(page_folio(page)) == 0); + btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); put_page(page); index++; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a82e1417c4d2..59850dc17b22 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, page_folio(page), + file_offset, len)) return false; - btrfs_page_clear_ordered(fs_info, page, file_offset, len); + btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); } /* Now we're fine to update the accounting. */ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f88b0c2ac3fe..ae90894dc7dc 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (datal < block_size) memzero_page(page, datal, block_size - datal); - btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - btrfs_page_clear_checked(fs_info, page, file_offset, block_size); - btrfs_page_set_dirty(fs_info, page, file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); out_unlock: if (page) { unlock_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 78c2770eb52f..abe594f77f99 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * will re-read the whole page anyway. */ if (page) { - btrfs_subpage_clear_uptodate(fs_info, page, i_size, + btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, round_up(i_size, PAGE_SIZE) - i_size); unlock_page(page); put_page(page); @@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len); goto release_page; } - btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, page_folio(page), + clamped_start, clamped_len); /* * Set the boundary if it's inside the page. diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index f11ebaf87def..d9a30b93d543 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -115,20 +115,19 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector } int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type) + struct folio *folio, enum btrfs_subpage_type type) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ - if (page->mapping) - ASSERT(PageLocked(page)); + if (folio->mapping) + ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, page->mapping) || folio_test_private(folio)) + if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -139,14 +138,12 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, page->mapping) || !folio_test_private(folio)) + if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) return; subpage = folio_detach_private(folio); @@ -222,9 +219,10 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); + /* For subpage support, the folio must be single page. */ + ASSERT(folio_order(folio) == 0); /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -234,34 +232,32 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. */ - if (page->mapping) - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + if (folio->mapping) + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); } void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; bool is_data; bool last; - btrfs_subpage_assert(fs_info, page, start, len); - is_data = is_data_inode(page->mapping->host); + btrfs_subpage_assert(fs_info, folio, start, len); + is_data = is_data_inode(folio->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); last = atomic_sub_and_test(nbits, &subpage->readers); @@ -273,36 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * As we want the atomic_sub_and_test() to be always executed. */ if (is_data && last) - unlock_page(page); + folio_unlock(folio); } -static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; u32 orig_len = *len; - *start = max_t(u64, page_offset(page), orig_start); + *start = max_t(u64, folio_pos(folio), orig_start); /* * For certain call sites like btrfs_drop_pages(), we may have pages * beyond the target range. In that case, just set @len to 0, subpage * helpers can handle @len == 0 without any problem. */ - if (page_offset(page) >= orig_start + orig_len) + if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, page_offset(page) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); int ret; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); ASSERT(atomic_read(&subpage->readers) == 0); ret = atomic_add_return(nbits, &subpage->writers); @@ -310,13 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, } bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); /* * We have call sites passing @lock_page into @@ -333,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, } /* - * Lock a page for delalloc page writeback. + * Lock a folio for delalloc page writeback. * * Return -EAGAIN if the page is not properly initialized. * Return 0 with the page locked, and writer counter updated. @@ -342,40 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * it's really the correct page, as the caller is using * filemap_get_folios_contig(), which can race with page invalidating. */ -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); - - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page->mapping)) { - lock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_lock(folio); return 0; } - lock_page(page); + folio_lock(folio); if (!folio_test_private(folio) || !folio_get_private(folio)) { - unlock_page(page); + folio_unlock(folio); return -EAGAIN; } - btrfs_subpage_clamp_range(page, &start, &len); - btrfs_subpage_start_writer(fs_info, page, start, len); + btrfs_subpage_clamp_range(folio, &start, &len); + btrfs_subpage_start_writer(fs_info, folio, start, len); return 0; } -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page->mapping)) - return unlock_page(page); - btrfs_subpage_clamp_range(page, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) - unlock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int start_bit; \ \ - btrfs_subpage_assert(fs_info, page, start, len); \ + btrfs_subpage_assert(fs_info, folio, start, len); \ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ start_bit += fs_info->subpage_info->name##_offset; \ start_bit; \ @@ -392,49 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, fs_info->subpage_info->bitmap_nr_bits) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) - SetPageUptodate(page); + folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageUptodate(page); + folio_clear_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); - set_page_dirty(page); + folio_mark_dirty(folio); } /* @@ -448,11 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, * extra handling for tree blocks. */ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; @@ -466,107 +456,101 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, } void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { bool last; - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len); if (last) - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); } void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - set_page_writeback(page); + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { - ASSERT(PageWriteback(page)); - end_page_writeback(page); + ASSERT(folio_test_writeback(folio)); + folio_end_writeback(folio); } spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageOrdered(page); + folio_set_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) - ClearPageOrdered(page); + folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) - SetPageChecked(page); + folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageChecked(page); + folio_clear_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -576,11 +560,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, */ #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ + struct folio *folio, u64 start, u32 len) \ { \ - struct folio *folio = page_folio(page); \ struct btrfs_subpage *subpage = folio_get_private(folio); \ - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ @@ -602,92 +585,91 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall * back to regular sectorsize branch. */ -#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ - test_page_func) \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \ + folio_clear_func, folio_test_func) \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) { \ - set_page_func(page); \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) { \ - clear_page_func(page); \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) \ - return test_page_func(page); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) { \ - set_page_func(page); \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) { \ - clear_page_func(page); \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, page->mapping)) \ - return test_page_func(page); \ - btrfs_subpage_clamp_range(page, &start, &len); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } -IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, - PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, - PageDirty); -IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, - PageWriteback); -IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, - PageOrdered); -IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, + folio_test_uptodate); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io, + folio_test_dirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback, + folio_test_writeback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, + folio_test_ordered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, + folio_test_checked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - ASSERT(!PageDirty(page)); - if (!btrfs_is_subpage(fs_info, page->mapping)) + ASSERT(!folio_test_dirty(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -709,16 +691,17 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * extent_write_locked_range(). * In this case, we have to call subpage helper to handle the case. */ -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len) +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - ASSERT(PageLocked(page)); + ASSERT(folio_test_locked(folio)); /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, page->mapping)) - return unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } ASSERT(folio_test_private(folio) && folio_get_private(folio)); subpage = folio_get_private(folio); @@ -730,12 +713,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { /* No writers, locked by plain lock_page() */ - return unlock_page(page); + folio_unlock(folio); + return; + } /* Have writers, use proper subpage helper to end it */ - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ @@ -743,10 +728,9 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, subpage_info->name##_offset, subpage_info->bitmap_nr_bits) void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; unsigned long uptodate_bitmap; unsigned long error_bitmap; @@ -768,10 +752,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); - dump_page(page, "btrfs subpage dump"); + dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", - start, len, page_offset(page), + start, len, folio_pos(folio), subpage_info->bitmap_nr_bits, &uptodate_bitmap, subpage_info->bitmap_nr_bits, &error_bitmap, subpage_info->bitmap_nr_bits, &dirty_bitmap, diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index facd5c808e6f..793c2b314a58 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -77,9 +77,8 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page); + struct folio *folio, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, @@ -90,52 +89,52 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); /* * Template for subpage related operations. * - * btrfs_subpage_*() are for call sites where the page has subpage attached and - * the range is ensured to be inside the page. + * btrfs_subpage_*() are for call sites where the folio has subpage attached and + * the range is ensured to be inside the folio's single page. * - * btrfs_page_*() are for call sites where the page can either be subpage - * specific or regular page. The function will handle both cases. - * But the range still needs to be inside the page. + * btrfs_folio_*() are for call sites where the page can either be subpage + * specific or regular folios. The function will handle both cases. + * But the range still needs to be inside one single page. * - * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -144,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len); +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); #endif From a700ca5ec4ee9c2feb6e56469ce808f9769dc9f3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2023 12:58:38 +1030 Subject: [PATCH 0891/1562] btrfs: migrate various end io functions to folios If we still go the old page based iterator functions, like bio_for_each_segment_all(), we can hit middle pages of a folio (compound page). In that case if we set any page flag on those middle pages, we can easily trigger VM_BUG_ON(), as for compound page flags, they should follow their flag policies (normally only set on leading or tail pages). To avoid such problem in the future full folio migration, here we do: - Change from bio_for_each_segment_all() to bio_for_each_folio_all() This completely removes the ability to access the middle page. - Add extra ASSERT()s for data read/write paths To ensure we only get single paged folio for data now. - Rename those end io functions to follow a certain schema * end_bbio_compressed_read() * end_bbio_compressed_write() These two endio functions don't set any page flags, as they use pages not mapped to any address space. They can be very good candidates for higher order folio testing. And they are shared between compression and encoded IO. * end_bbio_data_read() * end_bbio_data_write() * end_bbio_meta_read() * end_bbio_meta_write() The old function names are not unified: - end_bio_extent_writepage() - end_bio_extent_readpage() - extent_buffer_write_end_io() - extent_buffer_read_end_io() They share no schema on where the "end_*io" string should be, nor can be confusing just using "extent_buffer" and "extent" to distinguish data and metadata paths. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 8 +-- fs/btrfs/extent_io.c | 150 +++++++++++++++++++++-------------------- 2 files changed, 81 insertions(+), 77 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5422a00214b0..193168214eeb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -264,7 +264,7 @@ void btrfs_free_compr_page(struct page *page) put_page(page); } -static void end_compressed_bio_read(struct btrfs_bio *bbio) +static void end_bbio_comprssed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; @@ -337,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct btrfs_bio *bbio) +static void end_bbio_comprssed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; @@ -384,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb = alloc_compressed_bio(inode, ordered->file_offset, REQ_OP_WRITE | write_flags, - end_compressed_bio_write); + end_bbio_comprssed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; cb->compressed_pages = compressed_pages; @@ -589,7 +589,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) compressed_len = em->block_len; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, - end_compressed_bio_read); + end_bbio_comprssed_read); cb->start = em->orig_start; em_len = em->len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c8e14ce21a8..423861e5e8f5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -451,44 +451,48 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) } /* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO + * After a write IO is done, we need to: + * + * - clear the uptodate bits on error + * - clear the writeback bits in the extent tree for the range + * - filio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct btrfs_bio *bbio) +static void end_bbio_data_write(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - u64 start = page_offset(page) + bvec->bv_offset; - u32 len = bvec->bv_len; + u64 start = folio_pos(folio) + fi.offset; + u32 len = fi.length; + + /* Only order 0 (single page) folios are allowed for data. */ + ASSERT(folio_order(folio) == 0); /* Our read/write should always be sector aligned. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + "partial page write in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page write with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page write with offset %zu and length %zu", + fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, + folio_page(folio, 0), start, len, !error); if (error) - mapping_set_error(page->mapping, error); - btrfs_folio_clear_writeback(fs_info, page_folio(page), start, len); + mapping_set_error(folio->mapping, error); + btrfs_folio_clear_writeback(fs_info, folio, start, len); } bio_put(bio); @@ -576,89 +580,91 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it + * After a data read IO is done, we need to: + * + * - clear the uptodate bits on error + * - set the uptodate bits if things worked + * - set the folio up to date if all extents in the tree are uptodate + * - clear the lock bit in the extent tree + * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct btrfs_bio *bbio) +static void end_bbio_data_read(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; - struct bio_vec *bvec; struct processed_extent processed = { 0 }; + struct folio_iter fi; /* * The offset to the beginning of a bio, since one bio can never be * larger than UINT_MAX, u32 here is enough. */ u32 bio_offset = 0; - struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; + /* For now only order 0 folios are supported for data. */ + ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, - "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - bio->bi_iter.bi_sector, bio->bi_status, + "%s: bi_sector=%llu, err=%d, mirror=%u", + __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); /* * We always issue full-sector reads, but if some block in a - * page fails to read, blk_update_request() will advance + * folio fails to read, blk_update_request() will advance * bv_offset and adjust bv_len to compensate. Print a warning * for unaligned offsets, and an error if they don't add up to * a full sector. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, - sectorsize)) + "partial page read in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page read with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page read with offset %zu and length %zu", + fi.offset, fi.length); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - len = bvec->bv_len; + start = folio_pos(folio) + fi.offset; + end = start + fi.length - 1; + len = fi.length; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; + pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles * i_size. * - * Here we should only zero the range inside the bvec, + * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive. */ - if (page->index == end_index && i_size <= end) { - u32 zero_start = max(offset_in_page(i_size), - offset_in_page(start)); + if (folio_index(folio) == end_index && i_size <= end) { + u32 zero_start = max(offset_in_folio(folio, i_size), + offset_in_folio(folio, start)); + u32 zero_len = offset_in_folio(folio, end) + 1 - + zero_start; - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); + folio_zero_range(folio, zero_start, zero_len); } } /* Update page status and unlock. */ - end_page_read(page, uptodate, start, len); + end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); @@ -1030,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } - bio_ctrl->end_io_func = end_bio_extent_readpage; + bio_ctrl->end_io_func = end_bbio_data_read; begin_page_read(fs_info, page); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; @@ -1334,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - bio_ctrl->end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bbio_data_write; while (cur <= end) { u32 len = end - cur + 1; u64 disk_bytenr; @@ -1636,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -static void extent_buffer_write_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; if (!uptodate) set_btree_ioerr(eb); - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + struct folio *folio = fi.folio; + u32 len = fi.length; - btrfs_folio_clear_writeback(fs_info, page_folio(page), start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); bio_offset += len; } @@ -1702,7 +1707,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, extent_buffer_write_end_io, eb); + eb->fs_info, end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); @@ -4036,13 +4041,12 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; eb->read_mirror = bbio->mirror_num; @@ -4058,15 +4062,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { + struct folio *folio = fi.folio; u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + u32 len = fi.length; if (uptodate) - btrfs_folio_set_uptodate(fs_info, page_folio(page), start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_folio_clear_uptodate(fs_info, page_folio(page), start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); bio_offset += len; } @@ -4107,7 +4111,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - extent_buffer_read_end_io, eb); + end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; From f4521b01c5246b921debc6db6f112f89f94cc61b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2023 15:54:09 +1030 Subject: [PATCH 0892/1562] btrfs: migrate eb_bitmap_offset() to folio interfaces [BUG] Test case btrfs/002 would fail if larger folios are enabled for metadata: assertion failed: folio, in fs/btrfs/extent_io.c:4358 ------------[ cut here ]------------ kernel BUG at fs/btrfs/extent_io.c:4358! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 30916 Comm: fsstress Tainted: G OE 6.7.0-rc3-custom+ #128 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022 RIP: 0010:assert_eb_folio_uptodate+0x98/0xe0 [btrfs] Call Trace: extent_buffer_test_bit+0x3c/0x70 [btrfs] free_space_test_bit+0xcd/0x140 [btrfs] modify_free_space_bitmap+0x27a/0x430 [btrfs] add_to_free_space_tree+0x8d/0x160 [btrfs] __btrfs_free_extent.isra.0+0xef1/0x13c0 [btrfs] __btrfs_run_delayed_refs+0x786/0x13c0 [btrfs] btrfs_run_delayed_refs+0x33/0x120 [btrfs] btrfs_commit_transaction+0xa2/0x1350 [btrfs] iterate_supers+0x77/0xe0 ksys_sync+0x60/0xa0 __do_sys_sync+0xa/0x20 do_syscall_64+0x3f/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 [CAUSE] The function extent_buffer_test_bit() is not folio compatible. It still assumes the old fixed page size, when an extent buffer with large folio passed in, only eb->folios[0] is populated. Then if the target bit range falls in the 2nd page of the folio, then we would check eb->folios[1], and trigger the ASSERT(). [FIX] Just migrate eb_bitmap_offset() to folio interfaces, using the folio_size() to replace PAGE_SIZE. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 423861e5e8f5..a0ffd41c5cc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4459,22 +4459,22 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * Calculate the page and offset of the byte containing the given bit number. + * Calculate the folio and offset of the byte containing the given bit number. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains + * @folio_index: return index of the folio in the extent buffer that contains * the given bit number - * @page_offset: return offset into the page given by page_index + * @folio_offset: return offset into the folio given by folio_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, - unsigned long *page_index, - size_t *page_offset) + unsigned long *folio_index, + size_t *folio_offset) { size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -4484,10 +4484,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_page(eb->start) + byte_offset; + offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; - *page_index = offset >> PAGE_SHIFT; - *page_offset = offset_in_page(offset); + *folio_index = offset >> folio_shift(eb->folios[0]); + *folio_offset = offset_in_folio(eb->folios[0], offset); } /* @@ -4500,15 +4500,13 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { - u8 *kaddr; - struct page *page; unsigned long i; size_t offset; + u8 *kaddr; eb_bitmap_offset(eb, start, nr, &i, &offset); - page = folio_page(eb->folios[i], 0); assert_eb_folio_uptodate(eb, i); - kaddr = page_address(page); + kaddr = folio_address(eb->folios[i]); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } From 96c36eaa7730081e5c946819e4dfad0f432c70f7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2023 15:54:10 +1030 Subject: [PATCH 0893/1562] btrfs: migrate btrfs_repair_io_failure() to folio interfaces [BUG] Test case btrfs/124 failed if larger metadata folio is enabled, the dying message looks like this: BTRFS error (device dm-2): bad tree block start, mirror 2 want 31686656 have 0 BTRFS info (device dm-2): read error corrected: ino 0 off 31686656 (dev /dev/mapper/test-scratch2 sector 20928) BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page CPU: 6 PID: 350881 Comm: btrfs Tainted: G OE 6.7.0-rc3-custom+ #128 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022 RIP: 0010:btrfs_read_extent_buffer+0x106/0x180 [btrfs] PKRU: 55555554 Call Trace: read_tree_block+0x33/0xb0 [btrfs] read_block_for_search+0x23e/0x340 [btrfs] btrfs_search_slot+0x2f9/0xe60 [btrfs] btrfs_lookup_csum+0x75/0x160 [btrfs] btrfs_lookup_bio_sums+0x21a/0x560 [btrfs] btrfs_submit_chunk+0x152/0x680 [btrfs] btrfs_submit_bio+0x1c/0x50 [btrfs] submit_one_bio+0x40/0x80 [btrfs] submit_extent_page+0x158/0x390 [btrfs] btrfs_do_readpage+0x330/0x740 [btrfs] extent_readahead+0x38d/0x6c0 [btrfs] read_pages+0x94/0x2c0 page_cache_ra_unbounded+0x12d/0x190 relocate_file_extent_cluster+0x7c1/0x9d0 [btrfs] relocate_block_group+0x2d3/0x560 [btrfs] btrfs_relocate_block_group+0x2c7/0x4b0 [btrfs] btrfs_relocate_chunk+0x4c/0x1a0 [btrfs] btrfs_balance+0x925/0x13c0 [btrfs] btrfs_ioctl+0x19f1/0x25d0 [btrfs] __x64_sys_ioctl+0x90/0xd0 do_syscall_64+0x3f/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 [CAUSE] The dying line is at btrfs_repair_io_failure() call inside btrfs_repair_eb_io_failure(). The function is still relying on the extent buffer using page sized folios. When the extent buffer is using larger folio, we go into the 2nd slot of folios[], and triggered the NULL pointer dereference. [FIX] Migrate btrfs_repair_io_failure() to folio interfaces. So that when we hit a larger folio, we just submit the whole folio in one go. This also affects data repair path through btrfs_end_repair_bio(), thankfully data is still fully page based, we can just add an ASSERT(), and use page_folio() to convert the page to folio. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 15 +++++++++++---- fs/btrfs/bio.h | 4 ++-- fs/btrfs/disk-io.c | 13 +++++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 67a885d3f9a8..928f512cdb4a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; + /* + * We can only trigger this for data bio, which doesn't support larger + * folios yet. + */ + ASSERT(folio_order(page_folio(bv->bv_page)) == 0); + if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bv->bv_page, bv->bv_offset, mirror); + page_folio(bv->bv_page), bv->bv_offset, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, length, pg_offset); + ret = bio_add_folio(&bio, folio, length, folio_offset); + ASSERT(ret); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index ca79decee060..bbaed317161a 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c83da36a9c7..c6907d533fe8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -183,21 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages = num_extent_pages(eb); + int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (i = 0; i < num_pages; i++) { - u64 start = max_t(u64, eb->start, folio_pos(eb->folios[i])); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(eb->folios[i]) + PAGE_SIZE); + folio_pos(folio) + folio_size(folio)); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio_page(eb->folios[i], 0), - offset_in_page(start), mirror_num); + start, folio, offset_in_folio(folio, start), + mirror_num); if (ret) break; } From 02d05b6416b1f09a877c71c2761b45d1548d8856 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:42:56 -0800 Subject: [PATCH 0894/1562] btrfs: factor out helper for single device IO check The check in btrfs_map_block() deciding if a particular I/O is targeting a single device is getting more and more convoluted. Factor out the check conditions into a helper function, with no functional change otherwise. Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cc6b5d5eb61..f23223f0ea5b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6330,6 +6330,27 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return 0; } +static bool is_single_device_io(struct btrfs_fs_info *fs_info, + const struct btrfs_io_stripe *smap, + const struct btrfs_chunk_map *map, + int num_alloc_stripes, + enum btrfs_map_op op, int mirror_num) +{ + if (!smap) + return false; + + if (num_alloc_stripes != 1) + return false; + + if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + return false; + + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + return false; + + return true; +} + /* * Map one logical range to one or more physical ranges. * @@ -6532,10 +6553,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (smap && num_alloc_stripes == 1 && - !(btrfs_need_stripe_tree_update(fs_info, map->type) && - op != BTRFS_MAP_READ) && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, + mirror_num)) { ret = set_io_stripe(fs_info, op, logical, length, smap, map, stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) From fd747f2d5f9bdf16b65326be9742338c770ba35f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:42:57 -0800 Subject: [PATCH 0895/1562] btrfs: re-introduce struct btrfs_io_geometry Re-introduce struct btrfs_io_geometry, holding the necessary bits and pieces needed in btrfs_map_block() to decide the I/O geometry of a specific block mapping. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 158 +++++++++++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 70 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f23223f0ea5b..e3f75ede9174 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -41,6 +41,17 @@ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) +struct btrfs_io_geometry { + u32 stripe_index; + u32 stripe_nr; + int mirror_num; + int num_stripes; + u64 stripe_offset; + u64 raid56_full_stripe_start; + int max_errors; + enum btrfs_map_op op; +}; + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -6392,28 +6403,27 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct btrfs_chunk_map *map; + struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - u64 stripe_offset; - u32 stripe_nr; - u32 stripe_index; int data_stripes; int i; int ret = 0; - int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); - int num_stripes; int num_copies; - int max_errors = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; - u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); + io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); + io_geom.num_stripes = 1; + io_geom.stripe_index = 0; + io_geom.op = op; + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (mirror_num > num_copies) + if (io_geom.mirror_num > num_copies) return -EINVAL; map = btrfs_get_chunk_map(fs_info, logical, *length); @@ -6423,8 +6433,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, data_stripes = nr_data_stripes(map); map_offset = logical - map->start; - max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, - &stripe_offset, &raid56_full_stripe_start); + io_geom.raid56_full_stripe_start = (u64)-1; + max_len = btrfs_max_io_len(map, io_geom.op, map_offset, &io_geom.stripe_nr, + &io_geom.stripe_offset, + &io_geom.raid56_full_stripe_start); *length = min_t(u64, map->chunk_len - map_offset, max_len); down_read(&dev_replace->rwsem); @@ -6436,53 +6448,51 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - num_stripes = 1; - stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; + io_geom.stripe_index = io_geom.stripe_nr % map->num_stripes; + io_geom.stripe_nr /= map->num_stripes; if (op == BTRFS_MAP_READ) - mirror_num = 1; + io_geom.mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; + io_geom.num_stripes = map->num_stripes; + } else if (io_geom.mirror_num) { + io_geom.stripe_index = io_geom.mirror_num - 1; } else { - stripe_index = find_live_mirror(fs_info, map, 0, + io_geom.stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); - mirror_num = stripe_index + 1; + io_geom.mirror_num = io_geom.stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; + io_geom.num_stripes = map->num_stripes; + } else if (io_geom.mirror_num) { + io_geom.stripe_index = io_geom.mirror_num - 1; } else { - mirror_num = 1; + io_geom.mirror_num = 1; } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes; - stripe_index = (stripe_nr % factor) * map->sub_stripes; - stripe_nr /= factor; + io_geom.stripe_index = (io_geom.stripe_nr % factor) * map->sub_stripes; + io_geom.stripe_nr /= factor; if (op != BTRFS_MAP_READ) - num_stripes = map->sub_stripes; - else if (mirror_num) - stripe_index += mirror_num - 1; + io_geom.num_stripes = map->sub_stripes; + else if (io_geom.mirror_num) + io_geom.stripe_index += io_geom.mirror_num - 1; else { - int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(fs_info, map, - stripe_index, + int old_stripe_index = io_geom.stripe_index; + io_geom.stripe_index = find_live_mirror(fs_info, map, + io_geom.stripe_index, dev_replace_is_ongoing); - mirror_num = stripe_index - old_stripe_index + 1; + io_geom.mirror_num = io_geom.stripe_index - old_stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (op != BTRFS_MAP_READ || mirror_num > 1) { + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) { /* * Needs full stripe mapping. * @@ -6494,29 +6504,33 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * but that can be expensive. Here we just divide * @stripe_nr with @data_stripes. */ - stripe_nr /= data_stripes; + io_geom.stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ - num_stripes = map->num_stripes; - max_errors = btrfs_chunk_max_errors(map); + io_geom.num_stripes = map->num_stripes; + io_geom.max_errors = btrfs_chunk_max_errors(map); /* Return the length to the full stripe end */ *length = min(logical + *length, - raid56_full_stripe_start + map->start + - btrfs_stripe_nr_to_offset(data_stripes)) - + io_geom.raid56_full_stripe_start + + map->start + + btrfs_stripe_nr_to_offset( + data_stripes)) - logical; - stripe_index = 0; - stripe_offset = 0; + io_geom.stripe_index = 0; + io_geom.stripe_offset = 0; } else { - ASSERT(mirror_num <= 1); + ASSERT(io_geom.mirror_num <= 1); /* Just grab the data stripe directly. */ - stripe_index = stripe_nr % data_stripes; - stripe_nr /= data_stripes; + io_geom.stripe_index = io_geom.stripe_nr % data_stripes; + io_geom.stripe_nr /= data_stripes; /* We distribute the parity blocks across stripes */ - stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num < 1) - mirror_num = 1; + io_geom.stripe_index = + (io_geom.stripe_nr + io_geom.stripe_index) % + map->num_stripes; + if (op == BTRFS_MAP_READ && io_geom.mirror_num < 1) + io_geom.mirror_num = 1; } } else { /* @@ -6524,19 +6538,19 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - mirror_num = stripe_index + 1; + io_geom.stripe_index = io_geom.stripe_nr % map->num_stripes; + io_geom.stripe_nr /= map->num_stripes; + io_geom.mirror_num = io_geom.stripe_index + 1; } - if (stripe_index >= map->num_stripes) { + if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", - stripe_index, map->num_stripes); + io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } - num_alloc_stripes = num_stripes; + num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* @@ -6554,11 +6568,12 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. */ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - mirror_num)) { + io_geom.mirror_num)) { ret = set_io_stripe(fs_info, op, logical, length, smap, map, - stripe_index, stripe_offset, stripe_nr); + io_geom.stripe_index, io_geom.stripe_offset, + io_geom.stripe_nr); if (mirror_num_ret) - *mirror_num_ret = mirror_num; + *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; goto out; } @@ -6578,7 +6593,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - (op != BTRFS_MAP_READ || mirror_num > 1)) { + (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6588,12 +6603,13 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = map->start + - btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (int i = 0; i < num_stripes; i++) { + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * data_stripes); + for (int i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, op, logical, length, &bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); + (i + io_geom.stripe_nr) % io_geom.num_stripes, + io_geom.stripe_offset, + io_geom.stripe_nr); if (ret < 0) break; } @@ -6602,13 +6618,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ - for (i = 0; i < num_stripes; i++) { + for (i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + &bioc->stripes[i], map, + io_geom.stripe_index, + io_geom.stripe_offset, + io_geom.stripe_nr); if (ret < 0) break; - stripe_index++; + io_geom.stripe_index++; } } @@ -6619,18 +6637,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } if (op != BTRFS_MAP_READ) - max_errors = btrfs_chunk_max_errors(map); + io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, - &num_stripes, &max_errors); + &io_geom.num_stripes, &io_geom.max_errors); } *bioc_ret = bioc; - bioc->num_stripes = num_stripes; - bioc->max_errors = max_errors; - bioc->mirror_num = mirror_num; + bioc->num_stripes = io_geom.num_stripes; + bioc->max_errors = io_geom.max_errors; + bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing) { From 30e8534b538e8e7372e49516a8cddacdfd80f863 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:42:58 -0800 Subject: [PATCH 0896/1562] btrfs: factor out block-mapping for RAID0 Now that we have a container for the I/O geometry that has all the needed information for the block mappings of RAID0, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e3f75ede9174..21a53acd52ac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6362,6 +6362,15 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, return true; } +static void map_blocks_raid0(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + if (io_geom->op == BTRFS_MAP_READ) + io_geom->mirror_num = 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6449,10 +6458,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, up_read(&dev_replace->rwsem); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - io_geom.stripe_index = io_geom.stripe_nr % map->num_stripes; - io_geom.stripe_nr /= map->num_stripes; - if (op == BTRFS_MAP_READ) - io_geom.mirror_num = 1; + map_blocks_raid0(map, &io_geom); } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (op != BTRFS_MAP_READ) { io_geom.num_stripes = map->num_stripes; From 5e36aba8377b78b4ec8e15d29a1dee0d626d735d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:42:59 -0800 Subject: [PATCH 0897/1562] btrfs: factor out RAID1 block mapping Now that we have a container for the I/O geometry that has all the needed information for the block mappings of RAID1, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 21a53acd52ac..2d25e82f24a4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6371,6 +6371,26 @@ static void map_blocks_raid0(const struct btrfs_chunk_map *map, io_geom->mirror_num = 1; } +static void map_blocks_raid1(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->stripe_index = find_live_mirror(fs_info, map, 0, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index + 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6460,16 +6480,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { map_blocks_raid0(map, &io_geom); } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (op != BTRFS_MAP_READ) { - io_geom.num_stripes = map->num_stripes; - } else if (io_geom.mirror_num) { - io_geom.stripe_index = io_geom.mirror_num - 1; - } else { - io_geom.stripe_index = find_live_mirror(fs_info, map, 0, - dev_replace_is_ongoing); - io_geom.mirror_num = io_geom.stripe_index + 1; - } - + map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { if (op != BTRFS_MAP_READ) { io_geom.num_stripes = map->num_stripes; From 5aeb15c8ca0d0cbd30e21391d2c7e25554f1e65e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:00 -0800 Subject: [PATCH 0898/1562] btrfs: factor out block mapping for DUP profiles Now that we have a container for the I/O geometry that has all the needed information for the block mappings of DUP, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d25e82f24a4..921c64a7289a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6391,6 +6391,22 @@ static void map_blocks_raid1(struct btrfs_fs_info *fs_info, io_geom->mirror_num = io_geom->stripe_index + 1; } +static void map_blocks_dup(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->mirror_num = 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6482,14 +6498,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op != BTRFS_MAP_READ) { - io_geom.num_stripes = map->num_stripes; - } else if (io_geom.mirror_num) { - io_geom.stripe_index = io_geom.mirror_num - 1; - } else { - io_geom.mirror_num = 1; - } - + map_blocks_dup(map, &io_geom); } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes; From 8938f112b9c41aaf66f652fc18aa424d2990e15c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:01 -0800 Subject: [PATCH 0899/1562] btrfs: factor out block mapping for RAID10 Now that we have a container for the I/O geometry that has all the needed information for the block mappings of RAID10, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 921c64a7289a..125aa0f25d20 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6407,6 +6407,34 @@ static void map_blocks_dup(const struct btrfs_chunk_map *map, io_geom->mirror_num = 1; } +static void map_blocks_raid10(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + u32 factor = map->num_stripes / map->sub_stripes; + int old_stripe_index; + + io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; + io_geom->stripe_nr /= factor; + + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->sub_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index += io_geom->mirror_num - 1; + return; + } + + old_stripe_index = io_geom->stripe_index; + io_geom->stripe_index = find_live_mirror(fs_info, map, + io_geom->stripe_index, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6500,23 +6528,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { map_blocks_dup(map, &io_geom); } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u32 factor = map->num_stripes / map->sub_stripes; - - io_geom.stripe_index = (io_geom.stripe_nr % factor) * map->sub_stripes; - io_geom.stripe_nr /= factor; - - if (op != BTRFS_MAP_READ) - io_geom.num_stripes = map->sub_stripes; - else if (io_geom.mirror_num) - io_geom.stripe_index += io_geom.mirror_num - 1; - else { - int old_stripe_index = io_geom.stripe_index; - io_geom.stripe_index = find_live_mirror(fs_info, map, - io_geom.stripe_index, - dev_replace_is_ongoing); - io_geom.mirror_num = io_geom.stripe_index - old_stripe_index + 1; - } - + map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) { /* From d9d4ce9f297febc1463872475e4d1f6a97deb357 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:02 -0800 Subject: [PATCH 0900/1562] btrfs: reduce scope of data_stripes in btrfs_map_block Reduce the scope of 'data_stripes' in btrfs_map_block(). While the change alone may not make too much sense, it helps us factoring out a helper function for the block mapping of RAID56 I/O. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 125aa0f25d20..d5c55724666b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6478,7 +6478,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_chunk_map *map; struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - int data_stripes; int i; int ret = 0; int num_copies; @@ -6503,8 +6502,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (IS_ERR(map)) return PTR_ERR(map); - data_stripes = nr_data_stripes(map); - map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, io_geom.op, map_offset, &io_geom.stripe_nr, @@ -6530,6 +6527,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + int data_stripes = nr_data_stripes(map); + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) { /* * Needs full stripe mapping. @@ -6641,7 +6640,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = map->start + - btrfs_stripe_nr_to_offset(io_geom.stripe_nr * data_stripes); + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * + nr_data_stripes(map)); for (int i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, op, logical, length, &bioc->stripes[i], map, From 089221d3457b8756d6823be9857884d938af817c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:03 -0800 Subject: [PATCH 0901/1562] btrfs: factor out block mapping for RAID5/6 Now that we have a container for the I/O geometry that has all the needed information for the block mappings of RAID5 and RAID6, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 95 ++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d5c55724666b..19ad793e60fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6435,6 +6435,55 @@ static void map_blocks_raid10(struct btrfs_fs_info *fs_info, io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; } +static void map_blocks_raid56_write(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + u64 logical, u64 *length) +{ + int data_stripes = nr_data_stripes(map); + + /* + * Needs full stripe mapping. + * + * Push stripe_nr back to the start of the full stripe For those cases + * needing a full stripe, @stripe_nr is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, but + * that can be expensive. Here we just divide @stripe_nr with + * @data_stripes. + */ + io_geom->stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. Return all stripes */ + io_geom->num_stripes = map->num_stripes; + io_geom->max_errors = btrfs_chunk_max_errors(map); + + /* Return the length to the full stripe end. */ + *length = min(logical + *length, + io_geom->raid56_full_stripe_start + map->start + + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; + io_geom->stripe_index = 0; + io_geom->stripe_offset = 0; +} + +static void map_blocks_raid56_read(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + int data_stripes = nr_data_stripes(map); + + ASSERT(io_geom->mirror_num <= 1); + /* Just grab the data stripe directly. */ + io_geom->stripe_index = io_geom->stripe_nr % data_stripes; + io_geom->stripe_nr /= data_stripes; + + /* We distribute the parity blocks across stripes. */ + io_geom->stripe_index = + (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; + + if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) + io_geom->mirror_num = 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6527,48 +6576,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - int data_stripes = nr_data_stripes(map); - - if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) { - /* - * Needs full stripe mapping. - * - * Push stripe_nr back to the start of the full stripe - * For those cases needing a full stripe, @stripe_nr - * is the full stripe number. - * - * Originally we go raid56_full_stripe_start / full_stripe_len, - * but that can be expensive. Here we just divide - * @stripe_nr with @data_stripes. - */ - io_geom.stripe_nr /= data_stripes; - - /* RAID[56] write or recovery. Return all stripes */ - io_geom.num_stripes = map->num_stripes; - io_geom.max_errors = btrfs_chunk_max_errors(map); - - /* Return the length to the full stripe end */ - *length = min(logical + *length, - io_geom.raid56_full_stripe_start + - map->start + - btrfs_stripe_nr_to_offset( - data_stripes)) - - logical; - io_geom.stripe_index = 0; - io_geom.stripe_offset = 0; - } else { - ASSERT(io_geom.mirror_num <= 1); - /* Just grab the data stripe directly. */ - io_geom.stripe_index = io_geom.stripe_nr % data_stripes; - io_geom.stripe_nr /= data_stripes; - - /* We distribute the parity blocks across stripes */ - io_geom.stripe_index = - (io_geom.stripe_nr + io_geom.stripe_index) % - map->num_stripes; - if (op == BTRFS_MAP_READ && io_geom.mirror_num < 1) - io_geom.mirror_num = 1; - } + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) + map_blocks_raid56_write(map, &io_geom, logical, length); + else + map_blocks_raid56_read(map, &io_geom); } else { /* * After this, stripe_nr is the number of stripes on this From a16fb8c6f61863f18fab61eeba10a457ff6d71d2 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:04 -0800 Subject: [PATCH 0902/1562] btrfs: factor out block mapping for single profiles Now that we have a container for the I/O geometry that has all the needed information for the block mappings of SINGLE profiles, factor out a helper calculating this information. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19ad793e60fa..046f5f6345eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6484,6 +6484,14 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map, io_geom->mirror_num = 1; } +static void map_blocks_single(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + io_geom->mirror_num = io_geom->stripe_index + 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6586,9 +6594,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - io_geom.stripe_index = io_geom.stripe_nr % map->num_stripes; - io_geom.stripe_nr /= map->num_stripes; - io_geom.mirror_num = io_geom.stripe_index + 1; + map_blocks_single(map, &io_geom); } if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, From b55b307785ad88298914bc5c18c7d37bc5b88cb7 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:05 -0800 Subject: [PATCH 0903/1562] btrfs: change block mapping to switch/case in btrfs_map_block Now that all the per-profile if/else statement blocks have been converted to calls to helper the conversion to switch/case is straightforward. Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 046f5f6345eb..c79708df2a12 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6575,26 +6575,36 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID0: map_blocks_raid0(map, &io_geom); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + break; + case BTRFS_BLOCK_GROUP_DUP: map_blocks_dup(map, &io_geom); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + break; + case BTRFS_BLOCK_GROUP_RAID10: map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) map_blocks_raid56_write(map, &io_geom, logical, length); else map_blocks_raid56_read(map, &io_geom); - } else { + break; + default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ map_blocks_single(map, &io_geom); + break; } if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, From 89f547c6cc61baa77bb226c0a5284f56871d6080 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:06 -0800 Subject: [PATCH 0904/1562] btrfs: open code set_io_stripe for RAID56 Open code set_io_stripe() for RAID56, as it a) uses a different method to calculate the stripe_index b) doesn't need to go through raid-stripe-tree mapping code. Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c79708df2a12..8b858244f0a8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6670,13 +6670,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_stripe_nr_to_offset(io_geom.stripe_nr * nr_data_stripes(map)); for (int i = 0; i < io_geom.num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, - (i + io_geom.stripe_nr) % io_geom.num_stripes, - io_geom.stripe_offset, - io_geom.stripe_nr); - if (ret < 0) - break; + struct btrfs_io_stripe *dst = &bioc->stripes[i]; + u32 stripe_index; + + stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; + dst->dev = map->stripes[stripe_index].dev; + dst->physical = + map->stripes[stripe_index].physical + + io_geom.stripe_offset + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); } } else { /* From 6edf68223679be380e567e664f97043871133537 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:07 -0800 Subject: [PATCH 0905/1562] btrfs: pass struct btrfs_io_geometry to set_io_stripe Instead of passing three members of 'struct btrfs_io_geometry' into set_io_stripe() pass a pointer to the whole structure and then get the needed members out of btrfs_io_geometry. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8b858244f0a8..974b5e1598f1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6325,19 +6325,22 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, enum btrfs_map_op op, return U64_MAX; } -static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, struct btrfs_io_stripe *dst, - struct btrfs_chunk_map *map, u32 stripe_index, - u64 stripe_offset, u64 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, + u64 *length, struct btrfs_io_stripe *dst, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) { - dst->dev = map->stripes[stripe_index].dev; + dst->dev = map->stripes[io_geom->stripe_index].dev; - if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && + btrfs_need_stripe_tree_update(fs_info, map->type)) return btrfs_get_raid_extent_offset(fs_info, logical, length, - map->type, stripe_index, dst); + map->type, + io_geom->stripe_index, dst); - dst->physical = map->stripes[stripe_index].physical + - stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + dst->physical = map->stripes[io_geom->stripe_index].physical + + io_geom->stripe_offset + + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); return 0; } @@ -6633,9 +6636,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, io_geom.mirror_num)) { - ret = set_io_stripe(fs_info, op, logical, length, smap, map, - io_geom.stripe_index, io_geom.stripe_offset, - io_geom.stripe_nr); + ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; @@ -6686,11 +6687,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * stripe into the bioc. */ for (i = 0; i < io_geom.num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, - io_geom.stripe_index, - io_geom.stripe_offset, - io_geom.stripe_nr); + ret = set_io_stripe(fs_info, logical, length, + &bioc->stripes[i], map, &io_geom); if (ret < 0) break; io_geom.stripe_index++; From e94dfb7a2935cb91faca88bf7136177d1ce0dda8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Dec 2023 06:43:08 -0800 Subject: [PATCH 0906/1562] btrfs: pass btrfs_io_geometry into btrfs_max_io_len Instead of passing three individual members of 'struct btrfs_io_geometry' into btrfs_max_io_len(), pass a pointer to btrfs_io_geometry. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 974b5e1598f1..4c32497311d2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6277,17 +6277,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, enum btrfs_map_op op, - u64 offset, u32 *stripe_nr, u64 *stripe_offset, - u64 *full_stripe_start) +static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, + struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; - *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(*stripe_offset < U32_MAX); + io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6302,18 +6301,17 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, enum btrfs_map_op op, * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. */ - *full_stripe_start = - btrfs_stripe_nr_to_offset( - rounddown(*stripe_nr, nr_data_stripes(map))); + io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( + rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(*full_stripe_start + full_stripe_len > offset); - ASSERT(*full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ - if (op == BTRFS_MAP_WRITE) - return full_stripe_len - (offset - *full_stripe_start); + if (io_geom->op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* @@ -6321,7 +6319,7 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, enum btrfs_map_op op, * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return BTRFS_STRIPE_LEN - *stripe_offset; + return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } @@ -6564,9 +6562,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; - max_len = btrfs_max_io_len(map, io_geom.op, map_offset, &io_geom.stripe_nr, - &io_geom.stripe_offset, - &io_geom.raid56_full_stripe_start); + max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); down_read(&dev_replace->rwsem); From 57eb6dcd32cf6b49c38eff81e60e8fd471aa05a8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 16 Dec 2023 17:59:38 +0100 Subject: [PATCH 0907/1562] platform/chrome/wilco_ec: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/898d9aa181a84f1d17725ca047004bad532c37e9.1702745959.git.christophe.jaillet@wanadoo.fr Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/wilco_ec/event.c | 4 ++-- drivers/platform/chrome/wilco_ec/telemetry.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/platform/chrome/wilco_ec/event.c b/drivers/platform/chrome/wilco_ec/event.c index f80a7c83cfba..13291fb4214e 100644 --- a/drivers/platform/chrome/wilco_ec/event.c +++ b/drivers/platform/chrome/wilco_ec/event.c @@ -495,7 +495,7 @@ static int event_device_add(struct acpi_device *adev) free_dev_data: hangup_device(dev_data); free_minor: - ida_simple_remove(&event_ida, minor); + ida_free(&event_ida, minor); return error; } @@ -504,7 +504,7 @@ static void event_device_remove(struct acpi_device *adev) struct event_device_data *dev_data = adev->driver_data; cdev_device_del(&dev_data->cdev, &dev_data->dev); - ida_simple_remove(&event_ida, MINOR(dev_data->dev.devt)); + ida_free(&event_ida, MINOR(dev_data->dev.devt)); hangup_device(dev_data); } diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c index 253098bace63..b7c616f3d179 100644 --- a/drivers/platform/chrome/wilco_ec/telemetry.c +++ b/drivers/platform/chrome/wilco_ec/telemetry.c @@ -372,7 +372,7 @@ static int telem_device_probe(struct platform_device *pdev) dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); if (!dev_data) { - ida_simple_remove(&telem_ida, minor); + ida_free(&telem_ida, minor); return -ENOMEM; } @@ -393,7 +393,7 @@ static int telem_device_probe(struct platform_device *pdev) error = cdev_device_add(&dev_data->cdev, &dev_data->dev); if (error) { put_device(&dev_data->dev); - ida_simple_remove(&telem_ida, minor); + ida_free(&telem_ida, minor); return error; } @@ -405,7 +405,7 @@ static void telem_device_remove(struct platform_device *pdev) struct telem_device_data *dev_data = platform_get_drvdata(pdev); cdev_device_del(&dev_data->cdev, &dev_data->dev); - ida_simple_remove(&telem_ida, MINOR(dev_data->dev.devt)); + ida_free(&telem_ida, MINOR(dev_data->dev.devt)); put_device(&dev_data->dev); } From 0990319a0400db1d6069b5549327cd9105a266d5 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Fri, 15 Dec 2023 16:37:06 +0100 Subject: [PATCH 0908/1562] cpufreq: armada-8k: Fix parameter type warning The second parameter of clk_get() is of type 'const char *', so use NULL instead of the integer 0 to resolve a sparse warning: drivers/cpufreq/armada-8k-cpufreq.c:60:40: warning: Using plain integer as NULL pointer drivers/cpufreq/armada-8k-cpufreq.c:168:40: warning: Using plain integer as NULL pointer Reported-by: kernel test robot Signed-off-by: Gregory CLEMENT Reviewed-by: Andrew Lunn Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-8k-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c index 8afefdea4d80..ce5a5641b6dd 100644 --- a/drivers/cpufreq/armada-8k-cpufreq.c +++ b/drivers/cpufreq/armada-8k-cpufreq.c @@ -57,7 +57,7 @@ static void __init armada_8k_get_sharing_cpus(struct clk *cur_clk, continue; } - clk = clk_get(cpu_dev, 0); + clk = clk_get(cpu_dev, NULL); if (IS_ERR(clk)) { pr_warn("Cannot get clock for CPU %d\n", cpu); } else { @@ -165,7 +165,7 @@ static int __init armada_8k_cpufreq_init(void) continue; } - clk = clk_get(cpu_dev, 0); + clk = clk_get(cpu_dev, NULL); if (IS_ERR(clk)) { pr_err("Cannot get clock for CPU %d\n", cpu); From e5aba911dee5e20fa82efbe13e0af8f38ea459e7 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 15 Dec 2023 00:13:37 +0800 Subject: [PATCH 0909/1562] erofs: fix ztailpacking for subpage compressed blocks `pageofs_in` should be the compressed data offset of the page rather than of the block. Acked-by: Chao Yu Reviewed-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231214161337.753049-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d02989466711..5d5640173412 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -810,7 +810,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); } else { pcl->obj.index = erofs_blknr(sb, map->m_pa); @@ -887,6 +886,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ From 1ca01520148af399899ed66af5c78330bb9ecaf2 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:56 +0800 Subject: [PATCH 0910/1562] erofs: refine z_erofs_transform_plain() for sub-page block support Sub-page block support is still unusable even with previous commits if interlaced PLAIN pclusters exist. Such pclusters can be found if the fragment feature is enabled. This commit tries to handle "the head part" of interlaced PLAIN pclusters first: it was once explained in commit fdffc091e6f9 ("erofs: support interlaced uncompressed data for compressed files"). It uses a unique way for both shifted and interlaced PLAIN pclusters. As an added bonus, PLAIN pclusters larger than the block size is also supported now for the upcoming large lclusters. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-5-hsiangkao@linux.alibaba.com --- fs/erofs/decompressor.c | 81 ++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index e0d609c3958f..d08a6ee23ac5 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -320,43 +320,58 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; - } - - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); } + rq->outputsize -= cur; } - kunmap_local(src); + + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min(insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); return 0; } From 0ee3a0d59e007320167a2e9f4b8bf1304ada7771 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:57 +0800 Subject: [PATCH 0911/1562] erofs: enable sub-page compressed block support Let's just disable cached decompression and inplace I/Os for partial pages as the first step in order to enable sub-page block initial support. In other words, currently it works primarily based on temporary short-lived pages. Don't expect too much in terms of performance. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-6-hsiangkao@linux.alibaba.com --- fs/erofs/inode.c | 6 ++++-- fs/erofs/zdata.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 14a79d3226ab..3d616dea55dc 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -259,8 +259,10 @@ static int erofs_fill_inode(struct inode *inode) if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; err = 0; goto out_unlock; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5d5640173412..8264936b8612 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -563,6 +563,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; @@ -967,12 +969,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); - split = 0; end = PAGE_SIZE; repeat: @@ -1021,7 +1023,7 @@ repeat: * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); From ac052d8c08f9da225bea09c7e71527831368462b Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:54 +0530 Subject: [PATCH 0912/1562] platform/x86/amd/pmf: Add PMF TEE interface AMD PMF driver loads the PMF TA (Trusted Application) into the AMD ASP's (AMD Security Processor) TEE (Trusted Execution Environment). PMF Trusted Application is a secured firmware placed under /lib/firmware/amdtee gets loaded only when the TEE environment is initialized. Add the initial code path to build these pipes. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-2-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/Kconfig | 1 + drivers/platform/x86/amd/pmf/Makefile | 3 +- drivers/platform/x86/amd/pmf/core.c | 10 ++- drivers/platform/x86/amd/pmf/pmf.h | 16 ++++ drivers/platform/x86/amd/pmf/tee-if.c | 105 ++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 drivers/platform/x86/amd/pmf/tee-if.c diff --git a/drivers/platform/x86/amd/pmf/Kconfig b/drivers/platform/x86/amd/pmf/Kconfig index 3064bc8ea167..32a029e8db80 100644 --- a/drivers/platform/x86/amd/pmf/Kconfig +++ b/drivers/platform/x86/amd/pmf/Kconfig @@ -9,6 +9,7 @@ config AMD_PMF depends on POWER_SUPPLY depends on AMD_NB select ACPI_PLATFORM_PROFILE + depends on TEE help This driver provides support for the AMD Platform Management Framework. The goal is to enhance end user experience by making AMD PCs smarter, diff --git a/drivers/platform/x86/amd/pmf/Makefile b/drivers/platform/x86/amd/pmf/Makefile index fdededf54392..d2746ee7369f 100644 --- a/drivers/platform/x86/amd/pmf/Makefile +++ b/drivers/platform/x86/amd/pmf/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_AMD_PMF) += amd-pmf.o amd-pmf-objs := core.o acpi.o sps.o \ - auto-mode.o cnqf.o + auto-mode.o cnqf.o \ + tee-if.o diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 78ed3ee22555..ec92d1cc0dac 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -309,13 +309,13 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) dev_dbg(dev->dev, "SPS enabled and Platform Profiles registered\n"); } - /* Enable Auto Mode */ - if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { + if (!amd_pmf_init_smart_pc(dev)) { + dev_dbg(dev->dev, "Smart PC Solution Enabled\n"); + } else if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_init_auto_mode(dev); dev_dbg(dev->dev, "Auto Mode Init done\n"); } else if (is_apmf_func_supported(dev, APMF_FUNC_DYN_SLIDER_AC) || is_apmf_func_supported(dev, APMF_FUNC_DYN_SLIDER_DC)) { - /* Enable Cool n Quiet Framework (CnQF) */ ret = amd_pmf_init_cnqf(dev); if (ret) dev_warn(dev->dev, "CnQF Init failed\n"); @@ -330,7 +330,9 @@ static void amd_pmf_deinit_features(struct amd_pmf_dev *dev) amd_pmf_deinit_sps(dev); } - if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { + if (!dev->smart_pc_enabled) { + amd_pmf_deinit_smart_pc(dev); + } else if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_deinit_auto_mode(dev); } else if (is_apmf_func_supported(dev, APMF_FUNC_DYN_SLIDER_AC) || is_apmf_func_supported(dev, APMF_FUNC_DYN_SLIDER_DC)) { diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index deba88e6e4c8..bd40458937ba 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -179,6 +179,12 @@ struct amd_pmf_dev { bool cnqf_enabled; bool cnqf_supported; struct notifier_block pwr_src_notifier; + /* Smart PC solution builder */ + struct tee_context *tee_ctx; + struct tee_shm *fw_shm_pool; + u32 session_id; + void *shbuf; + bool smart_pc_enabled; }; struct apmf_sps_prop_granular { @@ -389,6 +395,13 @@ struct apmf_dyn_slider_output { struct apmf_cnqf_power_set ps[APMF_CNQF_MAX]; } __packed; +struct ta_pmf_shared_memory { + int command_id; + int resp_id; + u32 pmf_result; + u32 if_version; +}; + /* Core Layer */ int apmf_acpi_init(struct amd_pmf_dev *pmf_dev); void apmf_acpi_deinit(struct amd_pmf_dev *pmf_dev); @@ -433,4 +446,7 @@ void amd_pmf_deinit_cnqf(struct amd_pmf_dev *dev); int amd_pmf_trans_cnqf(struct amd_pmf_dev *dev, int socket_power, ktime_t time_lapsed_ms); extern const struct attribute_group cnqf_feature_attribute_group; +/* Smart PC builder Layer */ +int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev); +void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev); #endif /* PMF_H */ diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c new file mode 100644 index 000000000000..6ec8c3726624 --- /dev/null +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Platform Management Framework Driver - TEE Interface + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Shyam Sundar S K + */ + +#include +#include +#include "pmf.h" + +#define MAX_TEE_PARAM 4 +static const uuid_t amd_pmf_ta_uuid = UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, + 0xb1, 0x2d, 0xc5, 0x29, 0xb1, 0x3d, 0x85, 0x43); + +static int amd_pmf_amdtee_ta_match(struct tee_ioctl_version_data *ver, const void *data) +{ + return ver->impl_id == TEE_IMPL_ID_AMDTEE; +} + +static int amd_pmf_ta_open_session(struct tee_context *ctx, u32 *id) +{ + struct tee_ioctl_open_session_arg sess_arg = {}; + int rc; + + export_uuid(sess_arg.uuid, &amd_pmf_ta_uuid); + sess_arg.clnt_login = TEE_IOCTL_LOGIN_PUBLIC; + sess_arg.num_params = 0; + + rc = tee_client_open_session(ctx, &sess_arg, NULL); + if (rc < 0 || sess_arg.ret != 0) { + pr_err("Failed to open TEE session err:%#x, rc:%d\n", sess_arg.ret, rc); + return rc; + } + + *id = sess_arg.session; + + return rc; +} + +static int amd_pmf_tee_init(struct amd_pmf_dev *dev) +{ + u32 size; + int ret; + + dev->tee_ctx = tee_client_open_context(NULL, amd_pmf_amdtee_ta_match, NULL, NULL); + if (IS_ERR(dev->tee_ctx)) { + dev_err(dev->dev, "Failed to open TEE context\n"); + return PTR_ERR(dev->tee_ctx); + } + + ret = amd_pmf_ta_open_session(dev->tee_ctx, &dev->session_id); + if (ret) { + dev_err(dev->dev, "Failed to open TA session (%d)\n", ret); + ret = -EINVAL; + goto out_ctx; + } + + size = sizeof(struct ta_pmf_shared_memory); + dev->fw_shm_pool = tee_shm_alloc_kernel_buf(dev->tee_ctx, size); + if (IS_ERR(dev->fw_shm_pool)) { + dev_err(dev->dev, "Failed to alloc TEE shared memory\n"); + ret = PTR_ERR(dev->fw_shm_pool); + goto out_sess; + } + + dev->shbuf = tee_shm_get_va(dev->fw_shm_pool, 0); + if (IS_ERR(dev->shbuf)) { + dev_err(dev->dev, "Failed to get TEE virtual address\n"); + ret = PTR_ERR(dev->shbuf); + goto out_shm; + } + dev_dbg(dev->dev, "TEE init done\n"); + + return 0; + +out_shm: + tee_shm_free(dev->fw_shm_pool); +out_sess: + tee_client_close_session(dev->tee_ctx, dev->session_id); +out_ctx: + tee_client_close_context(dev->tee_ctx); + + return ret; +} + +static void amd_pmf_tee_deinit(struct amd_pmf_dev *dev) +{ + tee_shm_free(dev->fw_shm_pool); + tee_client_close_session(dev->tee_ctx, dev->session_id); + tee_client_close_context(dev->tee_ctx); +} + +int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) +{ + return amd_pmf_tee_init(dev); +} + +void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) +{ + amd_pmf_tee_deinit(dev); +} From ae82cef7d9c53cad0852d2d79d430b210432a025 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:55 +0530 Subject: [PATCH 0913/1562] platform/x86/amd/pmf: Add support for PMF-TA interaction PMF TA (Trusted Application) loads via the TEE environment into the AMD ASP. PMF-TA supports two commands: 1) Init: Initialize the TA with the PMF Smart PC policy binary and start the policy engine. A policy is a combination of inputs and outputs, where; - the inputs are the changing dynamics of the system like the user behaviour, system heuristics etc. - the outputs, which are the actions to be set on the system which lead to better power management and enhanced user experience. PMF driver acts as a central manager in this case to supply the inputs required to the TA (either by getting the information from the other kernel subsystems or from userland) 2) Enact: Enact the output actions from the TA. The action could be applying a new thermal limit to boost/throttle the power limits or change system behavior. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-3-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/pmf.h | 10 +++ drivers/platform/x86/amd/pmf/tee-if.c | 97 ++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index bd40458937ba..a24e34e42032 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -59,6 +59,9 @@ #define ARG_NONE 0 #define AVG_SAMPLE_SIZE 3 +/* TA macros */ +#define PMF_TA_IF_VERSION_MAJOR 1 + /* AMD PMF BIOS interfaces */ struct apmf_verify_interface { u16 size; @@ -184,6 +187,7 @@ struct amd_pmf_dev { struct tee_shm *fw_shm_pool; u32 session_id; void *shbuf; + struct delayed_work pb_work; bool smart_pc_enabled; }; @@ -395,6 +399,12 @@ struct apmf_dyn_slider_output { struct apmf_cnqf_power_set ps[APMF_CNQF_MAX]; } __packed; +/* Command ids for TA communication */ +enum ta_pmf_command { + TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE, + TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES, +}; + struct ta_pmf_shared_memory { int command_id; int resp_id; diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 6ec8c3726624..4036f435f1e2 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -13,9 +13,96 @@ #include "pmf.h" #define MAX_TEE_PARAM 4 + +/* Policy binary actions sampling frequency (in ms) */ +static int pb_actions_ms = MSEC_PER_SEC; +#ifdef CONFIG_AMD_PMF_DEBUG +module_param(pb_actions_ms, int, 0644); +MODULE_PARM_DESC(pb_actions_ms, "Policy binary actions sampling frequency (default = 1000ms)"); +#endif + static const uuid_t amd_pmf_ta_uuid = UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, 0xb1, 0x2d, 0xc5, 0x29, 0xb1, 0x3d, 0x85, 0x43); +static void amd_pmf_prepare_args(struct amd_pmf_dev *dev, int cmd, + struct tee_ioctl_invoke_arg *arg, + struct tee_param *param) +{ + memset(arg, 0, sizeof(*arg)); + memset(param, 0, MAX_TEE_PARAM * sizeof(*param)); + + arg->func = cmd; + arg->session = dev->session_id; + arg->num_params = MAX_TEE_PARAM; + + /* Fill invoke cmd params */ + param[0].u.memref.size = sizeof(struct ta_pmf_shared_memory); + param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT; + param[0].u.memref.shm = dev->fw_shm_pool; + param[0].u.memref.shm_offs = 0; +} + +static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) +{ + struct ta_pmf_shared_memory *ta_sm = NULL; + struct tee_param param[MAX_TEE_PARAM]; + struct tee_ioctl_invoke_arg arg; + int ret = 0; + + if (!dev->tee_ctx) + return -ENODEV; + + ta_sm = dev->shbuf; + memset(ta_sm, 0, sizeof(*ta_sm)); + ta_sm->command_id = TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES; + ta_sm->if_version = PMF_TA_IF_VERSION_MAJOR; + + amd_pmf_prepare_args(dev, TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES, &arg, param); + + ret = tee_client_invoke_func(dev->tee_ctx, &arg, param); + if (ret < 0 || arg.ret != 0) { + dev_err(dev->dev, "TEE enact cmd failed. err: %x, ret:%d\n", arg.ret, ret); + return ret; + } + + return 0; +} + +static int amd_pmf_invoke_cmd_init(struct amd_pmf_dev *dev) +{ + struct ta_pmf_shared_memory *ta_sm = NULL; + struct tee_param param[MAX_TEE_PARAM]; + struct tee_ioctl_invoke_arg arg; + int ret = 0; + + if (!dev->tee_ctx) { + dev_err(dev->dev, "Failed to get TEE context\n"); + return -ENODEV; + } + + ta_sm = dev->shbuf; + ta_sm->command_id = TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE; + ta_sm->if_version = PMF_TA_IF_VERSION_MAJOR; + + amd_pmf_prepare_args(dev, TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE, &arg, param); + + ret = tee_client_invoke_func(dev->tee_ctx, &arg, param); + if (ret < 0 || arg.ret != 0) { + dev_err(dev->dev, "Failed to invoke TEE init cmd. err: %x, ret:%d\n", arg.ret, ret); + return ret; + } + + return ta_sm->pmf_result; +} + +static void amd_pmf_invoke_cmd(struct work_struct *work) +{ + struct amd_pmf_dev *dev = container_of(work, struct amd_pmf_dev, pb_work.work); + + amd_pmf_invoke_cmd_enact(dev); + schedule_delayed_work(&dev->pb_work, msecs_to_jiffies(pb_actions_ms)); +} + static int amd_pmf_amdtee_ta_match(struct tee_ioctl_version_data *ver, const void *data) { return ver->impl_id == TEE_IMPL_ID_AMDTEE; @@ -96,10 +183,18 @@ static void amd_pmf_tee_deinit(struct amd_pmf_dev *dev) int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) { - return amd_pmf_tee_init(dev); + int ret; + + ret = amd_pmf_tee_init(dev); + if (ret) + return ret; + + INIT_DELAYED_WORK(&dev->pb_work, amd_pmf_invoke_cmd); + return 0; } void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) { + cancel_delayed_work_sync(&dev->pb_work); amd_pmf_tee_deinit(dev); } From 2b3a7f06caaf1aa7379cc0233462799852fcd8b4 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:56 +0530 Subject: [PATCH 0914/1562] platform/x86/amd/pmf: Change return type of amd_pmf_set_dram_addr() In the current code, the metrics table information was required only for auto-mode or CnQF at a given time. Hence keeping the return type of amd_pmf_set_dram_addr() as static made sense. But with the addition of Smart PC builder feature, the metrics table information has to be shared by the Smart PC also and this feature resides outside of core.c. To make amd_pmf_set_dram_addr() visible outside of core.c make it as a non-static function and move the allocation of memory for metrics table from amd_pmf_init_metrics_table() to amd_pmf_set_dram_addr() as amd_pmf_set_dram_addr() is the common function to set the DRAM address. Add a suspend handler that can free up the allocated memory for getting the metrics table information. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-4-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 41 ++++++++++++++++++++++------- drivers/platform/x86/amd/pmf/pmf.h | 1 + 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ec92d1cc0dac..9953619a830b 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -251,29 +251,37 @@ static const struct pci_device_id pmf_pci_ids[] = { { } }; -static void amd_pmf_set_dram_addr(struct amd_pmf_dev *dev) +int amd_pmf_set_dram_addr(struct amd_pmf_dev *dev, bool alloc_buffer) { u64 phys_addr; u32 hi, low; + /* Get Metrics Table Address */ + if (alloc_buffer) { + dev->buf = kzalloc(sizeof(dev->m_table), GFP_KERNEL); + if (!dev->buf) + return -ENOMEM; + } + phys_addr = virt_to_phys(dev->buf); hi = phys_addr >> 32; low = phys_addr & GENMASK(31, 0); amd_pmf_send_cmd(dev, SET_DRAM_ADDR_HIGH, 0, hi, NULL); amd_pmf_send_cmd(dev, SET_DRAM_ADDR_LOW, 0, low, NULL); + + return 0; } int amd_pmf_init_metrics_table(struct amd_pmf_dev *dev) { - /* Get Metrics Table Address */ - dev->buf = kzalloc(sizeof(dev->m_table), GFP_KERNEL); - if (!dev->buf) - return -ENOMEM; + int ret; INIT_DELAYED_WORK(&dev->work_buffer, amd_pmf_get_metrics); - amd_pmf_set_dram_addr(dev); + ret = amd_pmf_set_dram_addr(dev, true); + if (ret) + return ret; /* * Start collecting the metrics data after a small delay @@ -284,17 +292,30 @@ int amd_pmf_init_metrics_table(struct amd_pmf_dev *dev) return 0; } -static int amd_pmf_resume_handler(struct device *dev) +static int amd_pmf_suspend_handler(struct device *dev) { struct amd_pmf_dev *pdev = dev_get_drvdata(dev); - if (pdev->buf) - amd_pmf_set_dram_addr(pdev); + kfree(pdev->buf); return 0; } -static DEFINE_SIMPLE_DEV_PM_OPS(amd_pmf_pm, NULL, amd_pmf_resume_handler); +static int amd_pmf_resume_handler(struct device *dev) +{ + struct amd_pmf_dev *pdev = dev_get_drvdata(dev); + int ret; + + if (pdev->buf) { + ret = amd_pmf_set_dram_addr(pdev, false); + if (ret) + return ret; + } + + return 0; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(amd_pmf_pm, amd_pmf_suspend_handler, amd_pmf_resume_handler); static void amd_pmf_init_features(struct amd_pmf_dev *dev) { diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index a24e34e42032..6c1aba5d2027 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -421,6 +421,7 @@ int amd_pmf_init_metrics_table(struct amd_pmf_dev *dev); int amd_pmf_get_power_source(void); int apmf_install_handler(struct amd_pmf_dev *pmf_dev); int apmf_os_power_slider_update(struct amd_pmf_dev *dev, u8 flag); +int amd_pmf_set_dram_addr(struct amd_pmf_dev *dev, bool alloc_buffer); /* SPS Layer */ int amd_pmf_get_pprof_modes(struct amd_pmf_dev *pmf); From 7c45534afa4435c9fceeeb8ca33c0fdc269c2240 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:57 +0530 Subject: [PATCH 0915/1562] platform/x86/amd/pmf: Add support for PMF Policy Binary PMF Policy binary is a encrypted and signed binary that will be part of the BIOS. PMF driver via the ACPI interface checks the existence of Smart PC bit. If the advertised bit is found, PMF driver walks the acpi namespace to find out the policy binary size and the address which has to be passed to the TA during the TA init sequence. The policy binary is comprised of inputs (or the events) and outputs (or the actions). With the PMF ecosystem, OEMs generate the policy binary (or could be multiple binaries) that contains a supported set of inputs and outputs which could be specifically carved out for each usage segment (or for each user also) that could influence the system behavior either by enriching the user experience or/and boost/throttle power limits. Once the TA init command succeeds, the PMF driver sends the changing events in the current environment to the TA for a constant sampling frequency time (the event here could be a lid close or open) and if the policy binary has corresponding action built within it, the TA sends the action for it in the subsequent enact command. If the inputs sent to the TA has no output defined in the policy binary generated by OEMs, there will be no action to be performed by the PMF driver. Example policies: 1) if slider is performance ; set the SPL to 40W Here PMF driver registers with the platform profile interface and when the slider position is changed, PMF driver lets the TA know about this. TA sends back an action to update the Sustained Power Limit (SPL). PMF driver updates this limit via the PMFW mailbox. 2) if user_away ; then lock the system Here PMF driver hooks to the AMD SFH driver to know the user presence and send the inputs to TA and if the condition is met, the TA sends the action of locking the system. PMF driver generates a uevent and based on the udev rule in the userland the system gets locked with systemctl. The intent here is to provide the OEM's to make a policy to lock the system when the user is away ; but the userland can make a choice to ignore it. The OEMs will have an utility to create numerous such policies and the policies shall be reviewed by AMD before signing and encrypting them. Policies are shared between operating systems to have seemless user experience. Since all this action has to happen via the "amdtee" driver, currently there is no caller for it in the kernel which can load the amdtee driver. Without amdtee driver loading onto the system the "tee" calls shall fail from the PMF driver. Hence an explicit MODULE_SOFTDEP has been added to address this. Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-5-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/Kconfig | 2 +- drivers/platform/x86/amd/pmf/acpi.c | 37 ++++++ drivers/platform/x86/amd/pmf/core.c | 1 + drivers/platform/x86/amd/pmf/pmf.h | 141 +++++++++++++++++++++++ drivers/platform/x86/amd/pmf/tee-if.c | 158 +++++++++++++++++++++++++- 5 files changed, 336 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/Kconfig b/drivers/platform/x86/amd/pmf/Kconfig index 32a029e8db80..f246252bddd8 100644 --- a/drivers/platform/x86/amd/pmf/Kconfig +++ b/drivers/platform/x86/amd/pmf/Kconfig @@ -9,7 +9,7 @@ config AMD_PMF depends on POWER_SUPPLY depends on AMD_NB select ACPI_PLATFORM_PROFILE - depends on TEE + depends on TEE && AMDTEE help This driver provides support for the AMD Platform Management Framework. The goal is to enhance end user experience by making AMD PCs smarter, diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index 3fc5e4547d9f..4ec7957eb707 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -286,6 +286,43 @@ int apmf_install_handler(struct amd_pmf_dev *pmf_dev) return 0; } +static acpi_status apmf_walk_resources(struct acpi_resource *res, void *data) +{ + struct amd_pmf_dev *dev = data; + + switch (res->type) { + case ACPI_RESOURCE_TYPE_ADDRESS64: + dev->policy_addr = res->data.address64.address.minimum; + dev->policy_sz = res->data.address64.address.address_length; + break; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + dev->policy_addr = res->data.fixed_memory32.address; + dev->policy_sz = res->data.fixed_memory32.address_length; + break; + } + + if (!dev->policy_addr || dev->policy_sz > POLICY_BUF_MAX_SZ || dev->policy_sz == 0) { + pr_err("Incorrect Policy params, possibly a SBIOS bug\n"); + return AE_ERROR; + } + + return AE_OK; +} + +int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev) +{ + acpi_handle ahandle = ACPI_HANDLE(pmf_dev->dev); + acpi_status status; + + status = acpi_walk_resources(ahandle, METHOD_NAME__CRS, apmf_walk_resources, pmf_dev); + if (ACPI_FAILURE(status)) { + dev_err(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); + return -EINVAL; + } + + return 0; +} + void apmf_acpi_deinit(struct amd_pmf_dev *pmf_dev) { acpi_handle ahandle = ACPI_HANDLE(pmf_dev->dev); diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 9953619a830b..c10d40b33667 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -471,3 +471,4 @@ module_platform_driver(amd_pmf_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("AMD Platform Management Framework Driver"); +MODULE_SOFTDEP("pre: amdtee"); diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 6c1aba5d2027..031e6d3ebd4d 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -14,6 +14,11 @@ #include #include +#define POLICY_BUF_MAX_SZ 0x4b000 +#define POLICY_SIGN_COOKIE 0x31535024 +#define POLICY_COOKIE_OFFSET 0x10 +#define POLICY_COOKIE_LEN 0x14 + /* APMF Functions */ #define APMF_FUNC_VERIFY_INTERFACE 0 #define APMF_FUNC_GET_SYS_PARAMS 1 @@ -59,8 +64,21 @@ #define ARG_NONE 0 #define AVG_SAMPLE_SIZE 3 +/* Policy Actions */ +#define PMF_POLICY_SPL 2 +#define PMF_POLICY_SPPT 3 +#define PMF_POLICY_FPPT 4 +#define PMF_POLICY_SPPT_APU_ONLY 5 +#define PMF_POLICY_STT_MIN 6 +#define PMF_POLICY_STT_SKINTEMP_APU 7 +#define PMF_POLICY_STT_SKINTEMP_HS2 8 + /* TA macros */ #define PMF_TA_IF_VERSION_MAJOR 1 +#define TA_PMF_ACTION_MAX 32 +#define TA_PMF_UNDO_MAX 8 +#define TA_OUTPUT_RESERVED_MEM 906 +#define MAX_OPERATION_PARAMS 4 /* AMD PMF BIOS interfaces */ struct apmf_verify_interface { @@ -183,11 +201,16 @@ struct amd_pmf_dev { bool cnqf_supported; struct notifier_block pwr_src_notifier; /* Smart PC solution builder */ + unsigned char *policy_buf; + u32 policy_sz; struct tee_context *tee_ctx; struct tee_shm *fw_shm_pool; u32 session_id; void *shbuf; struct delayed_work pb_work; + struct pmf_action_table *prev_data; + u64 policy_addr; + void *policy_base; bool smart_pc_enabled; }; @@ -399,17 +422,134 @@ struct apmf_dyn_slider_output { struct apmf_cnqf_power_set ps[APMF_CNQF_MAX]; } __packed; +enum smart_pc_status { + PMF_SMART_PC_ENABLED, + PMF_SMART_PC_DISABLED, +}; + +/* Smart PC - TA internals */ +enum ta_slider { + TA_BEST_BATTERY, + TA_BETTER_BATTERY, + TA_BETTER_PERFORMANCE, + TA_BEST_PERFORMANCE, + TA_MAX, +}; + /* Command ids for TA communication */ enum ta_pmf_command { TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE, TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES, }; +enum ta_pmf_error_type { + TA_PMF_TYPE_SUCCESS, + TA_PMF_ERROR_TYPE_GENERIC, + TA_PMF_ERROR_TYPE_CRYPTO, + TA_PMF_ERROR_TYPE_CRYPTO_VALIDATE, + TA_PMF_ERROR_TYPE_CRYPTO_VERIFY_OEM, + TA_PMF_ERROR_TYPE_POLICY_BUILDER, + TA_PMF_ERROR_TYPE_PB_CONVERT, + TA_PMF_ERROR_TYPE_PB_SETUP, + TA_PMF_ERROR_TYPE_PB_ENACT, + TA_PMF_ERROR_TYPE_ASD_GET_DEVICE_INFO, + TA_PMF_ERROR_TYPE_ASD_GET_DEVICE_PCIE_INFO, + TA_PMF_ERROR_TYPE_SYS_DRV_FW_VALIDATION, + TA_PMF_ERROR_TYPE_MAX, +}; + +struct pmf_action_table { + u32 spl; /* in mW */ + u32 sppt; /* in mW */ + u32 sppt_apuonly; /* in mW */ + u32 fppt; /* in mW */ + u32 stt_minlimit; /* in mW */ + u32 stt_skintemp_apu; /* in C */ + u32 stt_skintemp_hs2; /* in C */ +}; + +/* Input conditions */ +struct ta_pmf_condition_info { + u32 power_source; + u32 bat_percentage; + u32 power_slider; + u32 lid_state; + bool user_present; + u32 rsvd1[2]; + u32 monitor_count; + u32 rsvd2[2]; + u32 bat_design; + u32 full_charge_capacity; + int drain_rate; + bool user_engaged; + u32 device_state; + u32 socket_power; + u32 skin_temperature; + u32 rsvd3[5]; + u32 ambient_light; + u32 length; + u32 avg_c0residency; + u32 max_c0residency; + u32 s0i3_entry; + u32 gfx_busy; + u32 rsvd4[7]; + bool camera_state; + u32 workload_type; + u32 display_type; + u32 display_state; + u32 rsvd5[150]; +}; + +struct ta_pmf_load_policy_table { + u32 table_size; + u8 table[POLICY_BUF_MAX_SZ]; +}; + +/* TA initialization params */ +struct ta_pmf_init_table { + u32 frequency; /* SMU sampling frequency */ + bool validate; + bool sku_check; + bool metadata_macrocheck; + struct ta_pmf_load_policy_table policies_table; +}; + +/* Everything the TA needs to Enact Policies */ +struct ta_pmf_enact_table { + struct ta_pmf_condition_info ev_info; + u32 name; +}; + +struct ta_pmf_action { + u32 action_index; + u32 value; +}; + +/* Output actions from TA */ +struct ta_pmf_enact_result { + u32 actions_count; + struct ta_pmf_action actions_list[TA_PMF_ACTION_MAX]; + u32 undo_count; + struct ta_pmf_action undo_list[TA_PMF_UNDO_MAX]; +}; + +union ta_pmf_input { + struct ta_pmf_enact_table enact_table; + struct ta_pmf_init_table init_table; +}; + +union ta_pmf_output { + struct ta_pmf_enact_result policy_apply_table; + u32 rsvd[TA_OUTPUT_RESERVED_MEM]; +}; + struct ta_pmf_shared_memory { int command_id; int resp_id; u32 pmf_result; u32 if_version; + union ta_pmf_output pmf_output; + union ta_pmf_input pmf_input; }; /* Core Layer */ @@ -460,4 +600,5 @@ extern const struct attribute_group cnqf_feature_attribute_group; /* Smart PC builder Layer */ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev); void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev); +int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); #endif /* PMF_H */ diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 4036f435f1e2..f99dc79ebb40 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -42,9 +42,77 @@ static void amd_pmf_prepare_args(struct amd_pmf_dev *dev, int cmd, param[0].u.memref.shm_offs = 0; } +static void amd_pmf_apply_policies(struct amd_pmf_dev *dev, struct ta_pmf_enact_result *out) +{ + u32 val; + int idx; + + for (idx = 0; idx < out->actions_count; idx++) { + val = out->actions_list[idx].value; + switch (out->actions_list[idx].action_index) { + case PMF_POLICY_SPL: + if (dev->prev_data->spl != val) { + amd_pmf_send_cmd(dev, SET_SPL, false, val, NULL); + dev_dbg(dev->dev, "update SPL: %u\n", val); + dev->prev_data->spl = val; + } + break; + + case PMF_POLICY_SPPT: + if (dev->prev_data->sppt != val) { + amd_pmf_send_cmd(dev, SET_SPPT, false, val, NULL); + dev_dbg(dev->dev, "update SPPT: %u\n", val); + dev->prev_data->sppt = val; + } + break; + + case PMF_POLICY_FPPT: + if (dev->prev_data->fppt != val) { + amd_pmf_send_cmd(dev, SET_FPPT, false, val, NULL); + dev_dbg(dev->dev, "update FPPT: %u\n", val); + dev->prev_data->fppt = val; + } + break; + + case PMF_POLICY_SPPT_APU_ONLY: + if (dev->prev_data->sppt_apuonly != val) { + amd_pmf_send_cmd(dev, SET_SPPT_APU_ONLY, false, val, NULL); + dev_dbg(dev->dev, "update SPPT_APU_ONLY: %u\n", val); + dev->prev_data->sppt_apuonly = val; + } + break; + + case PMF_POLICY_STT_MIN: + if (dev->prev_data->stt_minlimit != val) { + amd_pmf_send_cmd(dev, SET_STT_MIN_LIMIT, false, val, NULL); + dev_dbg(dev->dev, "update STT_MIN: %u\n", val); + dev->prev_data->stt_minlimit = val; + } + break; + + case PMF_POLICY_STT_SKINTEMP_APU: + if (dev->prev_data->stt_skintemp_apu != val) { + amd_pmf_send_cmd(dev, SET_STT_LIMIT_APU, false, val, NULL); + dev_dbg(dev->dev, "update STT_SKINTEMP_APU: %u\n", val); + dev->prev_data->stt_skintemp_apu = val; + } + break; + + case PMF_POLICY_STT_SKINTEMP_HS2: + if (dev->prev_data->stt_skintemp_hs2 != val) { + amd_pmf_send_cmd(dev, SET_STT_LIMIT_HS2, false, val, NULL); + dev_dbg(dev->dev, "update STT_SKINTEMP_HS2: %u\n", val); + dev->prev_data->stt_skintemp_hs2 = val; + } + break; + } + } +} + static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) { struct ta_pmf_shared_memory *ta_sm = NULL; + struct ta_pmf_enact_result *out = NULL; struct tee_param param[MAX_TEE_PARAM]; struct tee_ioctl_invoke_arg arg; int ret = 0; @@ -52,7 +120,10 @@ static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) if (!dev->tee_ctx) return -ENODEV; + memset(dev->shbuf, 0, dev->policy_sz); ta_sm = dev->shbuf; + out = &ta_sm->pmf_output.policy_apply_table; + memset(ta_sm, 0, sizeof(*ta_sm)); ta_sm->command_id = TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES; ta_sm->if_version = PMF_TA_IF_VERSION_MAJOR; @@ -65,6 +136,12 @@ static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) return ret; } + if (ta_sm->pmf_result == TA_PMF_TYPE_SUCCESS && out->actions_count) { + dev_dbg(dev->dev, "action count:%u result:%x\n", out->actions_count, + ta_sm->pmf_result); + amd_pmf_apply_policies(dev, out); + } + return 0; } @@ -72,6 +149,7 @@ static int amd_pmf_invoke_cmd_init(struct amd_pmf_dev *dev) { struct ta_pmf_shared_memory *ta_sm = NULL; struct tee_param param[MAX_TEE_PARAM]; + struct ta_pmf_init_table *in = NULL; struct tee_ioctl_invoke_arg arg; int ret = 0; @@ -80,10 +158,21 @@ static int amd_pmf_invoke_cmd_init(struct amd_pmf_dev *dev) return -ENODEV; } + dev_dbg(dev->dev, "Policy Binary size: %u bytes\n", dev->policy_sz); + memset(dev->shbuf, 0, dev->policy_sz); ta_sm = dev->shbuf; + in = &ta_sm->pmf_input.init_table; + ta_sm->command_id = TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE; ta_sm->if_version = PMF_TA_IF_VERSION_MAJOR; + in->metadata_macrocheck = false; + in->sku_check = false; + in->validate = true; + in->frequency = pb_actions_ms; + in->policies_table.table_size = dev->policy_sz; + + memcpy(in->policies_table.table, dev->policy_buf, dev->policy_sz); amd_pmf_prepare_args(dev, TA_PMF_COMMAND_POLICY_BUILDER_INITIALIZE, &arg, param); ret = tee_client_invoke_func(dev->tee_ctx, &arg, param); @@ -103,6 +192,52 @@ static void amd_pmf_invoke_cmd(struct work_struct *work) schedule_delayed_work(&dev->pb_work, msecs_to_jiffies(pb_actions_ms)); } +static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) +{ + u32 cookie, length; + int res; + + cookie = readl(dev->policy_buf + POLICY_COOKIE_OFFSET); + length = readl(dev->policy_buf + POLICY_COOKIE_LEN); + + if (cookie != POLICY_SIGN_COOKIE || !length) + return -EINVAL; + + /* Update the actual length */ + dev->policy_sz = length + 512; + res = amd_pmf_invoke_cmd_init(dev); + if (res == TA_PMF_TYPE_SUCCESS) { + /* Now its safe to announce that smart pc is enabled */ + dev->smart_pc_enabled = PMF_SMART_PC_ENABLED; + /* + * Start collecting the data from TA FW after a small delay + * or else, we might end up getting stale values. + */ + schedule_delayed_work(&dev->pb_work, msecs_to_jiffies(pb_actions_ms * 3)); + } else { + dev_err(dev->dev, "ta invoke cmd init failed err: %x\n", res); + dev->smart_pc_enabled = PMF_SMART_PC_DISABLED; + return res; + } + + return 0; +} + +static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) +{ + dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL); + if (!dev->policy_buf) + return -ENOMEM; + + dev->policy_base = devm_ioremap(dev->dev, dev->policy_addr, dev->policy_sz); + if (!dev->policy_base) + return -ENOMEM; + + memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); + + return amd_pmf_start_policy_engine(dev); +} + static int amd_pmf_amdtee_ta_match(struct tee_ioctl_version_data *ver, const void *data) { return ver->impl_id == TEE_IMPL_ID_AMDTEE; @@ -146,7 +281,7 @@ static int amd_pmf_tee_init(struct amd_pmf_dev *dev) goto out_ctx; } - size = sizeof(struct ta_pmf_shared_memory); + size = sizeof(struct ta_pmf_shared_memory) + dev->policy_sz; dev->fw_shm_pool = tee_shm_alloc_kernel_buf(dev->tee_ctx, size); if (IS_ERR(dev->fw_shm_pool)) { dev_err(dev->dev, "Failed to alloc TEE shared memory\n"); @@ -185,16 +320,35 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) { int ret; + ret = apmf_check_smart_pc(dev); + if (ret) { + /* + * Lets not return from here if Smart PC bit is not advertised in + * the BIOS. This way, there will be some amount of power savings + * to the user with static slider (if enabled). + */ + dev_info(dev->dev, "PMF Smart PC not advertised in BIOS!:%d\n", ret); + return -ENODEV; + } + ret = amd_pmf_tee_init(dev); if (ret) return ret; INIT_DELAYED_WORK(&dev->pb_work, amd_pmf_invoke_cmd); - return 0; + amd_pmf_set_dram_addr(dev, true); + amd_pmf_get_bios_buffer(dev); + dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + return dev->smart_pc_enabled; } void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) { + kfree(dev->prev_data); + kfree(dev->policy_buf); cancel_delayed_work_sync(&dev->pb_work); amd_pmf_tee_deinit(dev); } From c7af165372a8612eae08dbbab787d1d84d7f0384 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:58 +0530 Subject: [PATCH 0916/1562] platform/x86/amd/pmf: change amd_pmf_init_features() call sequence To sideload pmf policy binaries, the Smart PC Solution Builder provides a debugfs file called "update_policy"; that gets created under a new debugfs directory called "pb" and this new directory has to be associated with existing parent directory for PMF driver called "amd_pmf". In the current code structure, amd_pmf_dbgfs_register() is called after amd_pmf_init_features(). This will not help when the Smart PC builder feature has to be assoicated to the parent directory. Hence change the order of amd_pmf_dbgfs_register() and call it before amd_pmf_init_features() so that when the Smart PC init happens, it has the parent debugfs directory to get itself hooked. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-6-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index c10d40b33667..feaa09f5b35a 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -431,9 +431,9 @@ static int amd_pmf_probe(struct platform_device *pdev) apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); + amd_pmf_dbgfs_register(dev); amd_pmf_init_features(dev); apmf_install_handler(dev); - amd_pmf_dbgfs_register(dev); dev_info(dev->dev, "registered PMF device successfully\n"); From f4627dfd0e1924ad31c6476c0fc2308cfe12b561 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:16:59 +0530 Subject: [PATCH 0917/1562] platform/x86/amd/pmf: Add support to get inputs from other subsystems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PMF driver sends changing inputs from each subystem to TA for evaluating the conditions in the policy binary. Add initial support of plumbing in the PMF driver for Smart PC to get information from other subsystems in the kernel. Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-7-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/Makefile | 2 +- drivers/platform/x86/amd/pmf/pmf.h | 18 ++++ drivers/platform/x86/amd/pmf/spc.c | 122 ++++++++++++++++++++++++++ drivers/platform/x86/amd/pmf/tee-if.c | 3 + 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 drivers/platform/x86/amd/pmf/spc.c diff --git a/drivers/platform/x86/amd/pmf/Makefile b/drivers/platform/x86/amd/pmf/Makefile index d2746ee7369f..6b26e48ce8ad 100644 --- a/drivers/platform/x86/amd/pmf/Makefile +++ b/drivers/platform/x86/amd/pmf/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_AMD_PMF) += amd-pmf.o amd-pmf-objs := core.o acpi.o sps.o \ auto-mode.o cnqf.o \ - tee-if.o + tee-if.o spc.o diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 031e6d3ebd4d..4da51eb28b6f 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -150,6 +150,21 @@ struct smu_pmf_metrics { u16 infra_gfx_maxfreq; /* in MHz */ u16 skin_temp; /* in centi-Celsius */ u16 device_state; + u16 curtemp; /* in centi-Celsius */ + u16 filter_alpha_value; + u16 avg_gfx_clkfrequency; + u16 avg_fclk_frequency; + u16 avg_gfx_activity; + u16 avg_socclk_frequency; + u16 avg_vclk_frequency; + u16 avg_vcn_activity; + u16 avg_dram_reads; + u16 avg_dram_writes; + u16 avg_socket_power; + u16 avg_core_power[2]; + u16 avg_core_c0residency[16]; + u16 spare1; + u32 metrics_counter; } __packed; enum amd_stt_skin_temp { @@ -601,4 +616,7 @@ extern const struct attribute_group cnqf_feature_attribute_group; int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev); void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev); int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); + +/* Smart PC - TA interfaces */ +void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); #endif /* PMF_H */ diff --git a/drivers/platform/x86/amd/pmf/spc.c b/drivers/platform/x86/amd/pmf/spc.c new file mode 100644 index 000000000000..351efcbe83c4 --- /dev/null +++ b/drivers/platform/x86/amd/pmf/spc.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Platform Management Framework Driver - Smart PC Capabilities + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Authors: Shyam Sundar S K + * Patil Rajesh Reddy + */ + +#include +#include +#include +#include "pmf.h" + +static void amd_pmf_get_smu_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +{ + u16 max, avg = 0; + int i; + + memset(dev->buf, 0, sizeof(dev->m_table)); + amd_pmf_send_cmd(dev, SET_TRANSFER_TABLE, 0, 7, NULL); + memcpy(&dev->m_table, dev->buf, sizeof(dev->m_table)); + + in->ev_info.socket_power = dev->m_table.apu_power + dev->m_table.dgpu_power; + in->ev_info.skin_temperature = dev->m_table.skin_temp; + + /* Get the avg and max C0 residency of all the cores */ + max = dev->m_table.avg_core_c0residency[0]; + for (i = 0; i < ARRAY_SIZE(dev->m_table.avg_core_c0residency); i++) { + avg += dev->m_table.avg_core_c0residency[i]; + if (dev->m_table.avg_core_c0residency[i] > max) + max = dev->m_table.avg_core_c0residency[i]; + } + + avg = DIV_ROUND_CLOSEST(avg, ARRAY_SIZE(dev->m_table.avg_core_c0residency)); + in->ev_info.avg_c0residency = avg; + in->ev_info.max_c0residency = max; + in->ev_info.gfx_busy = dev->m_table.avg_gfx_activity; +} + +static const char * const pmf_battery_supply_name[] = { + "BATT", + "BAT0", +}; + +static int amd_pmf_get_battery_prop(enum power_supply_property prop) +{ + union power_supply_propval value; + struct power_supply *psy; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(pmf_battery_supply_name); i++) { + psy = power_supply_get_by_name(pmf_battery_supply_name[i]); + if (!psy) + continue; + + ret = power_supply_get_property(psy, prop, &value); + if (ret) { + power_supply_put(psy); + return ret; + } + } + + return value.intval; +} + +static int amd_pmf_get_battery_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +{ + int val; + + val = amd_pmf_get_battery_prop(POWER_SUPPLY_PROP_PRESENT); + if (val < 0) + return val; + if (val != 1) + return -ENODEV; + + in->ev_info.bat_percentage = amd_pmf_get_battery_prop(POWER_SUPPLY_PROP_CAPACITY); + /* all values in mWh metrics */ + in->ev_info.bat_design = amd_pmf_get_battery_prop(POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN) / + MILLIWATT_PER_WATT; + in->ev_info.full_charge_capacity = amd_pmf_get_battery_prop(POWER_SUPPLY_PROP_ENERGY_FULL) / + MILLIWATT_PER_WATT; + in->ev_info.drain_rate = amd_pmf_get_battery_prop(POWER_SUPPLY_PROP_POWER_NOW) / + MILLIWATT_PER_WATT; + + return 0; +} + +static int amd_pmf_get_slider_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +{ + int val; + + switch (dev->current_profile) { + case PLATFORM_PROFILE_PERFORMANCE: + val = TA_BEST_PERFORMANCE; + break; + case PLATFORM_PROFILE_BALANCED: + val = TA_BETTER_PERFORMANCE; + break; + case PLATFORM_PROFILE_LOW_POWER: + val = TA_BEST_BATTERY; + break; + default: + dev_err(dev->dev, "Unknown Platform Profile.\n"); + return -EOPNOTSUPP; + } + in->ev_info.power_slider = val; + + return 0; +} + +void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +{ + /* TA side lid open is 1 and close is 0, hence the ! here */ + in->ev_info.lid_state = !acpi_lid_open(); + in->ev_info.power_source = amd_pmf_get_power_source(); + amd_pmf_get_smu_info(dev, in); + amd_pmf_get_battery_info(dev, in); + amd_pmf_get_slider_info(dev, in); +} diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index f99dc79ebb40..e96db406e91b 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -113,6 +113,7 @@ static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) { struct ta_pmf_shared_memory *ta_sm = NULL; struct ta_pmf_enact_result *out = NULL; + struct ta_pmf_enact_table *in = NULL; struct tee_param param[MAX_TEE_PARAM]; struct tee_ioctl_invoke_arg arg; int ret = 0; @@ -123,11 +124,13 @@ static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) memset(dev->shbuf, 0, dev->policy_sz); ta_sm = dev->shbuf; out = &ta_sm->pmf_output.policy_apply_table; + in = &ta_sm->pmf_input.enact_table; memset(ta_sm, 0, sizeof(*ta_sm)); ta_sm->command_id = TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES; ta_sm->if_version = PMF_TA_IF_VERSION_MAJOR; + amd_pmf_populate_ta_inputs(dev, in); amd_pmf_prepare_args(dev, TA_PMF_COMMAND_POLICY_BUILDER_ENACT_POLICIES, &arg, param); ret = tee_client_invoke_func(dev->tee_ctx, &arg, param); From c3b40930a214545919d1385b8aa71cb665904571 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:00 +0530 Subject: [PATCH 0918/1562] platform/x86/amd/pmf: Add support update p3t limit P3T (Peak Package Power Limit) is a metric within the SMU controller that can influence the power limits. Add support from the driver to update P3T limits accordingly. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-8-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/pmf.h | 3 +++ drivers/platform/x86/amd/pmf/tee-if.c | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 4da51eb28b6f..37bf1c701361 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -49,6 +49,7 @@ #define GET_STT_MIN_LIMIT 0x1F #define GET_STT_LIMIT_APU 0x20 #define GET_STT_LIMIT_HS2 0x21 +#define SET_P3T 0x23 /* P3T: Peak Package Power Limit */ /* OS slider update notification */ #define DC_BEST_PERF 0 @@ -72,6 +73,7 @@ #define PMF_POLICY_STT_MIN 6 #define PMF_POLICY_STT_SKINTEMP_APU 7 #define PMF_POLICY_STT_SKINTEMP_HS2 8 +#define PMF_POLICY_P3T 38 /* TA macros */ #define PMF_TA_IF_VERSION_MAJOR 1 @@ -481,6 +483,7 @@ struct pmf_action_table { u32 stt_minlimit; /* in mW */ u32 stt_skintemp_apu; /* in C */ u32 stt_skintemp_hs2; /* in C */ + u32 p3t_limit; /* in mW */ }; /* Input conditions */ diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index e96db406e91b..bf8cb98d41ec 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -105,6 +105,14 @@ static void amd_pmf_apply_policies(struct amd_pmf_dev *dev, struct ta_pmf_enact_ dev->prev_data->stt_skintemp_hs2 = val; } break; + + case PMF_POLICY_P3T: + if (dev->prev_data->p3t_limit != val) { + amd_pmf_send_cmd(dev, SET_P3T, false, val, NULL); + dev_dbg(dev->dev, "update P3T: %u\n", val); + dev->prev_data->p3t_limit = val; + } + break; } } } From d0ba7ad438dfed944232cf8c96141ae5057605ee Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:01 +0530 Subject: [PATCH 0919/1562] platform/x86/amd/pmf: Add support to update system state PMF driver based on the output actions from the TA can request to update the system states like entering s0i3, lock screen etc. by generating an uevent. Based on the udev rules set in the userspace the event id matching the uevent shall get updated accordingly using the systemctl. Sample udev rules under Documentation/admin-guide/pmf.rst. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-9-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/pmf.rst | 24 +++++++++++++++++++ drivers/platform/x86/amd/pmf/pmf.h | 9 +++++++ drivers/platform/x86/amd/pmf/tee-if.c | 34 +++++++++++++++++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 Documentation/admin-guide/pmf.rst diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 43ea35613dfc..fb40a1f6f79e 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -119,6 +119,7 @@ configure specific aspects of kernel behavior to your liking. parport perf-security pm/index + pmf pnp rapidio ras diff --git a/Documentation/admin-guide/pmf.rst b/Documentation/admin-guide/pmf.rst new file mode 100644 index 000000000000..9ee729ffc19b --- /dev/null +++ b/Documentation/admin-guide/pmf.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Set udev rules for PMF Smart PC Builder +--------------------------------------- + +AMD PMF(Platform Management Framework) Smart PC Solution builder has to set the system states +like S0i3, Screen lock, hibernate etc, based on the output actions provided by the PMF +TA (Trusted Application). + +In order for this to work the PMF driver generates a uevent for userspace to react to. Below are +sample udev rules that can facilitate this experience when a machine has PMF Smart PC solution builder +enabled. + +Please add the following line(s) to +``/etc/udev/rules.d/99-local.rules``:: + + DRIVERS=="amd-pmf", ACTION=="change", ENV{EVENT_ID}=="0", RUN+="/usr/bin/systemctl suspend" + DRIVERS=="amd-pmf", ACTION=="change", ENV{EVENT_ID}=="1", RUN+="/usr/bin/systemctl hibernate" + DRIVERS=="amd-pmf", ACTION=="change", ENV{EVENT_ID}=="2", RUN+="/bin/loginctl lock-sessions" + +EVENT_ID values: +0= Put the system to S0i3/S2Idle +1= Put the system to hibernate +2= Lock the screen diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 37bf1c701361..50f98c398727 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -73,6 +73,7 @@ #define PMF_POLICY_STT_MIN 6 #define PMF_POLICY_STT_SKINTEMP_APU 7 #define PMF_POLICY_STT_SKINTEMP_HS2 8 +#define PMF_POLICY_SYSTEM_STATE 9 #define PMF_POLICY_P3T 38 /* TA macros */ @@ -445,6 +446,13 @@ enum smart_pc_status { }; /* Smart PC - TA internals */ +enum system_state { + SYSTEM_STATE_S0i3, + SYSTEM_STATE_S4, + SYSTEM_STATE_SCREEN_LOCK, + SYSTEM_STATE_MAX, +}; + enum ta_slider { TA_BEST_BATTERY, TA_BETTER_BATTERY, @@ -476,6 +484,7 @@ enum ta_pmf_error_type { }; struct pmf_action_table { + enum system_state system_state; u32 spl; /* in mW */ u32 sppt; /* in mW */ u32 sppt_apuonly; /* in mW */ diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index bf8cb98d41ec..8811631c7be5 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -24,6 +24,20 @@ MODULE_PARM_DESC(pb_actions_ms, "Policy binary actions sampling frequency (defau static const uuid_t amd_pmf_ta_uuid = UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, 0xb1, 0x2d, 0xc5, 0x29, 0xb1, 0x3d, 0x85, 0x43); +static const char *amd_pmf_uevent_as_str(unsigned int state) +{ + switch (state) { + case SYSTEM_STATE_S0i3: + return "S0i3"; + case SYSTEM_STATE_S4: + return "S4"; + case SYSTEM_STATE_SCREEN_LOCK: + return "SCREEN_LOCK"; + default: + return "Unknown Smart PC event"; + } +} + static void amd_pmf_prepare_args(struct amd_pmf_dev *dev, int cmd, struct tee_ioctl_invoke_arg *arg, struct tee_param *param) @@ -42,6 +56,20 @@ static void amd_pmf_prepare_args(struct amd_pmf_dev *dev, int cmd, param[0].u.memref.shm_offs = 0; } +static int amd_pmf_update_uevents(struct amd_pmf_dev *dev, u16 event) +{ + char *envp[2] = {}; + + envp[0] = kasprintf(GFP_KERNEL, "EVENT_ID=%d", event); + if (!envp[0]) + return -EINVAL; + + kobject_uevent_env(&dev->dev->kobj, KOBJ_CHANGE, envp); + + kfree(envp[0]); + return 0; +} + static void amd_pmf_apply_policies(struct amd_pmf_dev *dev, struct ta_pmf_enact_result *out) { u32 val; @@ -113,6 +141,12 @@ static void amd_pmf_apply_policies(struct amd_pmf_dev *dev, struct ta_pmf_enact_ dev->prev_data->p3t_limit = val; } break; + + case PMF_POLICY_SYSTEM_STATE: + amd_pmf_update_uevents(dev, val); + dev_dbg(dev->dev, "update SYSTEM_STATE: %s\n", + amd_pmf_uevent_as_str(val)); + break; } } } From 4984dbb60789ccb8674708446431f3bc0dc73100 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:02 +0530 Subject: [PATCH 0920/1562] platform/x86/amd/pmf: Make source_as_str() as non-static Add amd_pmf prefix to source_as_str() function, so that the function name does not look generic. As this is a helper function make it as non-static so that it can be reused across multiple PMF features. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-10-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/pmf.h | 1 + drivers/platform/x86/amd/pmf/sps.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 50f98c398727..25e369477f86 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -600,6 +600,7 @@ int apmf_get_static_slider_granular(struct amd_pmf_dev *pdev, struct apmf_static_slider_granular_output *output); bool is_pprof_balanced(struct amd_pmf_dev *pmf); int amd_pmf_power_slider_update_event(struct amd_pmf_dev *dev); +const char *amd_pmf_source_as_str(unsigned int state); int apmf_update_fan_idx(struct amd_pmf_dev *pdev, bool manual, u32 idx); diff --git a/drivers/platform/x86/amd/pmf/sps.c b/drivers/platform/x86/amd/pmf/sps.c index a70e67749be3..33e23e25c8b1 100644 --- a/drivers/platform/x86/amd/pmf/sps.c +++ b/drivers/platform/x86/amd/pmf/sps.c @@ -27,7 +27,7 @@ static const char *slider_as_str(unsigned int state) } } -static const char *source_as_str(unsigned int state) +const char *amd_pmf_source_as_str(unsigned int state) { switch (state) { case POWER_SOURCE_AC: @@ -47,7 +47,8 @@ static void amd_pmf_dump_sps_defaults(struct amd_pmf_static_slider_granular *dat for (i = 0; i < POWER_SOURCE_MAX; i++) { for (j = 0; j < POWER_MODE_MAX; j++) { - pr_debug("--- Source:%s Mode:%s ---\n", source_as_str(i), slider_as_str(j)); + pr_debug("--- Source:%s Mode:%s ---\n", amd_pmf_source_as_str(i), + slider_as_str(j)); pr_debug("SPL: %u mW\n", data->prop[i][j].spl); pr_debug("SPPT: %u mW\n", data->prop[i][j].sppt); pr_debug("SPPT_ApuOnly: %u mW\n", data->prop[i][j].sppt_apu_only); From 69e76c5af973854556625a8e156a39d1edbe8d6f Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:03 +0530 Subject: [PATCH 0921/1562] platform/x86/amd/pmf: Add facility to dump TA inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PMF driver sends constant inputs to TA which its gets via the other subsystems in the kernel. To debug certain TA issues knowing what inputs being sent to TA becomes critical. Add debug facility to the driver which can isolate Smart PC and TA related issues. Also, make source_as_str() as non-static function as this helper is required outside of sps.c file. Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-11-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/pmf.h | 3 +++ drivers/platform/x86/amd/pmf/spc.c | 36 +++++++++++++++++++++++++++ drivers/platform/x86/amd/pmf/tee-if.c | 1 + 3 files changed, 40 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 25e369477f86..55cd2b301bbb 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -602,6 +602,7 @@ bool is_pprof_balanced(struct amd_pmf_dev *pmf); int amd_pmf_power_slider_update_event(struct amd_pmf_dev *dev); const char *amd_pmf_source_as_str(unsigned int state); +const char *amd_pmf_source_as_str(unsigned int state); int apmf_update_fan_idx(struct amd_pmf_dev *pdev, bool manual, u32 idx); int amd_pmf_set_sps_power_limits(struct amd_pmf_dev *pmf); @@ -632,4 +633,6 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); /* Smart PC - TA interfaces */ void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); +void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); + #endif /* PMF_H */ diff --git a/drivers/platform/x86/amd/pmf/spc.c b/drivers/platform/x86/amd/pmf/spc.c index 351efcbe83c4..a0423942f771 100644 --- a/drivers/platform/x86/amd/pmf/spc.c +++ b/drivers/platform/x86/amd/pmf/spc.c @@ -14,6 +14,42 @@ #include #include "pmf.h" +#ifdef CONFIG_AMD_PMF_DEBUG +static const char *ta_slider_as_str(unsigned int state) +{ + switch (state) { + case TA_BEST_PERFORMANCE: + return "PERFORMANCE"; + case TA_BETTER_PERFORMANCE: + return "BALANCED"; + case TA_BEST_BATTERY: + return "POWER_SAVER"; + default: + return "Unknown TA Slider State"; + } +} + +void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +{ + dev_dbg(dev->dev, "==== TA inputs START ====\n"); + dev_dbg(dev->dev, "Slider State: %s\n", ta_slider_as_str(in->ev_info.power_slider)); + dev_dbg(dev->dev, "Power Source: %s\n", amd_pmf_source_as_str(in->ev_info.power_source)); + dev_dbg(dev->dev, "Battery Percentage: %u\n", in->ev_info.bat_percentage); + dev_dbg(dev->dev, "Designed Battery Capacity: %u\n", in->ev_info.bat_design); + dev_dbg(dev->dev, "Fully Charged Capacity: %u\n", in->ev_info.full_charge_capacity); + dev_dbg(dev->dev, "Drain Rate: %d\n", in->ev_info.drain_rate); + dev_dbg(dev->dev, "Socket Power: %u\n", in->ev_info.socket_power); + dev_dbg(dev->dev, "Skin Temperature: %u\n", in->ev_info.skin_temperature); + dev_dbg(dev->dev, "Avg C0 Residency: %u\n", in->ev_info.avg_c0residency); + dev_dbg(dev->dev, "Max C0 Residency: %u\n", in->ev_info.max_c0residency); + dev_dbg(dev->dev, "GFX Busy: %u\n", in->ev_info.gfx_busy); + dev_dbg(dev->dev, "LID State: %s\n", in->ev_info.lid_state ? "close" : "open"); + dev_dbg(dev->dev, "==== TA inputs END ====\n"); +} +#else +void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) {} +#endif + static void amd_pmf_get_smu_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) { u16 max, avg = 0; diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 8811631c7be5..38b75198cc3f 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -182,6 +182,7 @@ static int amd_pmf_invoke_cmd_enact(struct amd_pmf_dev *dev) } if (ta_sm->pmf_result == TA_PMF_TYPE_SUCCESS && out->actions_count) { + amd_pmf_dump_ta_inputs(dev, in); dev_dbg(dev->dev, "action count:%u result:%x\n", out->actions_count, ta_sm->pmf_result); amd_pmf_apply_policies(dev, out); From 10817f28e5337e5ddb873c8431d4db8d93712587 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:04 +0530 Subject: [PATCH 0922/1562] platform/x86/amd/pmf: Add capability to sideload of policy binary A policy binary is OS agnostic, and the same policies are expected to work across the OSes. At times it becomes difficult to debug when the policies inside the policy binaries starts to misbehave. Add a way to sideload such policies independently to debug them via a debugfs entry. Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-12-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/pmf.h | 1 + drivers/platform/x86/amd/pmf/tee-if.c | 63 +++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 55cd2b301bbb..16999c5b334f 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -219,6 +219,7 @@ struct amd_pmf_dev { bool cnqf_supported; struct notifier_block pwr_src_notifier; /* Smart PC solution builder */ + struct dentry *esbin; unsigned char *policy_buf; u32 policy_sz; struct tee_context *tee_ctx; diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 38b75198cc3f..72248407138f 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -8,6 +8,7 @@ * Author: Shyam Sundar S K */ +#include #include #include #include "pmf.h" @@ -16,9 +17,14 @@ /* Policy binary actions sampling frequency (in ms) */ static int pb_actions_ms = MSEC_PER_SEC; +/* Sideload policy binaries to debug policy failures */ +static bool pb_side_load; + #ifdef CONFIG_AMD_PMF_DEBUG module_param(pb_actions_ms, int, 0644); MODULE_PARM_DESC(pb_actions_ms, "Policy binary actions sampling frequency (default = 1000ms)"); +module_param(pb_side_load, bool, 0444); +MODULE_PARM_DESC(pb_side_load, "Sideload policy binaries debug policy failures"); #endif static const uuid_t amd_pmf_ta_uuid = UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, @@ -269,6 +275,57 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) return 0; } +#ifdef CONFIG_AMD_PMF_DEBUG +static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, + size_t length, loff_t *pos) +{ + struct amd_pmf_dev *dev = filp->private_data; + unsigned char *new_policy_buf; + int ret; + + /* Policy binary size cannot exceed POLICY_BUF_MAX_SZ */ + if (length > POLICY_BUF_MAX_SZ || length == 0) + return -EINVAL; + + /* re-alloc to the new buffer length of the policy binary */ + new_policy_buf = kzalloc(length, GFP_KERNEL); + if (!new_policy_buf) + return -ENOMEM; + + if (copy_from_user(new_policy_buf, buf, length)) + return -EFAULT; + + kfree(dev->policy_buf); + dev->policy_buf = new_policy_buf; + dev->policy_sz = length; + + ret = amd_pmf_start_policy_engine(dev); + if (ret) + return -EINVAL; + + return length; +} + +static const struct file_operations pb_fops = { + .write = amd_pmf_get_pb_data, + .open = simple_open, +}; + +static void amd_pmf_open_pb(struct amd_pmf_dev *dev, struct dentry *debugfs_root) +{ + dev->esbin = debugfs_create_dir("pb", debugfs_root); + debugfs_create_file("update_policy", 0644, dev->esbin, dev, &pb_fops); +} + +static void amd_pmf_remove_pb(struct amd_pmf_dev *dev) +{ + debugfs_remove_recursive(dev->esbin); +} +#else +static void amd_pmf_open_pb(struct amd_pmf_dev *dev, struct dentry *debugfs_root) {} +static void amd_pmf_remove_pb(struct amd_pmf_dev *dev) {} +#endif + static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) { dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL); @@ -281,6 +338,9 @@ static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); + if (pb_side_load) + amd_pmf_open_pb(dev, dev->dbgfs_dir); + return amd_pmf_start_policy_engine(dev); } @@ -393,6 +453,9 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) { + if (pb_side_load) + amd_pmf_remove_pb(dev); + kfree(dev->prev_data); kfree(dev->policy_buf); cancel_delayed_work_sync(&dev->pb_work); From f533fa142258024dfe9a8fcba1a28d25a3cbe51b Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 12 Dec 2023 07:17:05 +0530 Subject: [PATCH 0923/1562] platform/x86/amd/pmf: dump policy binary data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes policy binary retrieved from the BIOS maybe incorrect that can end up in failing to enable the Smart PC solution feature. Use print_hex_dump_debug() to dump the policy binary in hex, so that we debug the issues related to the binary even before sending that to TA. Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231212014705.2017474-13-Shyam-sundar.S-k@amd.com Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/tee-if.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 72248407138f..502ce93d5cdd 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -276,6 +276,12 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) } #ifdef CONFIG_AMD_PMF_DEBUG +static void amd_pmf_hex_dump_pb(struct amd_pmf_dev *dev) +{ + print_hex_dump_debug("(pb): ", DUMP_PREFIX_OFFSET, 16, 1, dev->policy_buf, + dev->policy_sz, false); +} + static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, size_t length, loff_t *pos) { @@ -299,6 +305,7 @@ static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, dev->policy_buf = new_policy_buf; dev->policy_sz = length; + amd_pmf_hex_dump_pb(dev); ret = amd_pmf_start_policy_engine(dev); if (ret) return -EINVAL; @@ -324,6 +331,7 @@ static void amd_pmf_remove_pb(struct amd_pmf_dev *dev) #else static void amd_pmf_open_pb(struct amd_pmf_dev *dev, struct dentry *debugfs_root) {} static void amd_pmf_remove_pb(struct amd_pmf_dev *dev) {} +static void amd_pmf_hex_dump_pb(struct amd_pmf_dev *dev) {} #endif static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) @@ -338,6 +346,7 @@ static int amd_pmf_get_bios_buffer(struct amd_pmf_dev *dev) memcpy(dev->policy_buf, dev->policy_base, dev->policy_sz); + amd_pmf_hex_dump_pb(dev); if (pb_side_load) amd_pmf_open_pb(dev, dev->dbgfs_dir); From 8877243beafa7c6bfc42022cbfdf9e39b25bd4fa Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Mon, 6 Nov 2023 21:21:29 +0500 Subject: [PATCH 0924/1562] gfs2: Fix kernel NULL pointer dereference in gfs2_rgrp_dump Syzkaller has reported a NULL pointer dereference when accessing rgd->rd_rgl in gfs2_rgrp_dump(). This can happen when creating rgd->rd_gl fails in read_rindex_entry(). Add a NULL pointer check in gfs2_rgrp_dump() to prevent that. Reported-and-tested-by: syzbot+da0fc229cc1ff4bb2e6d@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=da0fc229cc1ff4bb2e6d Fixes: 72244b6bc752 ("gfs2: improve debug information when lvb mismatches are found") Signed-off-by: Osama Muhammad Signed-off-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c2060203b98a..396d0f4a259d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2306,7 +2306,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt); - if (rgd->rd_sbd->sd_args.ar_rgrplvb) { + if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf, From 1181f2d9fef7307b46850bc11e043f2180d636c1 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Nov 2023 16:54:59 +0100 Subject: [PATCH 0925/1562] gfs2: Fix inode_go_instantiate description Fixes a "function parameter or member gl not described in inode_go_instantiate" warning. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b41c78bd2cc0..15d0e653fd2b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -494,7 +494,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) /** * inode_go_instantiate - read in an inode if necessary - * @gh: The glock holder + * @gl: The glock * * Returns: errno */ From 71733b4922007500ae259af9e96017080f5d36d9 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Sat, 2 Dec 2023 17:25:49 +0800 Subject: [PATCH 0926/1562] gfs2: fix kernel BUG in gfs2_quota_cleanup [Syz report] kernel BUG at fs/gfs2/quota.c:1508! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 5060 Comm: syz-executor505 Not tainted 6.7.0-rc3-syzkaller-00134-g994d5c58e50e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 RIP: 0010:gfs2_quota_cleanup+0x6b5/0x6c0 fs/gfs2/quota.c:1508 Code: fe e9 cf fd ff ff 44 89 e9 80 e1 07 80 c1 03 38 c1 0f 8c 2d fe ff ff 4c 89 ef e8 b6 19 23 fe e9 20 fe ff ff e8 ec 11 c7 fd 90 <0f> 0b e8 84 9c 4f 07 0f 1f 40 00 66 0f 1f 00 55 41 57 41 56 41 54 RSP: 0018:ffffc9000409f9e0 EFLAGS: 00010293 RAX: ffffffff83c76854 RBX: 0000000000000002 RCX: ffff888026001dc0 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffffc9000409fb00 R08: ffffffff83c762b0 R09: 1ffff1100fd38015 R10: dffffc0000000000 R11: ffffed100fd38016 R12: dffffc0000000000 R13: ffff88807e9c0828 R14: ffff888014693580 R15: ffff88807e9c0000 FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f16d1bd70f8 CR3: 0000000027199000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: gfs2_put_super+0x2e1/0x940 fs/gfs2/super.c:611 generic_shutdown_super+0x13a/0x2c0 fs/super.c:696 kill_block_super+0x44/0x90 fs/super.c:1667 deactivate_locked_super+0xc1/0x130 fs/super.c:484 cleanup_mnt+0x426/0x4c0 fs/namespace.c:1256 task_work_run+0x24a/0x300 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa34/0x2750 kernel/exit.c:871 do_group_exit+0x206/0x2c0 kernel/exit.c:1021 __do_sys_exit_group kernel/exit.c:1032 [inline] __se_sys_exit_group kernel/exit.c:1030 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1030 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x45/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b ... [pid 5060] fsconfig(4, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) = 0 [pid 5060] exit_group(1) = ? ... [Analysis] When the task exits, it will execute cleanup_mnt() to recycle the mounted gfs2 file system, but it performs a system call fsconfig(4, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0) before executing the task exit operation. This will execute the following kernel path to complete the setting of SDF_JOURNAL_LIVE for sd_flags: SYSCALL_DEFINE5(fsconfig, ..)-> vfs_fsconfig_locked()-> vfs_cmd_reconfigure()-> gfs2_reconfigure()-> gfs2_make_fs_rw()-> set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); [Fix] Add SDF_NORECOVERY check in gfs2_quota_cleanup() to avoid checking SDF_JOURNAL_LIVE on the path where gfs2 is being unmounted. Reported-and-tested-by: syzbot+3b6e67ac2b646da57862@syzkaller.appspotmail.com Fixes: f66af88e3321 ("gfs2: Stop using gfs2_make_fs_ro for withdraw") Signed-off-by: Edward Adam Davis Signed-off-by: Andreas Gruenbacher --- fs/gfs2/quota.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 95dae7838b4e..f139ce8cf5ce 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1505,7 +1505,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) LIST_HEAD(dispose); int count; - BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { From 95d0f6252564420d6c660593db8505af61c2dd0a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Dec 2023 18:36:36 -0800 Subject: [PATCH 0927/1562] gfs2: rgrp: fix kernel-doc warnings Fix kernel-doc warnings found when using "W=1". rgrp.c:162: warning: missing initial short description on line: * gfs2_bit_search rgrp.c:1200: warning: Function parameter or member 'gl' not described in 'gfs2_rgrp_go_instantiate' rgrp.c:1200: warning: Excess function parameter 'gh' description in 'gfs2_rgrp_go_instantiate' rgrp.c:1970: warning: missing initial short description on line: * gfs2_rgrp_used_recently Signed-off-by: Randy Dunlap Signed-off-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 396d0f4a259d..26d6c1eea559 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -159,13 +159,13 @@ static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone) } /** - * gfs2_bit_search + * gfs2_bit_search - search bitmap for a state * @ptr: Pointer to bitmap data * @mask: Mask to use (normally 0x55555.... but adjusted for search start) * @state: The state we are searching for * - * We xor the bitmap data with a patter which is the bitwise opposite - * of what we are looking for, this gives rise to a pattern of ones + * We xor the bitmap data with a pattern which is the bitwise opposite + * of what we are looking for. This gives rise to a pattern of ones * wherever there is a match. Since we have two bits per entry, we * take this pattern, shift it down by one place and then and it with * the original. All the even bit positions (0,2,4, etc) then represent @@ -1188,7 +1188,7 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) /** * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps - * @gh: the glock holder representing the rgrpd to read in + * @gl: the glock representing the rgrpd to read in * * Read in all of a Resource Group's header and bitmap blocks. * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. @@ -1967,7 +1967,7 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) } /** - * gfs2_rgrp_used_recently + * gfs2_rgrp_used_recently - test if an rgrp has been used recently * @rs: The block reservation with the rgrp to test * @msecs: The time limit in milliseconds * From f9f229c1f75df2f1fe63b16615d184da4e90bb10 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Nov 2023 16:49:38 +0100 Subject: [PATCH 0928/1562] gfs2: Add GL_NOBLOCK flag Add a GL_NOBLOCK flag for trying to take a glock without sleeping. This will be used for implementing non-blocking lookup (MAY_NOT_BLOCK in gfs2_permission, LOOKUP_RCU in gfs2_drevalidate). Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/gfs2/glock.h | 1 + 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d6bf1f8c25dc..2cb65f76eec8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -516,6 +516,23 @@ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) return NULL; } +/** + * find_last_waiter - find the last gh that's waiting for the glock + * @gl: the glock + * + * This also is a fast way of finding out if there are any waiters. + */ + +static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (list_empty(&gl->gl_holders)) + return NULL; + gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list); + return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh; +} + /** * state_change - record that the glock is now in a different state * @gl: the glock @@ -1555,11 +1572,30 @@ trap_recursive: int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - int error = 0; + int error; if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) return -EIO; + if (gh->gh_flags & GL_NOBLOCK) { + struct gfs2_holder *current_gh; + + error = -ECHILD; + spin_lock(&gl->gl_lockref.lock); + if (find_last_waiter(gl)) + goto unlock; + current_gh = find_first_holder(gl); + if (!may_grant(gl, current_gh, gh)) + goto unlock; + set_bit(HIF_HOLDER, &gh->gh_iflags); + list_add_tail(&gh->gh_list, &gl->gl_holders); + trace_gfs2_promote(gh); + error = 0; +unlock: + spin_unlock(&gl->gl_lockref.lock); + return error; + } + if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); @@ -1575,6 +1611,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); + error = 0; if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 61197598abfd..0114f3e0ebe0 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -84,6 +84,7 @@ enum { #define GL_SKIP 0x0100 #define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 +#define GL_NOBLOCK 0x0800 /* * lm_async_cb return flags From dd00aaeb343255a8a30de671bd27bde79a47c8e5 Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Fri, 10 Nov 2023 13:10:08 +0100 Subject: [PATCH 0929/1562] gfs2: Use GL_NOBLOCK flag for non-blocking lookups Add the GL_NOBLOCK flag to the locking requests in gfs2_permission() and gfs2_drevalidate() when called with the MAY_NOT_BLOCK flag and LOOKUP_RCU flag, respectively. This will cause the locking requests to be handled without sleeping if possible. We bail out with -ECHILD if we can't grant the glock immediately. Make sure not to dget() + dput() the parent dentry in gfs2_drevalidate() in LOOKUP_RCU mode; dput() is a sleeping operation. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/dentry.c | 23 ++++++++++++++--------- fs/gfs2/inode.c | 8 ++++---- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..177f1f41f225 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,21 +32,25 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; + struct dentry *parent = NULL; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *inode; + struct inode *dinode, *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - - parent = dget_parent(dentry); - sdp = GFS2_SB(d_inode(parent)); - dip = GFS2_I(d_inode(parent)); + if (flags & LOOKUP_RCU) { + dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); + if (!dinode) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dinode = d_inode(parent); + } + sdp = GFS2_SB(dinode); + dip = GFS2_I(dinode); inode = d_inode(dentry); if (inode) { @@ -62,7 +66,8 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, + flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1b95db2c3aac..6bfc9383b7b8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(gl) == NULL) { - if (may_not_block) - return -ECHILD; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + int noblock = may_not_block ? GL_NOBLOCK : 0; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_ANY | noblock, &i_gh); if (error) return error; } From b23ae451d7b1ddb10536fa10f5dc9842ef66f387 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Dec 2023 11:27:46 +0000 Subject: [PATCH 0930/1562] platform/x86: silicom-platform: Fix spelling mistake "platfomr" -> "platform" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a spelling mistake in a literal string. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231215112746.13752-1-colin.i.king@gmail.com Signed-off-by: Hans de Goede --- drivers/platform/x86/silicom-platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/silicom-platform.c b/drivers/platform/x86/silicom-platform.c index 84b92b3f9f4b..6ce43ccb3112 100644 --- a/drivers/platform/x86/silicom-platform.c +++ b/drivers/platform/x86/silicom-platform.c @@ -866,7 +866,7 @@ static int silicom_fan_control_read_labels(struct device *dev, { switch (type) { case hwmon_fan: - *str = "Silicom_platfomr: Fan Speed"; + *str = "Silicom_platform: Fan Speed"; return 0; case hwmon_temp: *str = "Silicom_platform: Thermostat Sensor"; From 784a00474633aa7ff4940b3adf74d900e54f6a36 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Fri, 15 Dec 2023 16:51:46 -0800 Subject: [PATCH 0931/1562] platform/x86/intel/vsec: Add support for Lunar Lake M MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Lunar Lake M PMT telemetry support. Signed-off-by: Rajvi Jingar Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231216005146.1735455-1-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/vsec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index b68586731e45..778eb0aa3479 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -419,6 +419,11 @@ static const struct intel_vsec_platform_info tgl_info = { .quirks = VSEC_QUIRK_TABLE_SHIFT | VSEC_QUIRK_EARLY_HW, }; +/* LNL info */ +static const struct intel_vsec_platform_info lnl_info = { + .caps = VSEC_CAP_TELEMETRY | VSEC_CAP_WATCHER, +}; + #define PCI_DEVICE_ID_INTEL_VSEC_ADL 0x467d #define PCI_DEVICE_ID_INTEL_VSEC_DG1 0x490e #define PCI_DEVICE_ID_INTEL_VSEC_MTL_M 0x7d0d @@ -426,6 +431,7 @@ static const struct intel_vsec_platform_info tgl_info = { #define PCI_DEVICE_ID_INTEL_VSEC_OOBMSM 0x09a7 #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d +#define PCI_DEVICE_ID_INTEL_VSEC_LNL_M 0x647d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, &dg1_info) }, @@ -434,6 +440,7 @@ static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_OOBMSM, &oobmsm_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_LNL_M, &lnl_info) }, { } }; MODULE_DEVICE_TABLE(pci, intel_vsec_pci_ids); From a92d3078244891c1bc4dc2112ae58b416875d296 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Fri, 15 Dec 2023 17:17:02 -0800 Subject: [PATCH 0932/1562] platform/x86/intel/pmc: Fix in pmc_core_ssram_get_pmc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passing PMC_IDX_MAIN in pmc_core_pmc_add() adds only primary pmc to pmcdev. Use pmc_idx instead to add all available pmcs. Fixes: a01486dc4bb1 ("platform/x86/intel/pmc: Cleanup SSRAM discovery") Signed-off-by: Rajvi Jingar Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231216011702.1976408-1-rajvi.jingar@linux.intel.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core_ssram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 3501c7bd6b33..55e54207987c 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -287,7 +287,7 @@ pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, int pmc_idx, u32 offset) if (!map) return -ENODEV; - return pmc_core_pmc_add(pmcdev, pwrm_base, map, PMC_IDX_MAIN); + return pmc_core_pmc_add(pmcdev, pwrm_base, map, pmc_idx); } int pmc_core_ssram_init(struct pmc_dev *pmcdev) From ad663ce6780477177e301756ade6cf236f36ae4c Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Thu, 14 Dec 2023 16:10:52 +0530 Subject: [PATCH 0933/1562] regulator: qcom_smd: Add LDO5 MP5496 regulator Add support for LDO5 regulator. This is used by IPQ9574 USB. Signed-off-by: Varadarajan Narayanan Rule: Link: https://lore.kernel.org/stable/20231214104052.3267039-1-quic_varada%40quicinc.com Link: https://msgid.link/r/20231214104052.3267039-1-quic_varada@quicinc.com Signed-off-by: Mark Brown --- drivers/regulator/qcom_smd-regulator.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/qcom_smd-regulator.c b/drivers/regulator/qcom_smd-regulator.c index 09c471a0ba2e..d1be9568025e 100644 --- a/drivers/regulator/qcom_smd-regulator.c +++ b/drivers/regulator/qcom_smd-regulator.c @@ -796,6 +796,7 @@ static const struct rpm_regulator_data rpm_mp5496_regulators[] = { { "s1", QCOM_SMD_RPM_SMPA, 1, &mp5496_smps, "s1" }, { "s2", QCOM_SMD_RPM_SMPA, 2, &mp5496_smps, "s2" }, { "l2", QCOM_SMD_RPM_LDOA, 2, &mp5496_ldoa2, "l2" }, + { "l5", QCOM_SMD_RPM_LDOA, 5, &mp5496_ldoa2, "l5" }, {} }; From 15009a1b145b033c39a6b65d529c83de71a8d732 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 18 Dec 2023 08:53:45 -0600 Subject: [PATCH 0934/1562] spi: axi-spi-engine: fix struct member doc warnings The build bots are complaining that the members of struct spi_engine_message_state are not described. This adds the proper @name: syntax to the comments to fix this. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312182101.QOWovo29-lkp@intel.com/ Signed-off-by: David Lechner Link: https://msgid.link/r/20231218145348.339470-1-dlechner@baylibre.com Signed-off-by: Mark Brown --- drivers/spi/spi-axi-spi-engine.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 58280dd1c901..9ace259d2d29 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -85,25 +85,25 @@ struct spi_engine_program { * struct spi_engine_message_state - SPI engine per-message state */ struct spi_engine_message_state { - /** Instructions for executing this message. */ + /** @p: Instructions for executing this message. */ struct spi_engine_program *p; - /** Number of elements in cmd_buf array. */ + /** @cmd_length: Number of elements in cmd_buf array. */ unsigned cmd_length; - /** Array of commands not yet written to CMD FIFO. */ + /** @cmd_buf: Array of commands not yet written to CMD FIFO. */ const uint16_t *cmd_buf; - /** Next xfer with tx_buf not yet fully written to TX FIFO. */ + /** @tx_xfer: Next xfer with tx_buf not yet fully written to TX FIFO. */ struct spi_transfer *tx_xfer; - /** Size of tx_buf in bytes. */ + /** @tx_length: Size of tx_buf in bytes. */ unsigned int tx_length; - /** Bytes not yet written to TX FIFO. */ + /** @tx_buf: Bytes not yet written to TX FIFO. */ const uint8_t *tx_buf; - /** Next xfer with rx_buf not yet fully written to RX FIFO. */ + /** @rx_xfer: Next xfer with rx_buf not yet fully written to RX FIFO. */ struct spi_transfer *rx_xfer; - /** Size of tx_buf in bytes. */ + /** @rx_length: Size of tx_buf in bytes. */ unsigned int rx_length; - /** Bytes not yet written to the RX FIFO. */ + /** @rx_buf: Bytes not yet written to the RX FIFO. */ uint8_t *rx_buf; - /** ID to correlate SYNC interrupts with this message. */ + /** @sync_id: ID to correlate SYNC interrupts with this message. */ u8 sync_id; }; From 56778b49c9a2cbc32c6b0fbd3ba1a9d64192d3af Mon Sep 17 00:00:00 2001 From: David Gow Date: Tue, 28 Nov 2023 15:24:05 +0800 Subject: [PATCH 0935/1562] kunit: Add a macro to wrap a deferred action function KUnit's deferred action API accepts a void(*)(void *) function pointer which is called when the test is exited. However, we very frequently want to use existing functions which accept a single pointer, but which may not be of type void*. While this is probably dodgy enough to be on the wrong side of the C standard, it's been often used for similar callbacks, and gcc's -Wcast-function-type seems to ignore cases where the only difference is the type of the argument, assuming it's compatible (i.e., they're both pointers to data). However, clang 16 has introduced -Wcast-function-type-strict, which no longer permits any deviation in function pointer type. This seems to be because it'd break CFI, which validates the type of function calls. This rather ruins our attempts to cast functions to defer them, and leaves us with a few options. The one we've chosen is to implement a macro which will generate a wrapper function which accepts a void*, and casts the argument to the appropriate type. For example, if you were trying to wrap: void foo_close(struct foo *handle); you could use: KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_foo_close, foo_close, struct foo *); This would create a new kunit_action_foo_close() function, of type kunit_action_t, which could be passed into kunit_add_action() and similar functions. In addition to defining this macro, update KUnit and its tests to use it. Link: https://github.com/ClangBuiltLinux/linux/issues/1750 Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Acked-by: Daniel Vetter Reviewed-by: Maxime Ripard Signed-off-by: David Gow Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/usage.rst | 10 +++++++--- include/kunit/resource.h | 21 +++++++++++++++++++++ lib/kunit/kunit-test.c | 5 +---- lib/kunit/test.c | 6 ++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index c27e1646ecd9..9db12e91668e 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -651,12 +651,16 @@ For example: } Note that, for functions like device_unregister which only accept a single -pointer-sized argument, it's possible to directly cast that function to -a ``kunit_action_t`` rather than writing a wrapper function, for example: +pointer-sized argument, it's possible to automatically generate a wrapper +with the ``KUNIT_DEFINE_ACTION_WRAPPER()`` macro, for example: .. code-block:: C - kunit_add_action(test, (kunit_action_t *)&device_unregister, &dev); + KUNIT_DEFINE_ACTION_WRAPPER(device_unregister, device_unregister_wrapper, struct device *); + kunit_add_action(test, &device_unregister_wrapper, &dev); + +You should do this in preference to manually casting to the ``kunit_action_t`` type, +as casting function pointers will break Control Flow Integrity (CFI). ``kunit_add_action`` can fail if, for example, the system is out of memory. You can use ``kunit_add_action_or_reset`` instead which runs the action diff --git a/include/kunit/resource.h b/include/kunit/resource.h index c7383e90f5c9..4ad69a2642a5 100644 --- a/include/kunit/resource.h +++ b/include/kunit/resource.h @@ -390,6 +390,27 @@ void kunit_remove_resource(struct kunit *test, struct kunit_resource *res); /* A 'deferred action' function to be used with kunit_add_action. */ typedef void (kunit_action_t)(void *); +/** + * KUNIT_DEFINE_ACTION_WRAPPER() - Wrap a function for use as a deferred action. + * + * @wrapper: The name of the new wrapper function define. + * @orig: The original function to wrap. + * @arg_type: The type of the argument accepted by @orig. + * + * Defines a wrapper for a function which accepts a single, pointer-sized + * argument. This wrapper can then be passed to kunit_add_action() and + * similar. This should be used in preference to casting a function + * directly to kunit_action_t, as casting function pointers will break + * control flow integrity (CFI), leading to crashes. + */ +#define KUNIT_DEFINE_ACTION_WRAPPER(wrapper, orig, arg_type) \ + static void wrapper(void *in) \ + { \ + arg_type arg = (arg_type)in; \ + orig(arg); \ + } + + /** * kunit_add_action() - Call a function when the test ends. * @test: Test case to associate the action with. diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index de2113a58fa0..ee6927c60979 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -538,10 +538,7 @@ static struct kunit_suite kunit_resource_test_suite = { #if IS_BUILTIN(CONFIG_KUNIT_TEST) /* This avoids a cast warning if kfree() is passed direct to kunit_add_action(). */ -static void kfree_wrapper(void *p) -{ - kfree(p); -} +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); static void kunit_log_test(struct kunit *test) { diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 7aceb07a1af9..7deee3701d20 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -810,6 +810,8 @@ static struct notifier_block kunit_mod_nb = { }; #endif +KUNIT_DEFINE_ACTION_WRAPPER(kfree_action_wrapper, kfree, const void *) + void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t gfp) { void *data; @@ -819,7 +821,7 @@ void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t gfp) if (!data) return NULL; - if (kunit_add_action_or_reset(test, (kunit_action_t *)kfree, data) != 0) + if (kunit_add_action_or_reset(test, kfree_action_wrapper, data) != 0) return NULL; return data; @@ -831,7 +833,7 @@ void kunit_kfree(struct kunit *test, const void *ptr) if (!ptr) return; - kunit_release_action(test, (kunit_action_t *)kfree, (void *)ptr); + kunit_release_action(test, kfree_action_wrapper, (void *)ptr); } EXPORT_SYMBOL_GPL(kunit_kfree); From e847934bb124b2ad14bf967d6682e43b0b94c78a Mon Sep 17 00:00:00 2001 From: David Gow Date: Tue, 28 Nov 2023 15:24:06 +0800 Subject: [PATCH 0936/1562] drm/tests: Use KUNIT_DEFINE_ACTION_WRAPPER() In order to pass functions to kunit_add_action(), they need to be of the kunit_action_t type. While casting the function pointer can work, it will break control-flow integrity. drm_kunit_helpers already defines wrappers, but we now have a macro which does this automatically. Using this greatly reduces the boilerplate needed. Acked-by: Maxime Ripard Signed-off-by: David Gow Signed-off-by: Shuah Khan --- drivers/gpu/drm/tests/drm_kunit_helpers.c | 30 +++++++---------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_kunit_helpers.c b/drivers/gpu/drm/tests/drm_kunit_helpers.c index bccb33b900f3..c251e6b34de0 100644 --- a/drivers/gpu/drm/tests/drm_kunit_helpers.c +++ b/drivers/gpu/drm/tests/drm_kunit_helpers.c @@ -27,27 +27,15 @@ static struct platform_driver fake_platform_driver = { }, }; -static void kunit_action_platform_driver_unregister(void *ptr) -{ - struct platform_driver *drv = ptr; - - platform_driver_unregister(drv); - -} - -static void kunit_action_platform_device_put(void *ptr) -{ - struct platform_device *pdev = ptr; - - platform_device_put(pdev); -} - -static void kunit_action_platform_device_del(void *ptr) -{ - struct platform_device *pdev = ptr; - - platform_device_del(pdev); -} +KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_driver_unregister, + platform_driver_unregister, + struct platform_driver *); +KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_put, + platform_device_put, + struct platform_device *); +KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_del, + platform_device_del, + struct platform_device *); /** * drm_kunit_helper_alloc_device - Allocate a mock device for a KUnit test From a08d4d6284393d44ef4e076288c31d04fc469a58 Mon Sep 17 00:00:00 2001 From: David Gow Date: Tue, 28 Nov 2023 15:24:07 +0800 Subject: [PATCH 0937/1562] drm/vc4: tests: Use KUNIT_DEFINE_ACTION_WRAPPER In order to pass functions to kunit_add_action(), they need to be of the kunit_action_t type. While casting the function pointer can work, it will break control-flow integrity. vc4_mock already defines such a wrapper for drm_dev_unregister(), but it involves less boilerplate to use the new macro, so replace the manual implementation. Signed-off-by: David Gow Reviewed-by: Maxime Ripard Signed-off-by: Shuah Khan --- drivers/gpu/drm/vc4/tests/vc4_mock.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/vc4/tests/vc4_mock.c b/drivers/gpu/drm/vc4/tests/vc4_mock.c index 63ca46f4cb35..becb3dbaa548 100644 --- a/drivers/gpu/drm/vc4/tests/vc4_mock.c +++ b/drivers/gpu/drm/vc4/tests/vc4_mock.c @@ -153,12 +153,9 @@ static int __build_mock(struct kunit *test, struct drm_device *drm, return 0; } -static void kunit_action_drm_dev_unregister(void *ptr) -{ - struct drm_device *drm = ptr; - - drm_dev_unregister(drm); -} +KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_drm_dev_unregister, + drm_dev_unregister, + struct drm_device *); static struct vc4_dev *__mock_device(struct kunit *test, bool is_vc5) { From 37f0d37ffce1d162bfaf7f426afca38f6fe15472 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Nov 2023 16:24:57 +0000 Subject: [PATCH 0938/1562] kunit: string-stream-test: Avoid cast warning when testing gfp_t flags Passing a gfp_t to KUNIT_EXPECT_EQ() causes a cast warning: lib/kunit/string-stream-test.c:73:9: sparse: sparse: incorrect type in initializer (different base types) expected long long right_value got restricted gfp_t const __right Avoid this by testing stream->gfp for the expected value and passing the boolean result of this comparison to KUNIT_EXPECT_TRUE(), as was already done a few lines above in string_stream_managed_init_test(). Signed-off-by: Richard Fitzgerald Fixes: d1a0d699bfc0 ("kunit: string-stream: Add tests for freeing resource-managed string_stream") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311181918.0mpCu2Xh-lkp@intel.com/ Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/string-stream-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kunit/string-stream-test.c b/lib/kunit/string-stream-test.c index 06822766f29a..03fb511826f7 100644 --- a/lib/kunit/string-stream-test.c +++ b/lib/kunit/string-stream-test.c @@ -72,7 +72,7 @@ static void string_stream_unmanaged_init_test(struct kunit *test) KUNIT_EXPECT_EQ(test, stream->length, 0); KUNIT_EXPECT_TRUE(test, list_empty(&stream->fragments)); - KUNIT_EXPECT_EQ(test, stream->gfp, GFP_KERNEL); + KUNIT_EXPECT_TRUE(test, (stream->gfp == GFP_KERNEL)); KUNIT_EXPECT_FALSE(test, stream->append_newlines); KUNIT_EXPECT_TRUE(test, string_stream_is_empty(stream)); From 15bf0000147ae9fc8fa4969025f71848fc558cba Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 30 Oct 2023 10:47:46 +0000 Subject: [PATCH 0939/1562] kunit: string-stream: Allow ERR_PTR to be passed to string_stream_destroy() Check the stream pointer passed to string_stream_destroy() for IS_ERR_OR_NULL() instead of only NULL. Whatever alloc_string_stream() returns should be safe to pass to string_stream_destroy(), and that will be an ERR_PTR. It's obviously good practise and generally helpful to also check for NULL pointers so that client cleanup code can call string_stream_destroy() unconditionally - which could include pointers that have never been set to anything and so are NULL. Signed-off-by: Richard Fitzgerald Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/string-stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kunit/string-stream.c b/lib/kunit/string-stream.c index a6f3616c2048..54f4fdcbfac8 100644 --- a/lib/kunit/string-stream.c +++ b/lib/kunit/string-stream.c @@ -173,7 +173,7 @@ void string_stream_destroy(struct string_stream *stream) { KUNIT_STATIC_STUB_REDIRECT(string_stream_destroy, stream); - if (!stream) + if (IS_ERR_OR_NULL(stream)) return; string_stream_clear(stream); From 34dfd5bb2e5507e69d9b6d6c90f546600c7a4977 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 30 Oct 2023 10:47:58 +0000 Subject: [PATCH 0940/1562] kunit: debugfs: Fix unchecked dereference in debugfs_print_results() Move the call to kunit_suite_has_succeeded() after the check that the kunit_suite pointer is valid. This was found by smatch: lib/kunit/debugfs.c:66 debugfs_print_results() warn: variable dereferenced before check 'suite' (see line 63) Signed-off-by: Richard Fitzgerald Reported-by: Dan Carpenter Fixes: 38289a26e1b8 ("kunit: fix debugfs code to use enum kunit_status, not bool") Reviewed-by: Rae Moar Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index 270d185737e6..5bfc18ad5fff 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -60,12 +60,14 @@ static void debugfs_print_result(struct seq_file *seq, struct string_stream *log static int debugfs_print_results(struct seq_file *seq, void *v) { struct kunit_suite *suite = (struct kunit_suite *)seq->private; - enum kunit_status success = kunit_suite_has_succeeded(suite); + enum kunit_status success; struct kunit_case *test_case; if (!suite) return 0; + success = kunit_suite_has_succeeded(suite); + /* Print KTAP header so the debugfs log can be parsed as valid KTAP. */ seq_puts(seq, "KTAP version 1\n"); seq_puts(seq, "1..1\n"); From 1557e89d3af51a4f1bd6870b3117bed651de5dbf Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 30 Oct 2023 10:47:32 +0000 Subject: [PATCH 0941/1562] kunit: debugfs: Handle errors from alloc_string_stream() In kunit_debugfs_create_suite() give up and skip creating the debugfs file if any of the alloc_string_stream() calls return an error or NULL. Only put a value in the log pointer of kunit_suite and kunit_test if it is a valid pointer to a log. This prevents the potential invalid dereference reported by smatch: lib/kunit/debugfs.c:115 kunit_debugfs_create_suite() error: 'suite->log' dereferencing possible ERR_PTR() lib/kunit/debugfs.c:119 kunit_debugfs_create_suite() error: 'test_case->log' dereferencing possible ERR_PTR() Signed-off-by: Richard Fitzgerald Reported-by: Dan Carpenter Fixes: 05e2006ce493 ("kunit: Use string_stream for test log") Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index 5bfc18ad5fff..382706dfb47d 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -111,14 +111,28 @@ static const struct file_operations debugfs_results_fops = { void kunit_debugfs_create_suite(struct kunit_suite *suite) { struct kunit_case *test_case; + struct string_stream *stream; - /* Allocate logs before creating debugfs representation. */ - suite->log = alloc_string_stream(GFP_KERNEL); - string_stream_set_append_newlines(suite->log, true); + /* + * Allocate logs before creating debugfs representation. + * The suite->log and test_case->log pointer are expected to be NULL + * if there isn't a log, so only set it if the log stream was created + * successfully. + */ + stream = alloc_string_stream(GFP_KERNEL); + if (IS_ERR_OR_NULL(stream)) + return; + + string_stream_set_append_newlines(stream, true); + suite->log = stream; kunit_suite_for_each_test_case(suite, test_case) { - test_case->log = alloc_string_stream(GFP_KERNEL); - string_stream_set_append_newlines(test_case->log, true); + stream = alloc_string_stream(GFP_KERNEL); + if (IS_ERR_OR_NULL(stream)) + goto err; + + string_stream_set_append_newlines(stream, true); + test_case->log = stream; } suite->debugfs = debugfs_create_dir(suite->name, debugfs_rootdir); @@ -126,6 +140,12 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) debugfs_create_file(KUNIT_DEBUGFS_RESULTS, S_IFREG | 0444, suite->debugfs, suite, &debugfs_results_fops); + return; + +err: + string_stream_destroy(suite->log); + kunit_suite_for_each_test_case(suite, test_case) + string_stream_destroy(test_case->log); } void kunit_debugfs_destroy_suite(struct kunit_suite *suite) From 8ae27bc7fff4ef467a7964821a6cedb34a05d3b2 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Thu, 7 Dec 2023 21:34:09 +0000 Subject: [PATCH 0942/1562] kunit: tool: fix parsing of test attributes Add parsing of attributes as diagnostic data. Fixes issue with test plan being parsed incorrectly as diagnostic data when located after suite-level attributes. Note that if there does not exist a test plan line, the diagnostic lines between the suite header and the first result will be saved in the suite log rather than the first test case log. Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 79d8832c862a..ce34be15c929 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -450,7 +450,7 @@ def parse_diagnostic(lines: LineStream) -> List[str]: Log of diagnostic lines """ log = [] # type: List[str] - non_diagnostic_lines = [TEST_RESULT, TEST_HEADER, KTAP_START, TAP_START] + non_diagnostic_lines = [TEST_RESULT, TEST_HEADER, KTAP_START, TAP_START, TEST_PLAN] while lines and not any(re.match(lines.peek()) for re in non_diagnostic_lines): log.append(lines.pop()) @@ -726,6 +726,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: # test plan test.name = "main" ktap_line = parse_ktap_header(lines, test) + test.log.extend(parse_diagnostic(lines)) parse_test_plan(lines, test) parent_test = True else: @@ -737,6 +738,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: if parent_test: # If KTAP version line and/or subtest header is found, attempt # to parse test plan and print test header + test.log.extend(parse_diagnostic(lines)) parse_test_plan(lines, test) print_test_header(test) expected_count = test.expected_count From 6eb0ea28c8e80ead03f08cf8cbdacf08d3073bd6 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Thu, 7 Dec 2023 21:34:10 +0000 Subject: [PATCH 0943/1562] kunit: tool: add test for parsing attributes Add test for parsing attributes to kunit_tool_test.py. Test checks attributes are parsed and saved in the test logs. This test also checks that the attributes have not interfered with the parsing of other test information, specifically the suite header as the test plan was being incorrectely parsed. Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 16 ++++++++++++++++ .../kunit/test_data/test_parse_attributes.log | 9 +++++++++ 2 files changed, 25 insertions(+) create mode 100644 tools/testing/kunit/test_data/test_parse_attributes.log diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index b28c1510be2e..2beb7327e53f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -331,6 +331,22 @@ class KUnitParserTest(unittest.TestCase): kunit_parser.parse_run_tests(file.readlines()) self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) + def test_parse_attributes(self): + ktap_log = test_data_path('test_parse_attributes.log') + with open(ktap_log) as file: + result = kunit_parser.parse_run_tests(file.readlines()) + + # Test should pass with no errors + self.assertEqual(result.counts, kunit_parser.TestCounts(passed=1, errors=0)) + self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) + + # Ensure suite header is parsed correctly + self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) + + # Ensure attributes in correct test log + self.assertContains('# module: example', result.subtests[0].log) + self.assertContains('# test.speed: slow', result.subtests[0].subtests[0].log) + def test_show_test_output_on_failure(self): output = """ KTAP version 1 diff --git a/tools/testing/kunit/test_data/test_parse_attributes.log b/tools/testing/kunit/test_data/test_parse_attributes.log new file mode 100644 index 000000000000..1a13c371fe9d --- /dev/null +++ b/tools/testing/kunit/test_data/test_parse_attributes.log @@ -0,0 +1,9 @@ +KTAP version 1 +1..1 + KTAP version 1 + # Subtest: suite + # module: example + 1..1 + # test.speed: slow + ok 1 test +ok 1 suite \ No newline at end of file From 69dfdce1c5161a37a14720e5f6f62a36e387aa33 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:16 +0000 Subject: [PATCH 0944/1562] kunit: move KUNIT_TABLE out of INIT_DATA Alter the linker section of KUNIT_TABLE to move it out of INIT_DATA and into DATA_DATA. Data for KUnit tests does not need to be in the init section. In order to run tests again after boot the KUnit data cannot be labeled as init data as the kernel could write over it. Add a KUNIT_INIT_TABLE in the next patch for KUnit tests that test init data/functions. Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- include/asm-generic/vmlinux.lds.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index bae0fe4d499b..1107905d37fc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -370,7 +370,8 @@ BRANCH_PROFILE() \ TRACE_PRINTKS() \ BPF_RAW_TP() \ - TRACEPOINT_STR() + TRACEPOINT_STR() \ + KUNIT_TABLE() /* * Data section helpers @@ -699,8 +700,7 @@ THERMAL_TABLE(governor) \ EARLYCON_TABLE() \ LSM_TABLE() \ - EARLY_LSM_TABLE() \ - KUNIT_TABLE() + EARLY_LSM_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:17 +0000 Subject: [PATCH 0945/1562] kunit: add KUNIT_INIT_TABLE to init linker section Add KUNIT_INIT_TABLE to the INIT_DATA linker section. Alter the KUnit macros to create init tests: kunit_test_init_section_suites Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and KUNIT_INIT_TABLE. Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- include/asm-generic/vmlinux.lds.h | 9 ++++- include/kunit/test.h | 30 +++++++++------ include/linux/module.h | 2 + kernel/module/main.c | 3 ++ lib/kunit/executor.c | 64 ++++++++++++++++++++++++++++--- lib/kunit/test.c | 26 +++++++++---- 6 files changed, 109 insertions(+), 25 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 1107905d37fc..5dd3a61d673d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -700,7 +700,8 @@ THERMAL_TABLE(governor) \ EARLYCON_TABLE() \ LSM_TABLE() \ - EARLY_LSM_TABLE() + EARLY_LSM_TABLE() \ + KUNIT_INIT_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ @@ -926,6 +927,12 @@ . = ALIGN(8); \ BOUNDED_SECTION_POST_LABEL(.kunit_test_suites, __kunit_suites, _start, _end) +/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ +#define KUNIT_INIT_TABLE() \ + . = ALIGN(8); \ + BOUNDED_SECTION_POST_LABEL(.kunit_init_test_suites, \ + __kunit_init_suites, _start, _end) + #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ . = ALIGN(4); \ diff --git a/include/kunit/test.h b/include/kunit/test.h index 20ed9f9275c9..fe79cd736e94 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -337,6 +337,9 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites); void kunit_exec_run_tests(struct kunit_suite_set *suite_set, bool builtin); void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr); +struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set, + struct kunit_suite_set suite_set); + #if IS_BUILTIN(CONFIG_KUNIT) int kunit_run_all_tests(void); #else @@ -371,6 +374,11 @@ static inline int kunit_run_all_tests(void) #define kunit_test_suite(suite) kunit_test_suites(&suite) +#define __kunit_init_test_suites(unique_array, ...) \ + static struct kunit_suite *unique_array[] \ + __aligned(sizeof(struct kunit_suite *)) \ + __used __section(".kunit_init_test_suites") = { __VA_ARGS__ } + /** * kunit_test_init_section_suites() - used to register one or more &struct * kunit_suite containing init functions or @@ -378,21 +386,21 @@ static inline int kunit_run_all_tests(void) * * @__suites: a statically allocated list of &struct kunit_suite. * - * This functions identically as kunit_test_suites() except that it suppresses - * modpost warnings for referencing functions marked __init or data marked - * __initdata; this is OK because currently KUnit only runs tests upon boot - * during the init phase or upon loading a module during the init phase. + * This functions similar to kunit_test_suites() except that it compiles the + * list of suites during init phase. * - * NOTE TO KUNIT DEVS: If we ever allow KUnit tests to be run after boot, these - * tests must be excluded. + * This macro also suffixes the array and suite declarations it makes with + * _probe; so that modpost suppresses warnings about referencing init data + * for symbols named in this manner. * - * The only thing this macro does that's different from kunit_test_suites is - * that it suffixes the array and suite declarations it makes with _probe; - * modpost suppresses warnings about referencing init data for symbols named in - * this manner. + * Note: these init tests are not able to be run after boot so there is no + * "run" debugfs file generated for these tests. + * + * Also, do not mark the suite or test case structs with __initdata because + * they will be used after the init phase with debugfs. */ #define kunit_test_init_section_suites(__suites...) \ - __kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ + __kunit_init_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ ##__suites) #define kunit_test_init_section_suite(suite) \ diff --git a/include/linux/module.h b/include/linux/module.h index a98e188cf37b..9cd0009bd050 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -540,6 +540,8 @@ struct module { struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) + int num_kunit_init_suites; + struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif diff --git a/kernel/module/main.c b/kernel/module/main.c index 98fedfdb8db5..36681911c05a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); + mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites", + sizeof(*mod->kunit_init_suites), + &mod->num_kunit_init_suites); #endif mod->extable = section_objs(info, "__ex_table", diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 1236b3cd2fbb..847329c51e91 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -12,6 +12,8 @@ */ extern struct kunit_suite * const __kunit_suites_start[]; extern struct kunit_suite * const __kunit_suites_end[]; +extern struct kunit_suite * const __kunit_init_suites_start[]; +extern struct kunit_suite * const __kunit_init_suites_end[]; static char *action_param; @@ -292,6 +294,33 @@ void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr) } } +struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set, + struct kunit_suite_set suite_set) +{ + struct kunit_suite_set total_suite_set = {NULL, NULL}; + struct kunit_suite **total_suite_start = NULL; + size_t init_num_suites, num_suites, suite_size; + + init_num_suites = init_suite_set.end - init_suite_set.start; + num_suites = suite_set.end - suite_set.start; + suite_size = sizeof(suite_set.start); + + /* Allocate memory for array of all kunit suites */ + total_suite_start = kmalloc_array(init_num_suites + num_suites, suite_size, GFP_KERNEL); + if (!total_suite_start) + return total_suite_set; + + /* Append init suites and then all other kunit suites */ + memcpy(total_suite_start, init_suite_set.start, init_num_suites * suite_size); + memcpy(total_suite_start + init_num_suites, suite_set.start, num_suites * suite_size); + + /* Set kunit suite set start and end */ + total_suite_set.start = total_suite_start; + total_suite_set.end = total_suite_start + (init_num_suites + num_suites); + + return total_suite_set; +} + #if IS_BUILTIN(CONFIG_KUNIT) static char *kunit_shutdown; @@ -313,21 +342,41 @@ static void kunit_handle_shutdown(void) int kunit_run_all_tests(void) { - struct kunit_suite_set suite_set = { + struct kunit_suite_set suite_set = {NULL, NULL}; + struct kunit_suite_set filtered_suite_set = {NULL, NULL}; + struct kunit_suite_set init_suite_set = { + __kunit_init_suites_start, __kunit_init_suites_end, + }; + struct kunit_suite_set normal_suite_set = { __kunit_suites_start, __kunit_suites_end, }; + size_t init_num_suites = init_suite_set.end - init_suite_set.start; int err = 0; + + if (init_num_suites > 0) { + suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set); + if (!suite_set.start) + goto out; + } else + suite_set = normal_suite_set; + if (!kunit_enabled()) { pr_info("kunit: disabled\n"); - goto out; + goto free_out; } if (filter_glob_param || filter_param) { - suite_set = kunit_filter_suites(&suite_set, filter_glob_param, + filtered_suite_set = kunit_filter_suites(&suite_set, filter_glob_param, filter_param, filter_action_param, &err); + + /* Free original suite set before using filtered suite set */ + if (init_num_suites > 0) + kfree(suite_set.start); + suite_set = filtered_suite_set; + if (err) { pr_err("kunit executor: error filtering suites: %d\n", err); - goto out; + goto free_out; } } @@ -340,9 +389,12 @@ int kunit_run_all_tests(void) else pr_err("kunit executor: unknown action '%s'\n", action_param); - if (filter_glob_param || filter_param) { /* a copy was made of each suite */ +free_out: + if (filter_glob_param || filter_param) kunit_free_suite_set(suite_set); - } + else if (init_num_suites > 0) + /* Don't use kunit_free_suite_set because suites aren't individually allocated */ + kfree(suite_set.start); out: kunit_handle_shutdown(); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 7deee3701d20..6b60d85ce108 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -742,28 +742,40 @@ EXPORT_SYMBOL_GPL(__kunit_test_suites_exit); #ifdef CONFIG_MODULES static void kunit_module_init(struct module *mod) { - struct kunit_suite_set suite_set = { + struct kunit_suite_set suite_set, filtered_set; + struct kunit_suite_set normal_suite_set = { mod->kunit_suites, mod->kunit_suites + mod->num_kunit_suites, }; + struct kunit_suite_set init_suite_set = { + mod->kunit_init_suites, mod->kunit_init_suites + mod->num_kunit_init_suites, + }; const char *action = kunit_action(); int err = 0; - suite_set = kunit_filter_suites(&suite_set, + if (mod->num_kunit_init_suites > 0) + suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set); + else + suite_set = normal_suite_set; + + filtered_set = kunit_filter_suites(&suite_set, kunit_filter_glob() ?: "*.*", kunit_filter(), kunit_filter_action(), &err); if (err) pr_err("kunit module: error filtering suites: %d\n", err); - mod->kunit_suites = (struct kunit_suite **)suite_set.start; - mod->num_kunit_suites = suite_set.end - suite_set.start; + mod->kunit_suites = (struct kunit_suite **)filtered_set.start; + mod->num_kunit_suites = filtered_set.end - filtered_set.start; + + if (mod->num_kunit_init_suites > 0) + kfree(suite_set.start); if (!action) - kunit_exec_run_tests(&suite_set, false); + kunit_exec_run_tests(&filtered_set, false); else if (!strcmp(action, "list")) - kunit_exec_list_tests(&suite_set, false); + kunit_exec_list_tests(&filtered_set, false); else if (!strcmp(action, "list_attr")) - kunit_exec_list_tests(&suite_set, true); + kunit_exec_list_tests(&filtered_set, true); else pr_err("kunit: unknown action '%s'\n", action); } From 2cf45281570f76f973bd8d17596684d1875002df Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:18 +0000 Subject: [PATCH 0946/1562] kunit: add example suite to test init suites Add example_init_test_suite to allow for testing the feature of running test suites marked as init to indicate they use init data and/or functions. This suite should always pass and uses a simple init function. This suite can also be used to test the is_init attribute introduced in the next patch. Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/kunit-example-test.c | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/kunit/kunit-example-test.c b/lib/kunit/kunit-example-test.c index 6bb5c2ef6696..d2f7a3c62c18 100644 --- a/lib/kunit/kunit-example-test.c +++ b/lib/kunit/kunit-example-test.c @@ -287,4 +287,41 @@ static struct kunit_suite example_test_suite = { */ kunit_test_suites(&example_test_suite); +static int __init init_add(int x, int y) +{ + return (x + y); +} + +/* + * This test should always pass. Can be used to test init suites. + */ +static void __init example_init_test(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, init_add(1, 1), 2); +} + +/* + * The kunit_case struct cannot be marked as __initdata as this will be + * used in debugfs to retrieve results after test has run + */ +static struct kunit_case __refdata example_init_test_cases[] = { + KUNIT_CASE(example_init_test), + {} +}; + +/* + * The kunit_suite struct cannot be marked as __initdata as this will be + * used in debugfs to retrieve results after test has run + */ +static struct kunit_suite example_init_test_suite = { + .name = "example_init", + .test_cases = example_init_test_cases, +}; + +/* + * This registers the test suite and marks the suite as using init data + * and/or functions. + */ +kunit_test_init_section_suites(&example_init_test_suite); + MODULE_LICENSE("GPL v2"); From 6c4ea2f48de9860217ddfedee081d485dbeea7e8 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:19 +0000 Subject: [PATCH 0947/1562] kunit: add is_init test attribute Add is_init test attribute of type bool. Add to_string, get, and filter methods to lib/kunit/attributes.c. Mark each of the tests in the init section with the is_init=true attribute. Add is_init to the attributes documentation. Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- .../dev-tools/kunit/running_tips.rst | 7 +++ include/kunit/test.h | 1 + lib/kunit/attributes.c | 60 +++++++++++++++++++ lib/kunit/executor.c | 6 +- 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kunit/running_tips.rst b/Documentation/dev-tools/kunit/running_tips.rst index 766f9cdea0fa..024e9ad1d1e9 100644 --- a/Documentation/dev-tools/kunit/running_tips.rst +++ b/Documentation/dev-tools/kunit/running_tips.rst @@ -428,3 +428,10 @@ This attribute indicates the name of the module associated with the test. This attribute is automatically saved as a string and is printed for each suite. Tests can also be filtered using this attribute. + +``is_init`` + +This attribute indicates whether the test uses init data or functions. + +This attribute is automatically saved as a boolean and tests can also be +filtered using this attribute. diff --git a/include/kunit/test.h b/include/kunit/test.h index fe79cd736e94..b163b9984b33 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -253,6 +253,7 @@ struct kunit_suite { struct dentry *debugfs; struct string_stream *log; int suite_init_err; + bool is_init; }; /* Stores an array of suites, end points one past the end */ diff --git a/lib/kunit/attributes.c b/lib/kunit/attributes.c index 1b512f7e1838..2cf04cc09372 100644 --- a/lib/kunit/attributes.c +++ b/lib/kunit/attributes.c @@ -58,6 +58,16 @@ static const char *attr_enum_to_string(void *attr, const char * const str_list[] return str_list[val]; } +static const char *attr_bool_to_string(void *attr, bool *to_free) +{ + bool val = (bool)attr; + + *to_free = false; + if (val) + return "true"; + return "false"; +} + static const char *attr_speed_to_string(void *attr, bool *to_free) { return attr_enum_to_string(attr, speed_str_list, to_free); @@ -166,6 +176,37 @@ static int attr_string_filter(void *attr, const char *input, int *err) return false; } +static int attr_bool_filter(void *attr, const char *input, int *err) +{ + int i, input_int = -1; + long val = (long)attr; + const char *input_str = NULL; + + for (i = 0; input[i]; i++) { + if (!strchr(op_list, input[i])) { + input_str = input + i; + break; + } + } + + if (!input_str) { + *err = -EINVAL; + pr_err("kunit executor: filter value not found: %s\n", input); + return false; + } + + if (!strcmp(input_str, "true")) + input_int = (int)true; + else if (!strcmp(input_str, "false")) + input_int = (int)false; + else { + *err = -EINVAL; + pr_err("kunit executor: invalid filter input: %s\n", input); + return false; + } + + return int_filter(val, input, input_int, err); +} /* Get Attribute Methods */ @@ -194,6 +235,17 @@ static void *attr_module_get(void *test_or_suite, bool is_test) return (void *) ""; } +static void *attr_is_init_get(void *test_or_suite, bool is_test) +{ + struct kunit_suite *suite = is_test ? NULL : test_or_suite; + struct kunit_case *test = is_test ? test_or_suite : NULL; + + if (test) + return ((void *) NULL); + else + return ((void *) suite->is_init); +} + /* List of all Test Attributes */ static struct kunit_attr kunit_attr_list[] = { @@ -212,6 +264,14 @@ static struct kunit_attr kunit_attr_list[] = { .filter = attr_string_filter, .attr_default = (void *)"", .print = PRINT_SUITE, + }, + { + .name = "is_init", + .get_attr = attr_is_init_get, + .to_string = attr_bool_to_string, + .filter = attr_bool_filter, + .attr_default = (void *)false, + .print = PRINT_SUITE, } }; diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 847329c51e91..717b9599036b 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -300,6 +300,7 @@ struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_ struct kunit_suite_set total_suite_set = {NULL, NULL}; struct kunit_suite **total_suite_start = NULL; size_t init_num_suites, num_suites, suite_size; + int i = 0; init_num_suites = init_suite_set.end - init_suite_set.start; num_suites = suite_set.end - suite_set.start; @@ -310,8 +311,11 @@ struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_ if (!total_suite_start) return total_suite_set; - /* Append init suites and then all other kunit suites */ + /* Append and mark init suites and then append all other kunit suites */ memcpy(total_suite_start, init_suite_set.start, init_num_suites * suite_size); + for (i = 0; i < init_num_suites; i++) + total_suite_start[i]->is_init = true; + memcpy(total_suite_start + init_num_suites, suite_set.start, num_suites * suite_size); /* Set kunit suite set start and end */ From c72a870926c2de694942aaac2b49e59ce789bb74 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:20 +0000 Subject: [PATCH 0948/1562] kunit: add ability to run tests after boot using debugfs Add functionality to run built-in tests after boot by writing to a debugfs file. Add a new debugfs file labeled "run" for each test suite to use for this purpose. As an example, write to the file using the following: echo "any string" > /sys/kernel/debugfs/kunit//run This will trigger the test suite to run and will print results to the kernel log. To guard against running tests concurrently with this feature, add a mutex lock around running kunit. This supports the current practice of not allowing tests to be run concurrently on the same kernel. This new functionality could be used to design a parameter injection feature in the future. Fixed up merge conflict duing rebase to Linux 6.7-rc6 Signed-off-by: Shuah Khan Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ lib/kunit/test.c | 10 +++++++ 2 files changed, 78 insertions(+) diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index 382706dfb47d..d548750a325a 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -8,12 +8,14 @@ #include #include +#include #include "string-stream.h" #include "debugfs.h" #define KUNIT_DEBUGFS_ROOT "kunit" #define KUNIT_DEBUGFS_RESULTS "results" +#define KUNIT_DEBUGFS_RUN "run" /* * Create a debugfs representation of test suites: @@ -21,6 +23,8 @@ * Path Semantics * /sys/kernel/debug/kunit//results Show results of last run for * testsuite + * /sys/kernel/debug/kunit//run Write to this file to trigger + * testsuite to run * */ @@ -101,6 +105,51 @@ static int debugfs_results_open(struct inode *inode, struct file *file) return single_open(file, debugfs_print_results, suite); } +/* + * Print a usage message to the debugfs "run" file + * (/sys/kernel/debug/kunit//run) if opened. + */ +static int debugfs_print_run(struct seq_file *seq, void *v) +{ + struct kunit_suite *suite = (struct kunit_suite *)seq->private; + + seq_puts(seq, "Write to this file to trigger the test suite to run.\n"); + seq_printf(seq, "usage: echo \"any string\" > /sys/kernel/debugfs/kunit/%s/run\n", + suite->name); + return 0; +} + +/* + * The debugfs "run" file (/sys/kernel/debug/kunit//run) + * contains no information. Write to the file to trigger the test suite + * to run. + */ +static int debugfs_run_open(struct inode *inode, struct file *file) +{ + struct kunit_suite *suite; + + suite = (struct kunit_suite *)inode->i_private; + + return single_open(file, debugfs_print_run, suite); +} + +/* + * Trigger a test suite to run by writing to the suite's "run" debugfs + * file found at: /sys/kernel/debug/kunit//run + * + * Note: what is written to this file will not be saved. + */ +static ssize_t debugfs_run(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct inode *f_inode = file->f_inode; + struct kunit_suite *suite = (struct kunit_suite *) f_inode->i_private; + + __kunit_test_suites_init(&suite, 1); + + return count; +} + static const struct file_operations debugfs_results_fops = { .open = debugfs_results_open, .read = seq_read, @@ -108,11 +157,23 @@ static const struct file_operations debugfs_results_fops = { .release = debugfs_release, }; +static const struct file_operations debugfs_run_fops = { + .open = debugfs_run_open, + .read = seq_read, + .write = debugfs_run, + .llseek = seq_lseek, + .release = debugfs_release, +}; + void kunit_debugfs_create_suite(struct kunit_suite *suite) { struct kunit_case *test_case; struct string_stream *stream; + /* If suite log already allocated, do not create new debugfs files. */ + if (suite->log) + return; + /* * Allocate logs before creating debugfs representation. * The suite->log and test_case->log pointer are expected to be NULL @@ -140,6 +201,13 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) debugfs_create_file(KUNIT_DEBUGFS_RESULTS, S_IFREG | 0444, suite->debugfs, suite, &debugfs_results_fops); + + /* Do not create file to re-run test if test runs on init */ + if (!suite->is_init) { + debugfs_create_file(KUNIT_DEBUGFS_RUN, S_IFREG | 0644, + suite->debugfs, + suite, &debugfs_run_fops); + } return; err: diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 6b60d85ce108..088489856db8 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,8 @@ #include "string-stream.h" #include "try-catch-impl.h" +static DEFINE_MUTEX(kunit_run_lock); + /* * Hook to fail the current test and print an error message to the log. */ @@ -692,6 +695,7 @@ static void kunit_init_suite(struct kunit_suite *suite) kunit_debugfs_create_suite(suite); suite->status_comment[0] = '\0'; suite->suite_init_err = 0; + string_stream_clear(suite->log); } bool kunit_enabled(void) @@ -710,6 +714,11 @@ int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_ kunit_suite_counter = 1; + /* Use mutex lock to guard against running tests concurrently. */ + if (mutex_lock_interruptible(&kunit_run_lock)) { + pr_err("kunit: test interrupted\n"); + return -EINTR; + } static_branch_inc(&kunit_running); for (i = 0; i < num_suites; i++) { @@ -718,6 +727,7 @@ int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_ } static_branch_dec(&kunit_running); + mutex_unlock(&kunit_run_lock); return 0; } EXPORT_SYMBOL_GPL(__kunit_test_suites_init); From e9f0e21ceb65ea5e450ede7c9b9a5bfc90a403ae Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:21 +0000 Subject: [PATCH 0949/1562] Documentation: Add debugfs docs with run after boot Expand the documentation on the KUnit debugfs filesystem on the run_manual.rst page. Add section describing how to access results using debugfs. Add section describing how to run tests after boot using debugfs. Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/run_manual.rst | 51 ++++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kunit/run_manual.rst b/Documentation/dev-tools/kunit/run_manual.rst index e7b46421f247..699d92885075 100644 --- a/Documentation/dev-tools/kunit/run_manual.rst +++ b/Documentation/dev-tools/kunit/run_manual.rst @@ -49,9 +49,52 @@ loaded. The results will appear in TAP format in ``dmesg``. +debugfs +======= + +KUnit can be accessed from userspace via the debugfs filesystem (See more +information about debugfs at Documentation/filesystems/debugfs.rst). + +If ``CONFIG_KUNIT_DEBUGFS`` is enabled, the KUnit debugfs filesystem is +mounted at /sys/kernel/debug/kunit. You can use this filesystem to perform +the following actions. + +Retrieve Test Results +===================== + +You can use debugfs to retrieve KUnit test results. The test results are +accessible from the debugfs filesystem in the following read-only file: + +.. code-block :: bash + + /sys/kernel/debug/kunit//results + +The test results are printed in a KTAP document. Note this document is separate +to the kernel log and thus, may have different test suite numbering. + +Run Tests After Kernel Has Booted +================================= + +You can use the debugfs filesystem to trigger built-in tests to run after +boot. To run the test suite, you can use the following command to write to +the ``/sys/kernel/debug/kunit//run`` file: + +.. code-block :: bash + + echo "any string" > /sys/kernel/debugfs/kunit//run + +As a result, the test suite runs and the results are printed to the kernel +log. + +However, this feature is not available with KUnit suites that use init data, +because init data may have been discarded after the kernel boots. KUnit +suites that use init data should be defined using the +kunit_test_init_section_suites() macro. + +Also, you cannot use this feature to run tests concurrently. Instead a test +will wait to run until other tests have completed or failed. + .. note :: - If ``CONFIG_KUNIT_DEBUGFS`` is enabled, KUnit test results will - be accessible from the ``debugfs`` filesystem (if mounted). - They will be in ``/sys/kernel/debug/kunit//results``, in - TAP format. + For test authors, to use this feature, tests will need to correctly initialise + and/or clean up any data, so the test runs correctly a second time. From d03c720e03bd9bf0b784d80b5d3ede7e2daf3b6e Mon Sep 17 00:00:00 2001 From: "davidgow@google.com" Date: Fri, 15 Dec 2023 15:39:08 +0800 Subject: [PATCH 0950/1562] kunit: Add APIs for managing devices Tests for drivers often require a struct device to pass to other functions. While it's possible to create these with root_device_register(), or to use something like a platform device, this is both a misuse of those APIs, and can be difficult to clean up after, for example, a failed assertion. Add some KUnit-specific functions for registering and unregistering a struct device: - kunit_device_register() - kunit_device_register_with_driver() - kunit_device_unregister() These helpers allocate a on a 'kunit' bus which will either probe the driver passed in (kunit_device_register_with_driver), or will create a stub driver (kunit_device_register) which is cleaned up on test shutdown. Devices are automatically unregistered on test shutdown, but can be manually unregistered earlier with kunit_device_unregister() in order to, for example, test device release code. Reviewed-by: Matti Vaittinen Reviewed-by: Maxime Ripard Signed-off-by: David Gow Reviewed-by: Greg Kroah-Hartman Signed-off-by: Shuah Khan --- .../dev-tools/kunit/api/resource.rst | 9 + Documentation/dev-tools/kunit/usage.rst | 50 +++++ include/kunit/device.h | 80 ++++++++ lib/kunit/Makefile | 3 +- lib/kunit/device-impl.h | 17 ++ lib/kunit/device.c | 181 ++++++++++++++++++ lib/kunit/kunit-test.c | 134 ++++++++++++- lib/kunit/test.c | 3 + 8 files changed, 475 insertions(+), 2 deletions(-) create mode 100644 include/kunit/device.h create mode 100644 lib/kunit/device-impl.h create mode 100644 lib/kunit/device.c diff --git a/Documentation/dev-tools/kunit/api/resource.rst b/Documentation/dev-tools/kunit/api/resource.rst index 0a94f831259e..ec6002a6b0db 100644 --- a/Documentation/dev-tools/kunit/api/resource.rst +++ b/Documentation/dev-tools/kunit/api/resource.rst @@ -11,3 +11,12 @@ state on a per-test basis, register custom cleanup actions, and more. .. kernel-doc:: include/kunit/resource.h :internal: + +Managed Devices +--------------- + +Functions for using KUnit-managed struct device and struct device_driver. +Include ``kunit/device.h`` to use these. + +.. kernel-doc:: include/kunit/device.h + :internal: diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 9db12e91668e..53c6f7dc8a42 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -797,3 +797,53 @@ structures as shown below: KUnit is not enabled, or if no test is running in the current task, it will do nothing. This compiles down to either a no-op or a static key check, so will have a negligible performance impact when no test is running. + +Managing Fake Devices and Drivers +--------------------------------- + +When testing drivers or code which interacts with drivers, many functions will +require a ``struct device`` or ``struct device_driver``. In many cases, setting +up a real device is not required to test any given function, so a fake device +can be used instead. + +KUnit provides helper functions to create and manage these fake devices, which +are internally of type ``struct kunit_device``, and are attached to a special +``kunit_bus``. These devices support managed device resources (devres), as +described in Documentation/driver-api/driver-model/devres.rst + +To create a KUnit-managed ``struct device_driver``, use ``kunit_driver_create()``, +which will create a driver with the given name, on the ``kunit_bus``. This driver +will automatically be destroyed when the corresponding test finishes, but can also +be manually destroyed with ``driver_unregister()``. + +To create a fake device, use the ``kunit_device_register()``, which will create +and register a device, using a new KUnit-managed driver created with ``kunit_driver_create()``. +To provide a specific, non-KUnit-managed driver, use ``kunit_device_register_with_driver()`` +instead. Like with managed drivers, KUnit-managed fake devices are automatically +cleaned up when the test finishes, but can be manually cleaned up early with +``kunit_device_unregister()``. + +The KUnit devices should be used in preference to ``root_device_register()``, and +instead of ``platform_device_register()`` in cases where the device is not otherwise +a platform device. + +For example: + +.. code-block:: c + + #include + + static void test_my_device(struct kunit *test) + { + struct device *fake_device; + const char *dev_managed_string; + + // Create a fake device. + fake_device = kunit_device_register(test, "my_device"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, fake_device) + + // Pass it to functions which need a device. + dev_managed_string = devm_kstrdup(fake_device, "Hello, World!"); + + // Everything is cleaned up automatically when the test ends. + } \ No newline at end of file diff --git a/include/kunit/device.h b/include/kunit/device.h new file mode 100644 index 000000000000..2450110ad64e --- /dev/null +++ b/include/kunit/device.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KUnit basic device implementation + * + * Helpers for creating and managing fake devices for KUnit tests. + * + * Copyright (C) 2023, Google LLC. + * Author: David Gow + */ + +#ifndef _KUNIT_DEVICE_H +#define _KUNIT_DEVICE_H + +#if IS_ENABLED(CONFIG_KUNIT) + +#include + +struct device; +struct device_driver; + +/** + * kunit_driver_create() - Create a struct device_driver attached to the kunit_bus + * @test: The test context object. + * @name: The name to give the created driver. + * + * Creates a struct device_driver attached to the kunit_bus, with the name @name. + * This driver will automatically be cleaned up on test exit. + * + * Return: a stub struct device_driver, managed by KUnit, with the name @name. + */ +struct device_driver *kunit_driver_create(struct kunit *test, const char *name); + +/** + * kunit_device_register() - Create a struct device for use in KUnit tests + * @test: The test context object. + * @name: The name to give the created device. + * + * Creates a struct kunit_device (which is a struct device) with the given name, + * and a corresponding driver. The device and driver will be cleaned up on test + * exit, or when kunit_device_unregister is called. See also + * kunit_device_register_with_driver, if you wish to provide your own + * struct device_driver. + * + * Return: a pointer to a struct device which will be cleaned up when the test + * exits, or an error pointer if the device could not be allocated or registered. + */ +struct device *kunit_device_register(struct kunit *test, const char *name); + +/** + * kunit_device_register_with_driver() - Create a struct device for use in KUnit tests + * @test: The test context object. + * @name: The name to give the created device. + * @drv: The struct device_driver to associate with the device. + * + * Creates a struct kunit_device (which is a struct device) with the given + * name, and driver. The device will be cleaned up on test exit, or when + * kunit_device_unregister is called. See also kunit_device_register, if you + * wish KUnit to create and manage a driver for you. + * + * Return: a pointer to a struct device which will be cleaned up when the test + * exits, or an error pointer if the device could not be allocated or registered. + */ +struct device *kunit_device_register_with_driver(struct kunit *test, + const char *name, + const struct device_driver *drv); + +/** + * kunit_device_unregister() - Unregister a KUnit-managed device + * @test: The test context object which created the device + * @dev: The device. + * + * Unregisters and destroys a struct device which was created with + * kunit_device_register or kunit_device_register_with_driver. If KUnit created + * a driver, cleans it up as well. + */ +void kunit_device_unregister(struct kunit *test, struct device *dev); + +#endif + +#endif diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile index 46f75f23dfe4..309659a32a78 100644 --- a/lib/kunit/Makefile +++ b/lib/kunit/Makefile @@ -7,7 +7,8 @@ kunit-objs += test.o \ assert.o \ try-catch.o \ executor.o \ - attributes.o + attributes.o \ + device.o ifeq ($(CONFIG_KUNIT_DEBUGFS),y) kunit-objs += debugfs.o diff --git a/lib/kunit/device-impl.h b/lib/kunit/device-impl.h new file mode 100644 index 000000000000..54bd55836405 --- /dev/null +++ b/lib/kunit/device-impl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KUnit internal header for device helpers + * + * Header for KUnit-internal driver / bus management. + * + * Copyright (C) 2023, Google LLC. + * Author: David Gow + */ + +#ifndef _KUNIT_DEVICE_IMPL_H +#define _KUNIT_DEVICE_IMPL_H + +// For internal use only -- registers the kunit_bus. +int kunit_bus_init(void); + +#endif //_KUNIT_DEVICE_IMPL_H diff --git a/lib/kunit/device.c b/lib/kunit/device.c new file mode 100644 index 000000000000..1db4305b615a --- /dev/null +++ b/lib/kunit/device.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit-managed device implementation + * + * Implementation of struct kunit_device helpers for fake devices whose + * lifecycle is managed by KUnit. + * + * Copyright (C) 2023, Google LLC. + * Author: David Gow + */ + +#include + +#include +#include +#include + +#include "device-impl.h" + +/* Wrappers for use with kunit_add_action() */ +KUNIT_DEFINE_ACTION_WRAPPER(device_unregister_wrapper, device_unregister, struct device *); +KUNIT_DEFINE_ACTION_WRAPPER(driver_unregister_wrapper, driver_unregister, struct device_driver *); + +/* The root device for the KUnit bus, parent of all kunit_devices. */ +static struct device *kunit_bus_device; + +/* A device owned by a KUnit test. */ +struct kunit_device { + struct device dev; + /* The KUnit test which owns this device. */ + struct kunit *owner; + /* If the driver is managed by KUnit and unique to this device. */ + const struct device_driver *driver; +}; + +#define to_kunit_device(d) container_of_const(d, struct kunit_device, dev) + +static struct bus_type kunit_bus_type = { + .name = "kunit", +}; + +/* Register the 'kunit_bus' used for fake devices. */ +int kunit_bus_init(void) +{ + int error; + + kunit_bus_device = root_device_register("kunit"); + if (!kunit_bus_device) + return -ENOMEM; + + error = bus_register(&kunit_bus_type); + if (error) + bus_unregister(&kunit_bus_type); + return error; +} + +/* Release a 'fake' KUnit device. */ +static void kunit_device_release(struct device *d) +{ + kfree(to_kunit_device(d)); +} + +/** + * Create and register a KUnit-managed struct device_driver on the kunit_bus. + * Returns an error pointer on failure. + */ +struct device_driver *kunit_driver_create(struct kunit *test, const char *name) +{ + struct device_driver *driver; + int err = -ENOMEM; + + driver = kunit_kzalloc(test, sizeof(*driver), GFP_KERNEL); + + if (!driver) + return ERR_PTR(err); + + driver->name = name; + driver->bus = &kunit_bus_type; + driver->owner = THIS_MODULE; + + err = driver_register(driver); + if (err) { + kunit_kfree(test, driver); + return ERR_PTR(err); + } + + kunit_add_action(test, driver_unregister_wrapper, driver); + return driver; +} +EXPORT_SYMBOL_GPL(kunit_driver_create); + +/* Helper which creates a kunit_device, attaches it to the kunit_bus*/ +static struct kunit_device *kunit_device_register_internal(struct kunit *test, + const char *name, + const struct device_driver *drv) +{ + struct kunit_device *kunit_dev; + int err = -ENOMEM; + + kunit_dev = kzalloc(sizeof(*kunit_dev), GFP_KERNEL); + if (!kunit_dev) + return ERR_PTR(err); + + kunit_dev->owner = test; + + err = dev_set_name(&kunit_dev->dev, "%s.%s", test->name, name); + if (err) { + kfree(kunit_dev); + return ERR_PTR(err); + } + + kunit_dev->dev.release = kunit_device_release; + kunit_dev->dev.bus = &kunit_bus_type; + kunit_dev->dev.parent = kunit_bus_device; + + err = device_register(&kunit_dev->dev); + if (err) { + put_device(&kunit_dev->dev); + return ERR_PTR(err); + } + + kunit_add_action(test, device_unregister_wrapper, &kunit_dev->dev); + + return kunit_dev; +} + +/** + * Create and register a new KUnit-managed device, using the user-supplied device_driver. + * On failure, returns an error pointer. + */ +struct device *kunit_device_register_with_driver(struct kunit *test, + const char *name, + const struct device_driver *drv) +{ + struct kunit_device *kunit_dev = kunit_device_register_internal(test, name, drv); + + if (IS_ERR_OR_NULL(kunit_dev)) + return ERR_CAST(kunit_dev); + + return &kunit_dev->dev; +} +EXPORT_SYMBOL_GPL(kunit_device_register_with_driver); + +/** + * Create and register a new KUnit-managed device, including a matching device_driver. + * On failure, returns an error pointer. + */ +struct device *kunit_device_register(struct kunit *test, const char *name) +{ + struct device_driver *drv; + struct kunit_device *dev; + + drv = kunit_driver_create(test, name); + if (IS_ERR(drv)) + return ERR_CAST(drv); + + dev = kunit_device_register_internal(test, name, drv); + if (IS_ERR(dev)) { + kunit_release_action(test, driver_unregister_wrapper, (void *)drv); + return ERR_CAST(dev); + } + + /* Request the driver be freed. */ + dev->driver = drv; + + + return &dev->dev; +} +EXPORT_SYMBOL_GPL(kunit_device_register); + +/* Unregisters a KUnit-managed device early (including the driver, if automatically created). */ +void kunit_device_unregister(struct kunit *test, struct device *dev) +{ + const struct device_driver *driver = to_kunit_device(dev)->driver; + + kunit_release_action(test, device_unregister_wrapper, dev); + if (driver) + kunit_release_action(test, driver_unregister_wrapper, (void *)driver); +} +EXPORT_SYMBOL_GPL(kunit_device_unregister); + diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index ee6927c60979..c4259d910356 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -5,9 +5,13 @@ * Copyright (C) 2019, Google LLC. * Author: Brendan Higgins */ +#include "linux/gfp_types.h" #include #include +#include +#include + #include "string-stream.h" #include "try-catch-impl.h" @@ -687,6 +691,134 @@ static struct kunit_case kunit_current_test_cases[] = { {} }; +static void test_dev_action(void *priv) +{ + *(void **)priv = (void *)1; +} + +static void kunit_device_test(struct kunit *test) +{ + struct device *test_device; + long action_was_run = 0; + + test_device = kunit_device_register(test, "my_device"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_device); + + // Add an action to verify cleanup. + devm_add_action(test_device, test_dev_action, &action_was_run); + + KUNIT_EXPECT_EQ(test, action_was_run, 0); + + kunit_device_unregister(test, test_device); + + KUNIT_EXPECT_EQ(test, action_was_run, 1); +} + +static void kunit_device_cleanup_test(struct kunit *test) +{ + struct device *test_device; + long action_was_run = 0; + + test_device = kunit_device_register(test, "my_device"); + KUNIT_ASSERT_NOT_NULL(test, test_device); + + /* Add an action to verify cleanup. */ + devm_add_action(test_device, test_dev_action, &action_was_run); + + KUNIT_EXPECT_EQ(test, action_was_run, 0); + + /* Force KUnit to run cleanup early. */ + kunit_cleanup(test); + + KUNIT_EXPECT_EQ(test, action_was_run, 1); +} + +struct driver_test_state { + bool driver_device_probed; + bool driver_device_removed; + long action_was_run; +}; + +static int driver_probe_hook(struct device *dev) +{ + struct kunit *test = kunit_get_current_test(); + struct driver_test_state *state = (struct driver_test_state *)test->priv; + + state->driver_device_probed = true; + return 0; +} + +static int driver_remove_hook(struct device *dev) +{ + struct kunit *test = kunit_get_current_test(); + struct driver_test_state *state = (struct driver_test_state *)test->priv; + + state->driver_device_removed = true; + return 0; +} + +static void kunit_device_driver_test(struct kunit *test) +{ + struct device_driver *test_driver; + struct device *test_device; + struct driver_test_state *test_state = kunit_kzalloc(test, sizeof(*test_state), GFP_KERNEL); + + test->priv = test_state; + test_driver = kunit_driver_create(test, "my_driver"); + + // This can fail with an error pointer. + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_driver); + + test_driver->probe = driver_probe_hook; + test_driver->remove = driver_remove_hook; + + test_device = kunit_device_register_with_driver(test, "my_device", test_driver); + + // This can fail with an error pointer. + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_device); + + // Make sure the probe function was called. + KUNIT_ASSERT_TRUE(test, test_state->driver_device_probed); + + // Add an action to verify cleanup. + devm_add_action(test_device, test_dev_action, &test_state->action_was_run); + + KUNIT_EXPECT_EQ(test, test_state->action_was_run, 0); + + kunit_device_unregister(test, test_device); + test_device = NULL; + + // Make sure the remove hook was called. + KUNIT_ASSERT_TRUE(test, test_state->driver_device_removed); + + // We're going to test this again. + test_state->driver_device_probed = false; + + // The driver should not automatically be destroyed by + // kunit_device_unregister, so we can re-use it. + test_device = kunit_device_register_with_driver(test, "my_device", test_driver); + + // This can fail with an error pointer. + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, test_device); + + // Probe was called again. + KUNIT_ASSERT_TRUE(test, test_state->driver_device_probed); + + // Everything is automatically freed here. +} + +static struct kunit_case kunit_device_test_cases[] = { + KUNIT_CASE(kunit_device_test), + KUNIT_CASE(kunit_device_cleanup_test), + KUNIT_CASE(kunit_device_driver_test), + {} +}; + +static struct kunit_suite kunit_device_test_suite = { + .name = "kunit_device", + .test_cases = kunit_device_test_cases, +}; + static struct kunit_suite kunit_current_test_suite = { .name = "kunit_current", .test_cases = kunit_current_test_cases, @@ -694,6 +826,6 @@ static struct kunit_suite kunit_current_test_suite = { kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite, &kunit_log_test_suite, &kunit_status_test_suite, - &kunit_current_test_suite); + &kunit_current_test_suite, &kunit_device_test_suite); MODULE_LICENSE("GPL v2"); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 088489856db8..14a77b2d702a 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -19,6 +19,7 @@ #include #include "debugfs.h" +#include "device-impl.h" #include "hooks-impl.h" #include "string-stream.h" #include "try-catch-impl.h" @@ -900,6 +901,8 @@ static int __init kunit_init(void) kunit_install_hooks(); kunit_debugfs_init(); + + kunit_bus_init(); #ifdef CONFIG_MODULES return register_module_notifier(&kunit_mod_nb); #else From 46ee8f688e43d1b3a81a3494e59f9861d4de73bf Mon Sep 17 00:00:00 2001 From: "davidgow@google.com" Date: Fri, 15 Dec 2023 15:39:09 +0800 Subject: [PATCH 0951/1562] fortify: test: Use kunit_device Using struct root_device to create fake devices for tests is something of a hack. The new struct kunit_device is meant for this purpose, so use it instead. Reviewed-by: Matti Vaittinen Acked-by: Kees Cook Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/fortify_kunit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index c8c33cbaae9e..2e4fedc81621 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -15,6 +15,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -269,7 +270,7 @@ DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc) size_t len; \ \ /* Create dummy device for devm_kmalloc()-family tests. */ \ - dev = root_device_register(dev_name); \ + dev = kunit_device_register(test, dev_name); \ KUNIT_ASSERT_FALSE_MSG(test, IS_ERR(dev), \ "Cannot register test device\n"); \ \ @@ -303,7 +304,7 @@ DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc) checker(len, devm_kmemdup(dev, "Ohai", len, gfp), \ devm_kfree(dev, p)); \ \ - device_unregister(dev); \ + kunit_device_unregister(test, dev); \ } while (0) DEFINE_ALLOC_SIZE_TEST_PAIR(devm_kmalloc) From 837018388e18bd740fb4f4371858f3e3a477fab8 Mon Sep 17 00:00:00 2001 From: "davidgow@google.com" Date: Fri, 15 Dec 2023 15:39:10 +0800 Subject: [PATCH 0952/1562] overflow: Replace fake root_device with kunit_device Using struct root_device to create fake devices for tests is something of a hack. The new struct kunit_device is meant for this purpose, so use it instead. Reviewed-by: Matti Vaittinen Acked-by: Kees Cook Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/overflow_kunit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index 34db0b3aa502..c527f6b75789 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -618,7 +619,7 @@ static void overflow_allocation_test(struct kunit *test) } while (0) /* Create dummy device for devm_kmalloc()-family tests. */ - dev = root_device_register(device_name); + dev = kunit_device_register(test, device_name); KUNIT_ASSERT_FALSE_MSG(test, IS_ERR(dev), "Cannot register test device\n"); @@ -634,8 +635,6 @@ static void overflow_allocation_test(struct kunit *test) check_allocation_overflow(devm_kmalloc); check_allocation_overflow(devm_kzalloc); - device_unregister(dev); - kunit_info(test, "%d allocation overflow tests finished\n", count); #undef check_allocation_overflow } From e57cdff0ddc493f23d510b0848a17d81028e329b Mon Sep 17 00:00:00 2001 From: "davidgow@google.com" Date: Fri, 15 Dec 2023 15:39:11 +0800 Subject: [PATCH 0953/1562] ASoC: topology: Replace fake root_device with kunit_device in tests Using struct root_device to create fake devices for tests is something of a hack. The new struct kunit_device is meant for this purpose, so use it instead. Acked-by: Mark Brown Signed-off-by: David Gow Signed-off-by: Shuah Khan --- sound/soc/soc-topology-test.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sound/soc/soc-topology-test.c b/sound/soc/soc-topology-test.c index 2cd3540cec04..70cbccc42a42 100644 --- a/sound/soc/soc-topology-test.c +++ b/sound/soc/soc-topology-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* ===== HELPER FUNCTIONS =================================================== */ @@ -21,26 +22,19 @@ */ static struct device *test_dev; -static struct device_driver test_drv = { - .name = "sound-soc-topology-test-driver", -}; - static int snd_soc_tplg_test_init(struct kunit *test) { - test_dev = root_device_register("sound-soc-topology-test"); + test_dev = kunit_device_register(test, "sound-soc-topology-test"); test_dev = get_device(test_dev); if (!test_dev) return -ENODEV; - test_dev->driver = &test_drv; - return 0; } static void snd_soc_tplg_test_exit(struct kunit *test) { put_device(test_dev); - root_device_unregister(test_dev); } /* From d393acce7b3f046a1086362317a05f2cac01fa89 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 15 Dec 2023 15:39:12 +0800 Subject: [PATCH 0954/1562] drm/tests: Switch to kunit devices Kunit recently gained helpers to create test managed devices. This means that we no longer have to roll our own helpers in KMS and we can reuse them. Signed-off-by: Maxime Ripard Tested-by: David Gow Signed-off-by: David Gow Signed-off-by: Shuah Khan --- drivers/gpu/drm/tests/drm_kunit_helpers.c | 66 ++--------------------- 1 file changed, 3 insertions(+), 63 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_kunit_helpers.c b/drivers/gpu/drm/tests/drm_kunit_helpers.c index c251e6b34de0..ca4f8e4c5d5d 100644 --- a/drivers/gpu/drm/tests/drm_kunit_helpers.c +++ b/drivers/gpu/drm/tests/drm_kunit_helpers.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -15,28 +16,6 @@ static const struct drm_mode_config_funcs drm_mode_config_funcs = { }; -static int fake_probe(struct platform_device *pdev) -{ - return 0; -} - -static struct platform_driver fake_platform_driver = { - .probe = fake_probe, - .driver = { - .name = KUNIT_DEVICE_NAME, - }, -}; - -KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_driver_unregister, - platform_driver_unregister, - struct platform_driver *); -KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_put, - platform_device_put, - struct platform_device *); -KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_del, - platform_device_del, - struct platform_device *); - /** * drm_kunit_helper_alloc_device - Allocate a mock device for a KUnit test * @test: The test context object @@ -54,34 +33,7 @@ KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_del, */ struct device *drm_kunit_helper_alloc_device(struct kunit *test) { - struct platform_device *pdev; - int ret; - - ret = platform_driver_register(&fake_platform_driver); - KUNIT_ASSERT_EQ(test, ret, 0); - - ret = kunit_add_action_or_reset(test, - kunit_action_platform_driver_unregister, - &fake_platform_driver); - KUNIT_ASSERT_EQ(test, ret, 0); - - pdev = platform_device_alloc(KUNIT_DEVICE_NAME, PLATFORM_DEVID_NONE); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); - - ret = kunit_add_action_or_reset(test, - kunit_action_platform_device_put, - pdev); - KUNIT_ASSERT_EQ(test, ret, 0); - - ret = platform_device_add(pdev); - KUNIT_ASSERT_EQ(test, ret, 0); - - ret = kunit_add_action_or_reset(test, - kunit_action_platform_device_del, - pdev); - KUNIT_ASSERT_EQ(test, ret, 0); - - return &pdev->dev; + return kunit_device_register(test, KUNIT_DEVICE_NAME); } EXPORT_SYMBOL_GPL(drm_kunit_helper_alloc_device); @@ -94,19 +46,7 @@ EXPORT_SYMBOL_GPL(drm_kunit_helper_alloc_device); */ void drm_kunit_helper_free_device(struct kunit *test, struct device *dev) { - struct platform_device *pdev = to_platform_device(dev); - - kunit_release_action(test, - kunit_action_platform_device_del, - pdev); - - kunit_release_action(test, - kunit_action_platform_device_put, - pdev); - - kunit_release_action(test, - kunit_action_platform_driver_unregister, - &fake_platform_driver); + kunit_device_unregister(test, dev); } EXPORT_SYMBOL_GPL(drm_kunit_helper_free_device); From 2b61582acd19c1a3693b02f50b681a05236305ad Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 15 Dec 2023 16:13:26 +0100 Subject: [PATCH 0955/1562] kunit: Add example for using test->priv In a test->priv field the user can store arbitrary data. Add example how to use this feature in the test code. Signed-off-by: Michal Wajdeczko Cc: David Gow Cc: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/kunit-example-test.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/kunit/kunit-example-test.c b/lib/kunit/kunit-example-test.c index d2f7a3c62c18..359dbee10201 100644 --- a/lib/kunit/kunit-example-test.c +++ b/lib/kunit/kunit-example-test.c @@ -221,6 +221,20 @@ static void example_params_test(struct kunit *test) KUNIT_EXPECT_EQ(test, param->value % param->value, 0); } +/* + * This test shows the use of test->priv. + */ +static void example_priv_test(struct kunit *test) +{ + /* unless setup in suite->init(), test->priv is NULL */ + KUNIT_ASSERT_NULL(test, test->priv); + + /* but can be used to pass arbitrary data to other functions */ + test->priv = kunit_kzalloc(test, 1, GFP_KERNEL); + KUNIT_EXPECT_NOT_NULL(test, test->priv); + KUNIT_ASSERT_PTR_EQ(test, test->priv, kunit_get_current_test()->priv); +} + /* * This test should always pass. Can be used to practice filtering attributes. */ @@ -245,6 +259,7 @@ static struct kunit_case example_test_cases[] = { KUNIT_CASE(example_mark_skipped_test), KUNIT_CASE(example_all_expect_macros_test), KUNIT_CASE(example_static_stub_test), + KUNIT_CASE(example_priv_test), KUNIT_CASE_PARAM(example_params_test, example_gen_params), KUNIT_CASE_SLOW(example_slow_test), {} From 342fb9789267ee3908959bfa136b82e88e2ce918 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 15 Dec 2023 16:13:27 +0100 Subject: [PATCH 0956/1562] kunit: Reset test->priv after each param iteration If we run parameterized test that uses test->priv to prepare some custom data, then value of test->priv will leak to the next param iteration and may be unexpected. This could be easily seen if we promote example_priv_test to parameterized test as then only first test iteration will be successful: $ ./tools/testing/kunit/kunit.py run \ --kunitconfig ./lib/kunit/.kunitconfig *.example_priv* [ ] Starting KUnit Kernel (1/1)... [ ] ============================================================ [ ] =================== example (1 subtest) ==================== [ ] ==================== example_priv_test ==================== [ ] [PASSED] example value 3 [ ] # example_priv_test: initializing [ ] # example_priv_test: ASSERTION FAILED at lib/kunit/kunit-example-test.c:230 [ ] Expected test->priv == ((void *)0), but [ ] test->priv == 0000000060dfe290 [ ] ((void *)0) == 0000000000000000 [ ] # example_priv_test: cleaning up [ ] [FAILED] example value 2 [ ] # example_priv_test: initializing [ ] # example_priv_test: ASSERTION FAILED at lib/kunit/kunit-example-test.c:230 [ ] Expected test->priv == ((void *)0), but [ ] test->priv == 0000000060dfe290 [ ] ((void *)0) == 0000000000000000 [ ] # example_priv_test: cleaning up [ ] [FAILED] example value 1 [ ] # example_priv_test: initializing [ ] # example_priv_test: ASSERTION FAILED at lib/kunit/kunit-example-test.c:230 [ ] Expected test->priv == ((void *)0), but [ ] test->priv == 0000000060dfe290 [ ] ((void *)0) == 0000000000000000 [ ] # example_priv_test: cleaning up [ ] [FAILED] example value 0 [ ] # example_priv_test: initializing [ ] # example_priv_test: cleaning up [ ] # example_priv_test: pass:1 fail:3 skip:0 total:4 [ ] ================ [FAILED] example_priv_test ================ [ ] # example: initializing suite [ ] # module: kunit_example_test [ ] # example: exiting suite [ ] # Totals: pass:1 fail:3 skip:0 total:4 [ ] ===================== [FAILED] example ===================== Fix that by resetting test->priv after each param iteration, in similar way what we did for the test->status. Signed-off-by: Michal Wajdeczko Cc: David Gow Cc: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 14a77b2d702a..3a3d4ebb35db 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -664,6 +664,7 @@ int kunit_run_tests(struct kunit_suite *suite) test.param_index++; test.status = KUNIT_SUCCESS; test.status_comment[0] = '\0'; + test.priv = NULL; } } From aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Sat, 25 Nov 2023 02:41:58 +0530 Subject: [PATCH 0957/1562] PM / devfreq: Synchronize devfreq_monitor_[start/stop] There is a chance if a frequent switch of the governor done in a loop result in timer list corruption where timer cancel being done from two place one from cancel_delayed_work_sync() and followed by expire_timers() can be seen from the traces[1]. while true do echo "simple_ondemand" > /sys/class/devfreq/1d84000.ufshc/governor echo "performance" > /sys/class/devfreq/1d84000.ufshc/governor done It looks to be issue with devfreq driver where device_monitor_[start/stop] need to synchronized so that delayed work should get corrupted while it is either being queued or running or being cancelled. Let's use polling flag and devfreq lock to synchronize the queueing the timer instance twice and work data being corrupted. [1] ... .. -0 [003] 9436.209662: timer_cancel timer=0xffffff80444f0428 -0 [003] 9436.209664: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da1c function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da1c -0 [003] 9436.209718: timer_expire_exit timer=0xffffff80444f0428 kworker/u16:6-14217 [003] 9436.209863: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2b now=0x10022da1c flags=182452227 vendor.xxxyyy.ha-1593 [004] 9436.209888: timer_cancel timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216390: timer_init timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216392: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2c now=0x10022da1d flags=186646532 vendor.xxxyyy.ha-1593 [005] 9436.220992: timer_cancel timer=0xffffff80444f0428 xxxyyyTraceManag-7795 [004] 9436.261641: timer_cancel timer=0xffffff80444f0428 [2] 9436.261653][ C4] Unable to handle kernel paging request at virtual address dead00000000012a [ 9436.261664][ C4] Mem abort info: [ 9436.261666][ C4] ESR = 0x96000044 [ 9436.261669][ C4] EC = 0x25: DABT (current EL), IL = 32 bits [ 9436.261671][ C4] SET = 0, FnV = 0 [ 9436.261673][ C4] EA = 0, S1PTW = 0 [ 9436.261675][ C4] Data abort info: [ 9436.261677][ C4] ISV = 0, ISS = 0x00000044 [ 9436.261680][ C4] CM = 0, WnR = 1 [ 9436.261682][ C4] [dead00000000012a] address between user and kernel address ranges [ 9436.261685][ C4] Internal error: Oops: 96000044 [#1] PREEMPT SMP [ 9436.261701][ C4] Skip md ftrace buffer dump for: 0x3a982d0 ... [ 9436.262138][ C4] CPU: 4 PID: 7795 Comm: TraceManag Tainted: G S W O 5.10.149-android12-9-o-g17f915d29d0c #1 [ 9436.262141][ C4] Hardware name: Qualcomm Technologies, Inc. (DT) [ 9436.262144][ C4] pstate: 22400085 (nzCv daIf +PAN -UAO +TCO BTYPE=--) [ 9436.262161][ C4] pc : expire_timers+0x9c/0x438 [ 9436.262164][ C4] lr : expire_timers+0x2a4/0x438 [ 9436.262168][ C4] sp : ffffffc010023dd0 [ 9436.262171][ C4] x29: ffffffc010023df0 x28: ffffffd0636fdc18 [ 9436.262178][ C4] x27: ffffffd063569dd0 x26: ffffffd063536008 [ 9436.262182][ C4] x25: 0000000000000001 x24: ffffff88f7c69280 [ 9436.262185][ C4] x23: 00000000000000e0 x22: dead000000000122 [ 9436.262188][ C4] x21: 000000010022da29 x20: ffffff8af72b4e80 [ 9436.262191][ C4] x19: ffffffc010023e50 x18: ffffffc010025038 [ 9436.262195][ C4] x17: 0000000000000240 x16: 0000000000000201 [ 9436.262199][ C4] x15: ffffffffffffffff x14: ffffff889f3c3100 [ 9436.262203][ C4] x13: ffffff889f3c3100 x12: 00000000049f56b8 [ 9436.262207][ C4] x11: 00000000049f56b8 x10: 00000000ffffffff [ 9436.262212][ C4] x9 : ffffffc010023e50 x8 : dead000000000122 [ 9436.262216][ C4] x7 : ffffffffffffffff x6 : ffffffc0100239d8 [ 9436.262220][ C4] x5 : 0000000000000000 x4 : 0000000000000101 [ 9436.262223][ C4] x3 : 0000000000000080 x2 : ffffff889edc155c [ 9436.262227][ C4] x1 : ffffff8001005200 x0 : ffffff80444f0428 [ 9436.262232][ C4] Call trace: [ 9436.262236][ C4] expire_timers+0x9c/0x438 [ 9436.262240][ C4] __run_timers+0x1f0/0x330 [ 9436.262245][ C4] run_timer_softirq+0x28/0x58 [ 9436.262255][ C4] efi_header_end+0x168/0x5ec [ 9436.262265][ C4] __irq_exit_rcu+0x108/0x124 [ 9436.262274][ C4] __handle_domain_irq+0x118/0x1e4 [ 9436.262282][ C4] gic_handle_irq.30369+0x6c/0x2bc [ 9436.262286][ C4] el0_irq_naked+0x60/0x6c Link: https://lore.kernel.org/all/1700860318-4025-1-git-send-email-quic_mojha@quicinc.com/ Reported-by: Joyyoung Huang Acked-by: MyungJoo Ham Signed-off-by: Mukesh Ojha Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 017a87465776..98657d3b9435 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -461,10 +461,14 @@ static void devfreq_monitor(struct work_struct *work) if (err) dev_err(&devfreq->dev, "dvfs failed with (%d) error\n", err); + if (devfreq->stop_polling) + goto out; + queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); - mutex_unlock(&devfreq->lock); +out: + mutex_unlock(&devfreq->lock); trace_devfreq_monitor(devfreq); } @@ -483,6 +487,10 @@ void devfreq_monitor_start(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (delayed_work_pending(&devfreq->work)) + goto out; + switch (devfreq->profile->timer) { case DEVFREQ_TIMER_DEFERRABLE: INIT_DEFERRABLE_WORK(&devfreq->work, devfreq_monitor); @@ -491,12 +499,16 @@ void devfreq_monitor_start(struct devfreq *devfreq) INIT_DELAYED_WORK(&devfreq->work, devfreq_monitor); break; default: - return; + goto out; } if (devfreq->profile->polling_ms) queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); + +out: + devfreq->stop_polling = false; + mutex_unlock(&devfreq->lock); } EXPORT_SYMBOL(devfreq_monitor_start); @@ -513,6 +525,14 @@ void devfreq_monitor_stop(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (devfreq->stop_polling) { + mutex_unlock(&devfreq->lock); + return; + } + + devfreq->stop_polling = true; + mutex_unlock(&devfreq->lock); cancel_delayed_work_sync(&devfreq->work); } EXPORT_SYMBOL(devfreq_monitor_stop); From 9fcb0999345e94303a0514f2d2850246c11308f4 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 15 Dec 2023 10:21:35 +0200 Subject: [PATCH 0958/1562] mtd: spi-nor: print flash ID instead of name We saw flash ID collisions which make the flash name unreliable. Print the manufacturer and device ID instead of the flash name. Lower the print to dev_dbg to stop polluting the kernel log. Suggested-by: Miquel Raynal Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231215082138.16063-2-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 503fed90c2fa..ca5bd93d1f17 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -3558,8 +3558,8 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, /* No mtd_info fields should be used up to this point. */ spi_nor_set_mtd_info(nor); - dev_info(dev, "%s (%lld Kbytes)\n", info->name, - (long long)mtd->size >> 10); + dev_dbg(dev, "Manufacturer and device ID: %*phN\n", + SPI_NOR_MAX_ID_LEN, nor->id); dev_dbg(dev, "mtd .name = %s, .size = 0x%llx (%lldMiB), " From 15eb8303bb424e36a7f5f411c6bb489bc0c1c06e Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 15 Dec 2023 10:21:36 +0200 Subject: [PATCH 0959/1562] mtd: spi-nor: mark the flash name as obsolete The flash name is unreliable as we saw flash ID collisions. Mark the name as obsolete. Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231215082138.16063-3-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h index 29ed67725b18..d36c0e072954 100644 --- a/drivers/mtd/spi-nor/core.h +++ b/drivers/mtd/spi-nor/core.h @@ -465,7 +465,7 @@ struct spi_nor_id { * struct flash_info - SPI NOR flash_info entry. * @id: pointer to struct spi_nor_id or NULL, which means "no ID" (mostly * older chips). - * @name: the name of the flash. + * @name: (obsolete) the name of the flash. Do not set it for new additions. * @size: the size of the flash in bytes. * @sector_size: (optional) the size listed here is what works with * SPINOR_OP_SE, which isn't necessarily called a "sector" by From fc2efaf90a4538781aac26cf44d705c1d93fb9f5 Mon Sep 17 00:00:00 2001 From: JaimeLiao Date: Fri, 15 Dec 2023 10:21:37 +0200 Subject: [PATCH 0960/1562] mtd: spi-nor: sysfs: hide the flash name if not set The flash name is not reliable as we saw flash ID collisions. Hide the flash name if not set. Signed-off-by: JaimeLiao Reviewed-by: Michael Walle [ta: update commit subject and description and the sysfs description] Link: https://lore.kernel.org/r/20231215082138.16063-4-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor | 3 +++ drivers/mtd/spi-nor/sysfs.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor b/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor index c800621eff95..9ed5582ddea2 100644 --- a/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor +++ b/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor @@ -25,6 +25,9 @@ KernelVersion: 5.14 Contact: linux-mtd@lists.infradead.org Description: (RO) Part name of the SPI NOR flash. + The attribute is optional. User space should not rely on + it to be present or even correct. Instead, user space + should read the jedec_id attribute. What: /sys/bus/spi/devices/.../spi-nor/sfdp Date: April 2021 diff --git a/drivers/mtd/spi-nor/sysfs.c b/drivers/mtd/spi-nor/sysfs.c index 2dfdc555a69f..96064e4babf0 100644 --- a/drivers/mtd/spi-nor/sysfs.c +++ b/drivers/mtd/spi-nor/sysfs.c @@ -78,6 +78,8 @@ static umode_t spi_nor_sysfs_is_visible(struct kobject *kobj, if (attr == &dev_attr_manufacturer.attr && !nor->manufacturer) return 0; + if (attr == &dev_attr_partname.attr && !nor->info->name) + return 0; if (attr == &dev_attr_jedec_id.attr && !nor->info->id && !nor->id) return 0; From fe18e22fa779718b7ec0effe8d5f10d86a124e31 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 15 Dec 2023 10:21:38 +0200 Subject: [PATCH 0961/1562] mtd: spi-nor: drop superfluous debug prints The mtd data shall be obtained with the mtd ioctls or with new debugfs entries if one cares. Drop the debug prints. Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231215082138.16063-5-tudor.ambarus@linaro.org Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/core.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index ca5bd93d1f17..92c992eb73d5 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -3492,9 +3492,7 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, { const struct flash_info *info; struct device *dev = nor->dev; - struct mtd_info *mtd = &nor->mtd; int ret; - int i; ret = spi_nor_check(nor); if (ret) @@ -3561,22 +3559,6 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, dev_dbg(dev, "Manufacturer and device ID: %*phN\n", SPI_NOR_MAX_ID_LEN, nor->id); - dev_dbg(dev, - "mtd .name = %s, .size = 0x%llx (%lldMiB), " - ".erasesize = 0x%.8x (%uKiB) .numeraseregions = %d\n", - mtd->name, (long long)mtd->size, (long long)(mtd->size >> 20), - mtd->erasesize, mtd->erasesize / 1024, mtd->numeraseregions); - - if (mtd->numeraseregions) - for (i = 0; i < mtd->numeraseregions; i++) - dev_dbg(dev, - "mtd.eraseregions[%d] = { .offset = 0x%llx, " - ".erasesize = 0x%.8x (%uKiB), " - ".numblocks = %d }\n", - i, (long long)mtd->eraseregions[i].offset, - mtd->eraseregions[i].erasesize, - mtd->eraseregions[i].erasesize / 1024, - mtd->eraseregions[i].numblocks); return 0; } EXPORT_SYMBOL_GPL(spi_nor_scan); From 174a0c565cea74a7811ff79fbee1b70247570ade Mon Sep 17 00:00:00 2001 From: Wang Yao Date: Tue, 19 Dec 2023 17:14:05 +0800 Subject: [PATCH 0962/1562] efi/loongarch: Directly position the loaded image file The use of the 'kernel_offset' variable to position the image file that has been loaded by UEFI or GRUB is unnecessary, because we can directly position the loaded image file through using the image_base field of the efi_loaded_image struct provided by UEFI. Replace kernel_offset with image_base to position the image file that has been loaded by UEFI or GRUB. Signed-off-by: Wang Yao Signed-off-by: Ard Biesheuvel --- arch/loongarch/include/asm/efi.h | 2 -- arch/loongarch/kernel/head.S | 1 - arch/loongarch/kernel/image-vars.h | 1 - arch/loongarch/kernel/vmlinux.lds.S | 1 - drivers/firmware/efi/libstub/loongarch-stub.c | 9 +++++---- drivers/firmware/efi/libstub/loongarch-stub.h | 4 ++++ drivers/firmware/efi/libstub/loongarch.c | 6 ++++-- 7 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 drivers/firmware/efi/libstub/loongarch-stub.h diff --git a/arch/loongarch/include/asm/efi.h b/arch/loongarch/include/asm/efi.h index 91d81f9730ab..eddc8e79b3fa 100644 --- a/arch/loongarch/include/asm/efi.h +++ b/arch/loongarch/include/asm/efi.h @@ -32,6 +32,4 @@ static inline unsigned long efi_get_kimg_min_align(void) #define EFI_KIMG_PREFERRED_ADDRESS PHYSADDR(VMLINUX_LOAD_ADDRESS) -unsigned long kernel_entry_address(unsigned long kernel_addr); - #endif /* _ASM_LOONGARCH_EFI_H */ diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 53b883db0786..0ecab4216392 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -34,7 +34,6 @@ pe_header: SYM_DATA(kernel_asize, .long _kernel_asize); SYM_DATA(kernel_fsize, .long _kernel_fsize); -SYM_DATA(kernel_offset, .long _kernel_offset); #endif diff --git a/arch/loongarch/kernel/image-vars.h b/arch/loongarch/kernel/image-vars.h index 5087416b9678..41ddcf56d21c 100644 --- a/arch/loongarch/kernel/image-vars.h +++ b/arch/loongarch/kernel/image-vars.h @@ -11,7 +11,6 @@ __efistub_strcmp = strcmp; __efistub_kernel_entry = kernel_entry; __efistub_kernel_asize = kernel_asize; __efistub_kernel_fsize = kernel_fsize; -__efistub_kernel_offset = kernel_offset; #if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) __efistub_screen_info = screen_info; #endif diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index bb2ec86f37a8..a5d0cd2035da 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -143,7 +143,6 @@ SECTIONS _kernel_fsize = _edata - _text; _kernel_vsize = _end - __initdata_begin; _kernel_rsize = _edata - __initdata_begin; - _kernel_offset = kernel_offset - _text; #endif .gptab.sdata : { diff --git a/drivers/firmware/efi/libstub/loongarch-stub.c b/drivers/firmware/efi/libstub/loongarch-stub.c index d6ec5d4b8dbe..736b6aae323d 100644 --- a/drivers/firmware/efi/libstub/loongarch-stub.c +++ b/drivers/firmware/efi/libstub/loongarch-stub.c @@ -8,10 +8,10 @@ #include #include #include "efistub.h" +#include "loongarch-stub.h" extern int kernel_asize; extern int kernel_fsize; -extern int kernel_offset; extern int kernel_entry; efi_status_t handle_kernel_image(unsigned long *image_addr, @@ -24,7 +24,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, efi_status_t status; unsigned long kernel_addr = 0; - kernel_addr = (unsigned long)&kernel_offset - kernel_offset; + kernel_addr = (unsigned long)image->image_base; status = efi_relocate_kernel(&kernel_addr, kernel_fsize, kernel_asize, EFI_KIMG_PREFERRED_ADDRESS, efi_get_kimg_min_align(), 0x0); @@ -35,9 +35,10 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, return status; } -unsigned long kernel_entry_address(unsigned long kernel_addr) +unsigned long kernel_entry_address(unsigned long kernel_addr, + efi_loaded_image_t *image) { - unsigned long base = (unsigned long)&kernel_offset - kernel_offset; + unsigned long base = (unsigned long)image->image_base; return (unsigned long)&kernel_entry - base + kernel_addr; } diff --git a/drivers/firmware/efi/libstub/loongarch-stub.h b/drivers/firmware/efi/libstub/loongarch-stub.h new file mode 100644 index 000000000000..cd015955a015 --- /dev/null +++ b/drivers/firmware/efi/libstub/loongarch-stub.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +unsigned long kernel_entry_address(unsigned long kernel_addr, + efi_loaded_image_t *image); diff --git a/drivers/firmware/efi/libstub/loongarch.c b/drivers/firmware/efi/libstub/loongarch.c index 0e0aa6cda73f..684c9354637c 100644 --- a/drivers/firmware/efi/libstub/loongarch.c +++ b/drivers/firmware/efi/libstub/loongarch.c @@ -8,6 +8,7 @@ #include #include #include "efistub.h" +#include "loongarch-stub.h" typedef void __noreturn (*kernel_entry_t)(bool efi, unsigned long cmdline, unsigned long systab); @@ -37,7 +38,8 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, void *priv) return EFI_SUCCESS; } -unsigned long __weak kernel_entry_address(unsigned long kernel_addr) +unsigned long __weak kernel_entry_address(unsigned long kernel_addr, + efi_loaded_image_t *image) { return *(unsigned long *)(kernel_addr + 8) - VMLINUX_LOAD_ADDRESS + kernel_addr; } @@ -73,7 +75,7 @@ efi_status_t efi_boot_kernel(void *handle, efi_loaded_image_t *image, csr_write64(CSR_DMW0_INIT, LOONGARCH_CSR_DMWIN0); csr_write64(CSR_DMW1_INIT, LOONGARCH_CSR_DMWIN1); - real_kernel_entry = (void *)kernel_entry_address(kernel_addr); + real_kernel_entry = (void *)kernel_entry_address(kernel_addr, image); real_kernel_entry(true, (unsigned long)cmdline_ptr, (unsigned long)efi_system_table); From 5be50eb5ae99d890cc22ab49753330cce8731599 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 15 Nov 2023 15:02:29 -0600 Subject: [PATCH 0963/1562] ipmi: si: Use device_get_match_data() Use preferred device_get_match_data() instead of of_match_device() to get the driver match data. With this, adjust the includes to explicitly include the correct headers. Signed-off-by: Rob Herring Message-Id: <20231115210230.3744198-1-robh@kernel.org> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_platform.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_platform.c b/drivers/char/ipmi/ipmi_si_platform.c index c3d8ac7873ba..cd2edd8f8a03 100644 --- a/drivers/char/ipmi/ipmi_si_platform.c +++ b/drivers/char/ipmi/ipmi_si_platform.c @@ -11,10 +11,11 @@ #include #include -#include -#include +#include #include #include +#include +#include #include #include "ipmi_si.h" #include "ipmi_dmi.h" @@ -224,7 +225,6 @@ MODULE_DEVICE_TABLE(of, of_ipmi_match); static int of_ipmi_probe(struct platform_device *pdev) { - const struct of_device_id *match; struct si_sm_io io; struct resource resource; const __be32 *regsize, *regspacing, *regshift; @@ -237,10 +237,6 @@ static int of_ipmi_probe(struct platform_device *pdev) dev_info(&pdev->dev, "probing via device tree\n"); - match = of_match_device(of_ipmi_match, &pdev->dev); - if (!match) - return -ENODEV; - if (!of_device_is_available(np)) return -EINVAL; @@ -269,7 +265,7 @@ static int of_ipmi_probe(struct platform_device *pdev) } memset(&io, 0, sizeof(io)); - io.si_type = (unsigned long) match->data; + io.si_type = (enum si_type)device_get_match_data(&pdev->dev); io.addr_source = SI_DEVICETREE; io.irq_setup = ipmi_std_irq_setup; From 242c6fd473a6a74eac4d4002be715a0d0dede036 Mon Sep 17 00:00:00 2001 From: Emilio Perez Date: Wed, 22 Nov 2023 20:34:28 +0000 Subject: [PATCH 0964/1562] ipmi: Use regspacings passed as a module parameter regspacings parameter is currently ignored and the platform data uses a default value of 0, this has been fixed by setting the appropriate field in the platform data. Fixes: 3cd83bac481d ("ipmi: Consolidate the adding of platform devices") Signed-off-by: Emilio Perez Message-Id: <20231122203433.443098-1-emiliopeju@gmail.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_hardcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_hardcode.c b/drivers/char/ipmi/ipmi_si_hardcode.c index ed5e91b1e040..0c92fa3eee88 100644 --- a/drivers/char/ipmi/ipmi_si_hardcode.c +++ b/drivers/char/ipmi/ipmi_si_hardcode.c @@ -80,10 +80,10 @@ static void __init ipmi_hardcode_init_one(const char *si_type_str, } p.regsize = regsizes[i]; + p.regspacing = regspacings[i]; p.slave_addr = slave_addrs[i]; p.addr_source = SI_HARDCODED; p.regshift = regshifts[i]; - p.regsize = regsizes[i]; p.addr = addr; p.space = addr_space; From 9bd9fbd9032a3b7e9ea916d6e58ba0116e0621be Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 19 Dec 2023 06:00:39 +0100 Subject: [PATCH 0965/1562] ipmi: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Message-Id: Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index d6f14279684d..b0eedc4595b3 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3053,7 +3053,7 @@ static void cleanup_bmc_work(struct work_struct *work) int id = bmc->pdev.id; /* Unregister overwrites id */ platform_device_unregister(&bmc->pdev); - ida_simple_remove(&ipmi_bmc_ida, id); + ida_free(&ipmi_bmc_ida, id); } static void @@ -3169,7 +3169,7 @@ static int __ipmi_bmc_register(struct ipmi_smi *intf, bmc->pdev.name = "ipmi_bmc"; - rv = ida_simple_get(&ipmi_bmc_ida, 0, 0, GFP_KERNEL); + rv = ida_alloc(&ipmi_bmc_ida, GFP_KERNEL); if (rv < 0) { kfree(bmc); goto out; From d939c02359a656a624d03c6f14ccadae4a1c66ac Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Dec 2023 08:57:49 +0100 Subject: [PATCH 0966/1562] dt-bindings: regulator: qcom,usb-vbus-regulator: clean up example Devicetree node names should be generic; fix up the qcom,usb-vbus-regulator binding example accordingly. While at it, drop an unnecessary label and add a newline separator before the child node to improve readability. Signed-off-by: Johan Hovold Link: https://msgid.link/r/20231219075749.25308-1-johan+linaro@kernel.org Signed-off-by: Mark Brown --- .../bindings/regulator/qcom,usb-vbus-regulator.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml index 89c564dfa5db..534f87e98716 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml @@ -36,10 +36,11 @@ unevaluatedProperties: false examples: - | - pm8150b { + pmic { #address-cells = <1>; #size-cells = <0>; - pm8150b_vbus: usb-vbus-regulator@1100 { + + usb-vbus-regulator@1100 { compatible = "qcom,pm8150b-vbus-reg"; reg = <0x1100>; regulator-min-microamp = <500000>; From bdd7c5a5afdfad7b92178bba518e6ff40191eb09 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Thu, 7 Dec 2023 08:38:12 -0500 Subject: [PATCH 0967/1562] MAINTAINERS: Add Roberto Sassu as co-maintainer to IMA and EVM Roberto Sassu has been actively involved in IMA and EVM since 2011. His first major IMA contribution was IMA template support. He also contributed extending TPM 2.0 PCRs with properly calculated per TPM bank digests and included file metadata information in the IMA measurement list. Regarding EVM, Roberto contributed to making EVM portable and immutable signatures more usable. He also prepared the LSM infrastructure to support EVM as a fully fledged LSM, by ensuring that the latter receives from the former all xattrs provided by other registered LSMs at inode creation time, for HMAC calculation. Roberto is currently working on making IMA and EVM full fledged LSMs. Add Roberto as an IMA and EVM maintainer. Acked-by: Roberto Sassu Signed-off-by: Mimi Zohar --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 012df8ccf34e..ffaac404d1e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7977,6 +7977,7 @@ F: include/uapi/linux/ext4.h Extended Verification Module (EVM) M: Mimi Zohar +M: Roberto Sassu L: linux-integrity@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git @@ -10554,6 +10555,7 @@ F: drivers/crypto/inside-secure/ INTEGRITY MEASUREMENT ARCHITECTURE (IMA) M: Mimi Zohar +M: Roberto Sassu M: Dmitry Kasatkin L: linux-integrity@vger.kernel.org S: Supported From 4e8daa792742635ea57c625098165eef64661901 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 8 Dec 2023 11:13:22 -0500 Subject: [PATCH 0968/1562] MAINTAINERS: Add Eric Snowberg as a reviewer to IMA Digital signature based IMA-appraisal relies heavily on kernel keyrings. Eric Snowberg has been involved in adding the machine keyring to allow the system owner to add their own keys. With this addition, IMA-appraisal usage can be extended to allow loading local and 3rd party software keys onto the IMA keyring. Add Eric as a reviewer. Acked-by: Eric Snowberg Signed-off-by: Mimi Zohar --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ffaac404d1e0..4dbf1cc238c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10557,6 +10557,7 @@ INTEGRITY MEASUREMENT ARCHITECTURE (IMA) M: Mimi Zohar M: Roberto Sassu M: Dmitry Kasatkin +R: Eric Snowberg L: linux-integrity@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git From 41dd6822949ec6e83416a0e245f32a726110056a Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:15 +0100 Subject: [PATCH 0969/1562] platform/x86: wmi: Remove unused variable in address space handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable "i" is always zero and only used in shift operations. Remove it to make the code more readable. Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-2-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 7303702290e5..906d3a2831ae 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1144,7 +1144,7 @@ acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address, u32 bits, u64 *value, void *handler_context, void *region_context) { - int result = 0, i = 0; + int result = 0; u8 temp = 0; if ((address > 0xFF) || !value) @@ -1158,9 +1158,9 @@ acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address, if (function == ACPI_READ) { result = ec_read(address, &temp); - (*value) |= ((u64)temp) << i; + *value = temp; } else { - temp = 0xff & ((*value) >> i); + temp = 0xff & *value; result = ec_write(address, temp); } From 22574e17626391ad969af9a13aaa58a1b37ad384 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:16 +0100 Subject: [PATCH 0970/1562] platform/x86: wmi: Remove ACPI handlers after WMI devices When removing the ACPI notify/address space handlers, the WMI devices are still active and might still depend on ACPI EC access or WMI events. Fix this by removing the ACPI handlers after all WMI devices associated with an ACPI device have been removed. Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-3-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 906d3a2831ae..2120c13e1676 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1239,13 +1239,12 @@ static void acpi_wmi_remove(struct platform_device *device) struct acpi_device *acpi_device = ACPI_COMPANION(&device->dev); struct device *wmi_bus_device = dev_get_drvdata(&device->dev); - acpi_remove_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, - acpi_wmi_notify_handler); - acpi_remove_address_space_handler(acpi_device->handle, - ACPI_ADR_SPACE_EC, &acpi_wmi_ec_space_handler); - device_for_each_child_reverse(wmi_bus_device, NULL, wmi_remove_device); device_unregister(wmi_bus_device); + + acpi_remove_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, acpi_wmi_notify_handler); + acpi_remove_address_space_handler(acpi_device->handle, ACPI_ADR_SPACE_EC, + &acpi_wmi_ec_space_handler); } static int acpi_wmi_probe(struct platform_device *device) From 08e7f4d61d3f043f32e35c224898d0869f8b1530 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:17 +0100 Subject: [PATCH 0971/1562] platform/x86: wmi: Use devres for resource handling Use devres for cleaning up the ACPI handlers and the WMI bus device to simplify the error handling. Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-4-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 58 +++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 2120c13e1676..4bc5da70c1b0 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1236,17 +1236,33 @@ static int wmi_remove_device(struct device *dev, void *data) static void acpi_wmi_remove(struct platform_device *device) { - struct acpi_device *acpi_device = ACPI_COMPANION(&device->dev); struct device *wmi_bus_device = dev_get_drvdata(&device->dev); device_for_each_child_reverse(wmi_bus_device, NULL, wmi_remove_device); - device_unregister(wmi_bus_device); +} + +static void acpi_wmi_remove_notify_handler(void *data) +{ + struct acpi_device *acpi_device = data; acpi_remove_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, acpi_wmi_notify_handler); +} + +static void acpi_wmi_remove_address_space_handler(void *data) +{ + struct acpi_device *acpi_device = data; + acpi_remove_address_space_handler(acpi_device->handle, ACPI_ADR_SPACE_EC, &acpi_wmi_ec_space_handler); } +static void acpi_wmi_remove_bus_device(void *data) +{ + struct device *wmi_bus_dev = data; + + device_unregister(wmi_bus_dev); +} + static int acpi_wmi_probe(struct platform_device *device) { struct acpi_device *acpi_device; @@ -1268,6 +1284,10 @@ static int acpi_wmi_probe(struct platform_device *device) dev_err(&device->dev, "Error installing EC region handler\n"); return -ENODEV; } + error = devm_add_action_or_reset(&device->dev, acpi_wmi_remove_address_space_handler, + acpi_device); + if (error < 0) + return error; status = acpi_install_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, @@ -1275,39 +1295,31 @@ static int acpi_wmi_probe(struct platform_device *device) NULL); if (ACPI_FAILURE(status)) { dev_err(&device->dev, "Error installing notify handler\n"); - error = -ENODEV; - goto err_remove_ec_handler; + return -ENODEV; } + error = devm_add_action_or_reset(&device->dev, acpi_wmi_remove_notify_handler, + acpi_device); + if (error < 0) + return error; wmi_bus_dev = device_create(&wmi_bus_class, &device->dev, MKDEV(0, 0), NULL, "wmi_bus-%s", dev_name(&device->dev)); - if (IS_ERR(wmi_bus_dev)) { - error = PTR_ERR(wmi_bus_dev); - goto err_remove_notify_handler; - } + if (IS_ERR(wmi_bus_dev)) + return PTR_ERR(wmi_bus_dev); + + error = devm_add_action_or_reset(&device->dev, acpi_wmi_remove_bus_device, wmi_bus_dev); + if (error < 0) + return error; + dev_set_drvdata(&device->dev, wmi_bus_dev); error = parse_wdg(wmi_bus_dev, device); if (error) { pr_err("Failed to parse WDG method\n"); - goto err_remove_busdev; + return error; } return 0; - -err_remove_busdev: - device_unregister(wmi_bus_dev); - -err_remove_notify_handler: - acpi_remove_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, - acpi_wmi_notify_handler); - -err_remove_ec_handler: - acpi_remove_address_space_handler(acpi_device->handle, - ACPI_ADR_SPACE_EC, - &acpi_wmi_ec_space_handler); - - return error; } int __must_check __wmi_driver_register(struct wmi_driver *driver, From 095fa72a19f13b15629611d52447cb17ce223bcd Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:18 +0100 Subject: [PATCH 0972/1562] platform/x86: wmi: Create WMI bus device first Create the WMI bus device first so that it can be used by the ACPI handlers. Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-5-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 4bc5da70c1b0..e2bfdc61c4ce 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1276,6 +1276,17 @@ static int acpi_wmi_probe(struct platform_device *device) return -ENODEV; } + wmi_bus_dev = device_create(&wmi_bus_class, &device->dev, MKDEV(0, 0), NULL, "wmi_bus-%s", + dev_name(&device->dev)); + if (IS_ERR(wmi_bus_dev)) + return PTR_ERR(wmi_bus_dev); + + error = devm_add_action_or_reset(&device->dev, acpi_wmi_remove_bus_device, wmi_bus_dev); + if (error < 0) + return error; + + dev_set_drvdata(&device->dev, wmi_bus_dev); + status = acpi_install_address_space_handler(acpi_device->handle, ACPI_ADR_SPACE_EC, &acpi_wmi_ec_space_handler, @@ -1302,17 +1313,6 @@ static int acpi_wmi_probe(struct platform_device *device) if (error < 0) return error; - wmi_bus_dev = device_create(&wmi_bus_class, &device->dev, MKDEV(0, 0), - NULL, "wmi_bus-%s", dev_name(&device->dev)); - if (IS_ERR(wmi_bus_dev)) - return PTR_ERR(wmi_bus_dev); - - error = devm_add_action_or_reset(&device->dev, acpi_wmi_remove_bus_device, wmi_bus_dev); - if (error < 0) - return error; - - dev_set_drvdata(&device->dev, wmi_bus_dev); - error = parse_wdg(wmi_bus_dev, device); if (error) { pr_err("Failed to parse WDG method\n"); From 2c933755eaaa82fffe0201f376713fa2070b9428 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:19 +0100 Subject: [PATCH 0973/1562] platform/x86: wmi: Decouple ACPI notify handler from wmi_block_list Currently, the ACPI notify handler searches all WMI devices for a matching WMI event device. This is inefficient since only WMI devices associated with the notified ACPI device need to be searched. Use the WMI bus device and device_for_each_child() to search for a matching WMI event device instead. Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-6-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 46 +++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index e2bfdc61c4ce..559a99ebc624 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1176,24 +1176,13 @@ acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address, } } -static void acpi_wmi_notify_handler(acpi_handle handle, u32 event, - void *context) +static int wmi_notify_device(struct device *dev, void *data) { - struct wmi_block *wblock = NULL, *iter; + struct wmi_block *wblock = dev_to_wblock(dev); + u32 *event = data; - list_for_each_entry(iter, &wmi_block_list, list) { - struct guid_block *block = &iter->gblock; - - if (iter->acpi_device->handle == handle && - (block->flags & ACPI_WMI_EVENT) && - (block->notify_id == event)) { - wblock = iter; - break; - } - } - - if (!wblock) - return; + if (!(wblock->gblock.flags & ACPI_WMI_EVENT && wblock->gblock.notify_id == *event)) + return 0; /* If a driver is bound, then notify the driver. */ if (test_bit(WMI_PROBED, &wblock->flags) && wblock->dev.dev.driver) { @@ -1205,7 +1194,7 @@ static void acpi_wmi_notify_handler(acpi_handle handle, u32 event, status = get_event_data(wblock, &evdata); if (ACPI_FAILURE(status)) { dev_warn(&wblock->dev.dev, "failed to get event data\n"); - return; + return -EIO; } } @@ -1215,13 +1204,20 @@ static void acpi_wmi_notify_handler(acpi_handle handle, u32 event, kfree(evdata.pointer); } else if (wblock->handler) { /* Legacy handler */ - wblock->handler(event, wblock->handler_data); + wblock->handler(*event, wblock->handler_data); } - acpi_bus_generate_netlink_event( - wblock->acpi_device->pnp.device_class, - dev_name(&wblock->dev.dev), - event, 0); + acpi_bus_generate_netlink_event(wblock->acpi_device->pnp.device_class, + dev_name(&wblock->dev.dev), *event, 0); + + return -EBUSY; +} + +static void acpi_wmi_notify_handler(acpi_handle handle, u32 event, void *context) +{ + struct device *wmi_bus_dev = context; + + device_for_each_child(wmi_bus_dev, &event, wmi_notify_device); } static int wmi_remove_device(struct device *dev, void *data) @@ -1300,10 +1296,8 @@ static int acpi_wmi_probe(struct platform_device *device) if (error < 0) return error; - status = acpi_install_notify_handler(acpi_device->handle, - ACPI_ALL_NOTIFY, - acpi_wmi_notify_handler, - NULL); + status = acpi_install_notify_handler(acpi_device->handle, ACPI_ALL_NOTIFY, + acpi_wmi_notify_handler, wmi_bus_dev); if (ACPI_FAILURE(status)) { dev_err(&device->dev, "Error installing notify handler\n"); return -ENODEV; From bd142914f805b88dcb15acaab9fbc9bea666dd32 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 18 Dec 2023 20:24:20 +0100 Subject: [PATCH 0974/1562] platform/x86: wmi: Simplify get_subobj_info() All callers who call get_subobj_info() with **info being NULL should better use acpi_has_method() instead. Convert the only caller who does this to acpi_has_method() to drop the dummy info handling. Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231218192420.305411-7-W_Armin@gmx.de Signed-off-by: Hans de Goede --- drivers/platform/x86/wmi.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 559a99ebc624..a7cfcbf92432 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -132,23 +132,19 @@ static const void *find_guid_context(struct wmi_block *wblock, static int get_subobj_info(acpi_handle handle, const char *pathname, struct acpi_device_info **info) { - struct acpi_device_info *dummy_info, **info_ptr; acpi_handle subobj_handle; acpi_status status; - status = acpi_get_handle(handle, (char *)pathname, &subobj_handle); + status = acpi_get_handle(handle, pathname, &subobj_handle); if (status == AE_NOT_FOUND) return -ENOENT; - else if (ACPI_FAILURE(status)) - return -EIO; - info_ptr = info ? info : &dummy_info; - status = acpi_get_object_info(subobj_handle, info_ptr); if (ACPI_FAILURE(status)) return -EIO; - if (!info) - kfree(dummy_info); + status = acpi_get_object_info(subobj_handle, info); + if (ACPI_FAILURE(status)) + return -EIO; return 0; } @@ -998,9 +994,7 @@ static int wmi_create_device(struct device *wmi_bus_dev, kfree(info); get_acpi_method_name(wblock, 'S', method); - result = get_subobj_info(device->handle, method, NULL); - - if (result == 0) + if (acpi_has_method(device->handle, method)) wblock->dev.setable = true; out_init: From 1f5e56c9f6cc92c45d27adfe78fb54c716fed2e2 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:09 -0800 Subject: [PATCH 0975/1562] platform/x86/intel/pmc: Fix in mtl_punit_pmt_init() pci_get_domain_bus_and_slot() increases the reference count on the pci device that is used to register the endpoint. In case of failure in registration, decrease reference count using pci_dev_put(pcidev) before returning. Fixes: 6e7964855381 ("platform/x86/intel/pmc: Show Die C6 counter on Meteor Lake") Signed-off-by: Rajvi Jingar Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20231219042216.2592029-1-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/mtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index 38c2f946ec23..fb59dffccf28 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -985,6 +985,7 @@ static void mtl_punit_pmt_init(struct pmc_dev *pmcdev) } ep = pmt_telem_find_and_register_endpoint(pcidev, MTL_PMT_DMU_GUID, 0); + pci_dev_put(pcidev); if (IS_ERR(ep)) { dev_err(&pmcdev->pdev->dev, "pmc_core: couldn't get DMU telem endpoint, %ld\n", @@ -992,7 +993,6 @@ static void mtl_punit_pmt_init(struct pmc_dev *pmcdev) return; } - pci_dev_put(pcidev); pmcdev->punit_ep = ep; pmcdev->has_die_c6 = true; From b6258fa2c7b3dd23e362801410f171567d0d16af Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:10 -0800 Subject: [PATCH 0976/1562] platform/x86/intel/pmc: Add PSON residency counter Tiger Lake platform onwards, devices have the capability to track the duration of time that their Power Supply Units (PSUs) are turned off during S0ix. This patch adds a debugfs file `pson_residency_usec` to provide access to this counter. Signed-off-by: Michael Bottini Signed-off-by: Rajvi Jingar Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20231219042216.2592029-2-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.c | 37 +++++++++++++++++++++++++++ drivers/platform/x86/intel/pmc/core.h | 2 ++ 2 files changed, 39 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 983e3a8f4910..91e5e500eb41 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -208,6 +208,20 @@ static int pmc_core_dev_state_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(pmc_core_dev_state, pmc_core_dev_state_get, NULL, "%llu\n"); +static int pmc_core_pson_residency_get(void *data, u64 *val) +{ + struct pmc *pmc = data; + const struct pmc_reg_map *map = pmc->map; + u32 value; + + value = pmc_core_reg_read(pmc, map->pson_residency_offset); + *val = (u64)value * map->pson_residency_counter_step; + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(pmc_core_pson_residency, pmc_core_pson_residency_get, NULL, "%llu\n"); + static int pmc_core_check_read_lock_bit(struct pmc *pmc) { u32 value; @@ -1092,6 +1106,24 @@ int get_primary_reg_base(struct pmc *pmc) return 0; } +static bool pmc_core_is_pson_residency_enabled(struct pmc_dev *pmcdev) +{ + struct platform_device *pdev = pmcdev->pdev; + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + u8 val; + + if (!adev) + return false; + + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), + "intel-cec-pson-switching-enabled-in-s0", + &val)) + return false; + + return val == 1; +} + + static void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev) { debugfs_remove_recursive(pmcdev->dbgfs_dir); @@ -1162,6 +1194,11 @@ static void pmc_core_dbgfs_register(struct pmc_dev *pmcdev) &pmc_core_substate_req_regs_fops); } + if (primary_pmc->map->pson_residency_offset && pmc_core_is_pson_residency_enabled(pmcdev)) { + debugfs_create_file("pson_residency_usec", 0444, + pmcdev->dbgfs_dir, primary_pmc, &pmc_core_pson_residency); + } + if (pmcdev->has_die_c6) { debugfs_create_file("die_c6_us_show", 0444, pmcdev->dbgfs_dir, pmcdev, diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index 6d7673145f90..91cb34a6505c 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -323,6 +323,8 @@ struct pmc_reg_map { const u32 lpm_live_status_offset; const u32 etr3_offset; const u8 *lpm_reg_index; + const u32 pson_residency_offset; + const u32 pson_residency_counter_step; }; /** From 544f7b7f651cf5745f3a1f3d28b298ee2b128eb1 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:11 -0800 Subject: [PATCH 0977/1562] platform/x86/intel/pmc: Add regmap for Tiger Lake H PCH Tiger Lake H PCH is same as Tiger Lake LP PCH from the driver perspective with the addition of the PSON residency counter. Add regmap for TGP H to add PSON register offsets for Tiger Lake H PCH. Signed-off-by: Rajvi Jingar Link: https://lore.kernel.org/r/20231219042216.2592029-3-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.c | 10 +++--- drivers/platform/x86/intel/pmc/core.h | 6 ++++ drivers/platform/x86/intel/pmc/tgl.c | 48 ++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 91e5e500eb41..e95105ad1243 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -1216,15 +1216,15 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, icl_core_init), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, cnp_core_init), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, cnp_core_init), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, tgl_core_init), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, tgl_l_core_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, tgl_core_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, tgl_core_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, tgl_l_core_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, icl_core_init), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, tgl_core_init), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, tgl_core_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, tgl_core_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, tgl_l_core_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, tgl_l_core_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, adl_core_init), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, tgl_core_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, tgl_l_core_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, adl_core_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, adl_core_init), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, mtl_core_init), diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index 91cb34a6505c..d09962940ad6 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -223,6 +223,10 @@ enum ppfear_regs { #define TGL_LPM_PRI_OFFSET 0x1C7C #define TGL_LPM_NUM_MAPS 6 +/* Tigerlake PSON residency register */ +#define TGL_PSON_RESIDENCY_OFFSET 0x18f8 +#define TGL_PSON_RES_COUNTER_STEP 0x7A + /* Extended Test Mode Register 3 (CNL and later) */ #define ETR3_OFFSET 0x1048 #define ETR3_CF9GR BIT(20) @@ -507,6 +511,8 @@ int spt_core_init(struct pmc_dev *pmcdev); int cnp_core_init(struct pmc_dev *pmcdev); int icl_core_init(struct pmc_dev *pmcdev); int tgl_core_init(struct pmc_dev *pmcdev); +int tgl_l_core_init(struct pmc_dev *pmcdev); +int tgl_core_generic_init(struct pmc_dev *pmcdev, int pch_tp); int adl_core_init(struct pmc_dev *pmcdev); int mtl_core_init(struct pmc_dev *pmcdev); diff --git a/drivers/platform/x86/intel/pmc/tgl.c b/drivers/platform/x86/intel/pmc/tgl.c index d5f1d2223c5a..91fd725951e5 100644 --- a/drivers/platform/x86/intel/pmc/tgl.c +++ b/drivers/platform/x86/intel/pmc/tgl.c @@ -13,6 +13,11 @@ #define ACPI_S0IX_DSM_UUID "57a6512e-3979-4e9d-9708-ff13b2508972" #define ACPI_GET_LOW_MODE_REGISTERS 1 +enum pch_type { + PCH_H, + PCH_LP +}; + const struct pmc_bit_map tgl_pfear_map[] = { {"PSF9", BIT(0)}, {"RES_66", BIT(1)}, @@ -205,6 +210,33 @@ const struct pmc_reg_map tgl_reg_map = { .etr3_offset = ETR3_OFFSET, }; +const struct pmc_reg_map tgl_h_reg_map = { + .pfear_sts = ext_tgl_pfear_map, + .slp_s0_offset = CNP_PMC_SLP_S0_RES_COUNTER_OFFSET, + .slp_s0_res_counter_step = TGL_PMC_SLP_S0_RES_COUNTER_STEP, + .ltr_show_sts = cnp_ltr_show_map, + .msr_sts = msr_map, + .ltr_ignore_offset = CNP_PMC_LTR_IGNORE_OFFSET, + .regmap_length = CNP_PMC_MMIO_REG_LEN, + .ppfear0_offset = CNP_PMC_HOST_PPFEAR0A, + .ppfear_buckets = ICL_PPFEAR_NUM_ENTRIES, + .pm_cfg_offset = CNP_PMC_PM_CFG_OFFSET, + .pm_read_disable_bit = CNP_PMC_READ_DISABLE_BIT, + .ltr_ignore_max = TGL_NUM_IP_IGN_ALLOWED, + .lpm_num_maps = TGL_LPM_NUM_MAPS, + .lpm_res_counter_step_x2 = TGL_PMC_LPM_RES_COUNTER_STEP_X2, + .lpm_sts_latch_en_offset = TGL_LPM_STS_LATCH_EN_OFFSET, + .lpm_en_offset = TGL_LPM_EN_OFFSET, + .lpm_priority_offset = TGL_LPM_PRI_OFFSET, + .lpm_residency_offset = TGL_LPM_RESIDENCY_OFFSET, + .lpm_sts = tgl_lpm_maps, + .lpm_status_offset = TGL_LPM_STATUS_OFFSET, + .lpm_live_status_offset = TGL_LPM_LIVE_STATUS_OFFSET, + .etr3_offset = ETR3_OFFSET, + .pson_residency_offset = TGL_PSON_RESIDENCY_OFFSET, + .pson_residency_counter_step = TGL_PSON_RES_COUNTER_STEP, +}; + void pmc_core_get_tgl_lpm_reqs(struct platform_device *pdev) { struct pmc_dev *pmcdev = platform_get_drvdata(pdev); @@ -253,12 +285,26 @@ free_acpi_obj: ACPI_FREE(out_obj); } +int tgl_l_core_init(struct pmc_dev *pmcdev) +{ + return tgl_core_generic_init(pmcdev, PCH_LP); +} + int tgl_core_init(struct pmc_dev *pmcdev) +{ + return tgl_core_generic_init(pmcdev, PCH_H); +} + +int tgl_core_generic_init(struct pmc_dev *pmcdev, int pch_tp) { struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN]; int ret; - pmc->map = &tgl_reg_map; + if (pch_tp == PCH_H) + pmc->map = &tgl_h_reg_map; + else + pmc->map = &tgl_reg_map; + ret = get_primary_reg_base(pmc); if (ret) return ret; From d873f380525c502904737f592008d509cff20c78 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:12 -0800 Subject: [PATCH 0978/1562] platform/x86/intel/pmc: Add PSON residency counter for Alder Lake Add PSON register offsets for Alder Lake PCH that provides an access to PSON residency counter. Signed-off-by: Rajvi Jingar Link: https://lore.kernel.org/r/20231219042216.2592029-4-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/adl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/adl.c b/drivers/platform/x86/intel/pmc/adl.c index 64c492391ede..882f2d5d8937 100644 --- a/drivers/platform/x86/intel/pmc/adl.c +++ b/drivers/platform/x86/intel/pmc/adl.c @@ -307,6 +307,8 @@ const struct pmc_reg_map adl_reg_map = { .lpm_sts = adl_lpm_maps, .lpm_status_offset = ADL_LPM_STATUS_OFFSET, .lpm_live_status_offset = ADL_LPM_LIVE_STATUS_OFFSET, + .pson_residency_offset = TGL_PSON_RESIDENCY_OFFSET, + .pson_residency_counter_step = TGL_PSON_RES_COUNTER_STEP, }; int adl_core_init(struct pmc_dev *pmcdev) From d79c3c82ee82cc99ffde4c4f5fe69db35bcfb733 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:13 -0800 Subject: [PATCH 0979/1562] platform/x86/intel/pmc: Move common code to core.c Functions like mtl_set_device_d3() and mtl_punit_pmt_init() were added for Meteor Lake. To be able to use them in Arrow Lake and future platforms, move them to core.c. Also, to support different guids, add guid argument in pmc_core_punit_pmt_init() and to support different PCI function numbers, add func arg in pmc_core_ssram_init(). Signed-off-by: Rajvi Jingar Link: https://lore.kernel.org/r/20231219042216.2592029-5-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/core.c | 45 ++++++++++++++++ drivers/platform/x86/intel/pmc/core.h | 8 ++- drivers/platform/x86/intel/pmc/core_ssram.c | 4 +- drivers/platform/x86/intel/pmc/mtl.c | 60 +++------------------ 4 files changed, 60 insertions(+), 57 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index e95105ad1243..ac446b0f2192 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -1106,6 +1106,51 @@ int get_primary_reg_base(struct pmc *pmc) return 0; } +void pmc_core_punit_pmt_init(struct pmc_dev *pmcdev, u32 guid) +{ + struct telem_endpoint *ep; + struct pci_dev *pcidev; + + pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(10, 0)); + if (!pcidev) { + dev_err(&pmcdev->pdev->dev, "PUNIT PMT device not found."); + return; + } + + ep = pmt_telem_find_and_register_endpoint(pcidev, guid, 0); + pci_dev_put(pcidev); + if (IS_ERR(ep)) { + dev_err(&pmcdev->pdev->dev, + "pmc_core: couldn't get DMU telem endpoint %ld", + PTR_ERR(ep)); + return; + } + + pmcdev->punit_ep = ep; + + pmcdev->has_die_c6 = true; + pmcdev->die_c6_offset = MTL_PMT_DMU_DIE_C6_OFFSET; +} + +void pmc_core_set_device_d3(unsigned int device) +{ + struct pci_dev *pcidev; + + pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + if (pcidev) { + if (!device_trylock(&pcidev->dev)) { + pci_dev_put(pcidev); + return; + } + if (!pcidev->dev.driver) { + dev_info(&pcidev->dev, "Setting to D3hot\n"); + pci_set_power_state(pcidev, PCI_D3hot); + } + device_unlock(&pcidev->dev); + pci_dev_put(pcidev); + } +} + static bool pmc_core_is_pson_residency_enabled(struct pmc_dev *pmcdev) { struct platform_device *pdev = pmcdev->pdev; diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index d09962940ad6..e678a1205514 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -268,6 +268,10 @@ enum ppfear_regs { #define MTL_SOCM_PPFEAR_NUM_ENTRIES 8 #define MTL_IOE_PPFEAR_NUM_ENTRIES 10 +/* Die C6 from PUNIT telemetry */ +#define MTL_PMT_DMU_DIE_C6_OFFSET 15 +#define MTL_PMT_DMU_GUID 0x1A067102 + extern const char *pmc_lpm_modes[]; struct pmc_bit_map { @@ -504,8 +508,10 @@ extern int pmc_core_send_ltr_ignore(struct pmc_dev *pmcdev, u32 value); int pmc_core_resume_common(struct pmc_dev *pmcdev); int get_primary_reg_base(struct pmc *pmc); extern void pmc_core_get_low_power_modes(struct pmc_dev *pmcdev); +extern void pmc_core_punit_pmt_init(struct pmc_dev *pmcdev, u32 guid); +extern void pmc_core_set_device_d3(unsigned int device); -extern int pmc_core_ssram_init(struct pmc_dev *pmcdev); +extern int pmc_core_ssram_init(struct pmc_dev *pmcdev, int func); int spt_core_init(struct pmc_dev *pmcdev); int cnp_core_init(struct pmc_dev *pmcdev); diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 55e54207987c..1bde86c54eb9 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -290,12 +290,12 @@ pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, int pmc_idx, u32 offset) return pmc_core_pmc_add(pmcdev, pwrm_base, map, pmc_idx); } -int pmc_core_ssram_init(struct pmc_dev *pmcdev) +int pmc_core_ssram_init(struct pmc_dev *pmcdev, int func) { struct pci_dev *pcidev; int ret; - pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(20, 2)); + pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(20, func)); if (!pcidev) return -ENODEV; diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index fb59dffccf28..d2470f800298 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -17,10 +17,6 @@ #define IOEM_LPM_REQ_GUID 0x4357464 #define IOEP_LPM_REQ_GUID 0x5077612 -/* Die C6 from PUNIT telemetry */ -#define MTL_PMT_DMU_DIE_C6_OFFSET 15 -#define MTL_PMT_DMU_GUID 0x1A067102 - static const u8 MTL_LPM_REG_INDEX[] = {0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20}; /* @@ -973,63 +969,18 @@ static struct pmc_info mtl_pmc_info_list[] = { {} }; -static void mtl_punit_pmt_init(struct pmc_dev *pmcdev) -{ - struct telem_endpoint *ep; - struct pci_dev *pcidev; - - pcidev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(10, 0)); - if (!pcidev) { - dev_err(&pmcdev->pdev->dev, "PUNIT PMT device not found.\n"); - return; - } - - ep = pmt_telem_find_and_register_endpoint(pcidev, MTL_PMT_DMU_GUID, 0); - pci_dev_put(pcidev); - if (IS_ERR(ep)) { - dev_err(&pmcdev->pdev->dev, - "pmc_core: couldn't get DMU telem endpoint, %ld\n", - PTR_ERR(ep)); - return; - } - - pmcdev->punit_ep = ep; - - pmcdev->has_die_c6 = true; - pmcdev->die_c6_offset = MTL_PMT_DMU_DIE_C6_OFFSET; -} - #define MTL_GNA_PCI_DEV 0x7e4c #define MTL_IPU_PCI_DEV 0x7d19 #define MTL_VPU_PCI_DEV 0x7d1d -static void mtl_set_device_d3(unsigned int device) -{ - struct pci_dev *pcidev; - - pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); - if (pcidev) { - if (!device_trylock(&pcidev->dev)) { - pci_dev_put(pcidev); - return; - } - if (!pcidev->dev.driver) { - dev_info(&pcidev->dev, "Setting to D3hot\n"); - pci_set_power_state(pcidev, PCI_D3hot); - } - device_unlock(&pcidev->dev); - pci_dev_put(pcidev); - } -} - /* * Set power state of select devices that do not have drivers to D3 * so that they do not block Package C entry. */ static void mtl_d3_fixup(void) { - mtl_set_device_d3(MTL_GNA_PCI_DEV); - mtl_set_device_d3(MTL_IPU_PCI_DEV); - mtl_set_device_d3(MTL_VPU_PCI_DEV); + pmc_core_set_device_d3(MTL_GNA_PCI_DEV); + pmc_core_set_device_d3(MTL_IPU_PCI_DEV); + pmc_core_set_device_d3(MTL_VPU_PCI_DEV); } static int mtl_resume(struct pmc_dev *pmcdev) @@ -1042,6 +993,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) { struct pmc *pmc = pmcdev->pmcs[PMC_IDX_SOC]; int ret; + int func = 2; mtl_d3_fixup(); @@ -1052,7 +1004,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) * If ssram init fails use legacy method to at least get the * primary PMC */ - ret = pmc_core_ssram_init(pmcdev); + ret = pmc_core_ssram_init(pmcdev, func); if (ret) { dev_warn(&pmcdev->pdev->dev, "ssram init failed, %d, using legacy init\n", ret); @@ -1063,7 +1015,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) } pmc_core_get_low_power_modes(pmcdev); - mtl_punit_pmt_init(pmcdev); + pmc_core_punit_pmt_init(pmcdev, MTL_PMT_DMU_GUID); /* Due to a hardware limitation, the GBE LTR blocks PC10 * when a cable is attached. Tell the PMC to ignore it. From 1d62ada48d41d72d72232585eed0f3e1136ae1fb Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 18 Dec 2023 20:22:14 -0800 Subject: [PATCH 0980/1562] platform/x86/intel/pmc: Add ssram_init flag in PMC discovery in Meteor Lake If PMC discovery using pmc_core_ssram_init() was unsuccessful for the Meteor Lake platform, the legacy enumeration method is used. In this case pci device struct for the PMC SSRAM is not available and pmc_core_ssram_get_lpm_reqs() will not work. Add ssram_init flag to indicate if the PMC SSRAM initialization was successful or not. Call pmc_core_ssram_get_lpm_reqs() only if the ssram_init flag is set to true. Signed-off-by: Rajvi Jingar Link: https://lore.kernel.org/r/20231219042216.2592029-6-rajvi.jingar@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmc/mtl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/pmc/mtl.c b/drivers/platform/x86/intel/pmc/mtl.c index d2470f800298..e75431325dda 100644 --- a/drivers/platform/x86/intel/pmc/mtl.c +++ b/drivers/platform/x86/intel/pmc/mtl.c @@ -994,6 +994,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) struct pmc *pmc = pmcdev->pmcs[PMC_IDX_SOC]; int ret; int func = 2; + bool ssram_init = true; mtl_d3_fixup(); @@ -1006,6 +1007,7 @@ int mtl_core_init(struct pmc_dev *pmcdev) */ ret = pmc_core_ssram_init(pmcdev, func); if (ret) { + ssram_init = false; dev_warn(&pmcdev->pdev->dev, "ssram init failed, %d, using legacy init\n", ret); pmc->map = &mtl_socm_reg_map; @@ -1023,5 +1025,8 @@ int mtl_core_init(struct pmc_dev *pmcdev) dev_dbg(&pmcdev->pdev->dev, "ignoring GBE LTR\n"); pmc_core_send_ltr_ignore(pmcdev, 3); - return pmc_core_ssram_get_lpm_reqs(pmcdev); + if (ssram_init) + return pmc_core_ssram_get_lpm_reqs(pmcdev); + + return 0; } From 67ba055dd7758c34f6e64c9d35132362c1e1f0b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Dec 2023 17:40:12 +0200 Subject: [PATCH 0981/1562] regulator: Reuse LINEAR_RANGE() in REGULATOR_LINEAR_RANGE() REGULATOR_LINEAR_RANGE() repeats what LINEAR_RANGE() provides. Deduplicate the former by using the latter. No functional change intended. Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20231219154012.2478688-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4b7eceb3828b..22a07c0900a4 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -51,12 +51,7 @@ enum regulator_detection_severity { /* Initialize struct linear_range for regulators */ #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) \ -{ \ - .min = _min_uV, \ - .min_sel = _min_sel, \ - .max_sel = _max_sel, \ - .step = _step_uV, \ -} + LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) /** * struct regulator_ops - regulator operations. From d70d141bb15f328528f94557ddf754abeb027365 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Dec 2023 12:07:55 +0100 Subject: [PATCH 0982/1562] ACPI: utils: Introduce helper for _DEP list lookup The ACPI LPSS driver and the Surface platform driver code use almost the same code pattern for checking if one ACPI device is present in the list returned by _DEP for another ACPI device. To reduce the resulting code duplication, introduce a helper for that called acpi_device_dep() and invoke it from both places. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg --- drivers/acpi/acpi_lpss.c | 29 ++-------------- drivers/acpi/utils.c | 34 +++++++++++++++++++ .../platform/surface/surface_acpi_notify.c | 28 +-------------- include/acpi/acpi_bus.h | 1 + 4 files changed, 38 insertions(+), 54 deletions(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 1623af8d62bc..920402dfe1ad 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -562,31 +562,6 @@ static struct device *acpi_lpss_find_device(const char *hid, const char *uid) return bus_find_device(&pci_bus_type, NULL, &data, match_hid_uid); } -static bool acpi_lpss_dep(struct acpi_device *adev, acpi_handle handle) -{ - struct acpi_handle_list dep_devices; - bool ret = false; - int i; - - if (!acpi_has_method(adev->handle, "_DEP")) - return false; - - if (!acpi_evaluate_reference(adev->handle, "_DEP", NULL, &dep_devices)) { - dev_dbg(&adev->dev, "Failed to evaluate _DEP.\n"); - return false; - } - - for (i = 0; i < dep_devices.count; i++) { - if (dep_devices.handles[i] == handle) { - ret = true; - break; - } - } - - acpi_handle_list_free(&dep_devices); - return ret; -} - static void acpi_lpss_link_consumer(struct device *dev1, const struct lpss_device_links *link) { @@ -597,7 +572,7 @@ static void acpi_lpss_link_consumer(struct device *dev1, return; if ((link->dep_missing_ids && dmi_check_system(link->dep_missing_ids)) - || acpi_lpss_dep(ACPI_COMPANION(dev2), ACPI_HANDLE(dev1))) + || acpi_device_dep(ACPI_HANDLE(dev2), ACPI_HANDLE(dev1))) device_link_add(dev2, dev1, link->flags); put_device(dev2); @@ -613,7 +588,7 @@ static void acpi_lpss_link_supplier(struct device *dev1, return; if ((link->dep_missing_ids && dmi_check_system(link->dep_missing_ids)) - || acpi_lpss_dep(ACPI_COMPANION(dev1), ACPI_HANDLE(dev2))) + || acpi_device_dep(ACPI_HANDLE(dev1), ACPI_HANDLE(dev2))) device_link_add(dev1, dev2, link->flags); put_device(dev2); diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 57663065dbf6..abac5cc25477 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -450,6 +450,40 @@ void acpi_handle_list_free(struct acpi_handle_list *list) } EXPORT_SYMBOL_GPL(acpi_handle_list_free); +/** + * acpi_device_dep - Check ACPI device dependency + * @target: ACPI handle of the target ACPI device. + * @match: ACPI handle to look up in the target's _DEP list. + * + * Return true if @match is present in the list returned by _DEP for + * @target or false otherwise. + */ +bool acpi_device_dep(acpi_handle target, acpi_handle match) +{ + struct acpi_handle_list dep_devices; + bool ret = false; + int i; + + if (!acpi_has_method(target, "_DEP")) + return false; + + if (!acpi_evaluate_reference(target, "_DEP", NULL, &dep_devices)) { + acpi_handle_debug(target, "Failed to evaluate _DEP.\n"); + return false; + } + + for (i = 0; i < dep_devices.count; i++) { + if (dep_devices.handles[i] == match) { + ret = true; + break; + } + } + + acpi_handle_list_free(&dep_devices); + return ret; +} +EXPORT_SYMBOL_GPL(acpi_device_dep); + acpi_status acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { diff --git a/drivers/platform/surface/surface_acpi_notify.c b/drivers/platform/surface/surface_acpi_notify.c index 96ec052d0940..20f3870915d2 100644 --- a/drivers/platform/surface/surface_acpi_notify.c +++ b/drivers/platform/surface/surface_acpi_notify.c @@ -736,32 +736,6 @@ do { \ #define san_consumer_warn(dev, handle, fmt, ...) \ san_consumer_printk(warn, dev, handle, fmt, ##__VA_ARGS__) -static bool is_san_consumer(struct platform_device *pdev, acpi_handle handle) -{ - struct acpi_handle_list dep_devices; - acpi_handle supplier = ACPI_HANDLE(&pdev->dev); - bool ret = false; - int i; - - if (!acpi_has_method(handle, "_DEP")) - return false; - - if (!acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices)) { - san_consumer_dbg(&pdev->dev, handle, "failed to evaluate _DEP\n"); - return false; - } - - for (i = 0; i < dep_devices.count; i++) { - if (dep_devices.handles[i] == supplier) { - ret = true; - break; - } - } - - acpi_handle_list_free(&dep_devices); - return ret; -} - static acpi_status san_consumer_setup(acpi_handle handle, u32 lvl, void *context, void **rv) { @@ -770,7 +744,7 @@ static acpi_status san_consumer_setup(acpi_handle handle, u32 lvl, struct acpi_device *adev; struct device_link *link; - if (!is_san_consumer(pdev, handle)) + if (!acpi_device_dep(handle, ACPI_HANDLE(&pdev->dev))) return AE_OK; /* Ignore ACPI devices that are not present. */ diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 3dcf07b41428..b5c082e34539 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -33,6 +33,7 @@ bool acpi_handle_list_equal(struct acpi_handle_list *list1, void acpi_handle_list_replace(struct acpi_handle_list *dst, struct acpi_handle_list *src); void acpi_handle_list_free(struct acpi_handle_list *list); +bool acpi_device_dep(acpi_handle target, acpi_handle match); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); From ac89d11b93cc37c52dc38206c3eaffd4fa603f91 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 14 Dec 2023 18:56:21 +0200 Subject: [PATCH 0983/1562] intel_idle: add Grand Ridge SoC support Add Intel Grand Ridge SoC C-states, which are C1, C1E, and C6S. The Grand Ridge SoC is built with modules, each module includes 4 cores (Crestmont microarchitecture). There is one L2 cache per module, shared between the 4 cores. There is no core C6 state, but there is C6S state, which has module scope: when all 4 cores request C6S, the entire module (4 cores + L2 cache) enters the low power state. Package C6 is not supported by Grand Ridge SoC. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cfd0b24fd7f1..3b846d4f8707 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1271,6 +1271,35 @@ static struct cpuidle_state snr_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state grr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 10, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6S", + .desc = "MWAIT 0x22", + .flags = MWAIT2flg(0x22) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1420,6 +1449,12 @@ static const struct idle_cpu idle_cpu_snr __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_grr __initconst = { + .state_table = grr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &idle_cpu_nhx), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &idle_cpu_nehalem), @@ -1466,6 +1501,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &idle_cpu_dnv), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_snr), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &idle_cpu_grr), {} }; From 92813fd5b1562e547120c8489137b040892fe1bc Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 14 Dec 2023 18:56:22 +0200 Subject: [PATCH 0984/1562] intel_idle: add Sierra Forest SoC support Add Sierra Forest SoC C-states, which are C1, C1E, C6S, and C6SP. Sierra Forest SoC is built with modules, each module includes 4 cores (Crestmont microarchitecture). There is one L2 cache per module, shared between the 4 cores. There is no core C6 state, but there is C6S state, which has module scope: when all 4 cores request C6S, the entire module (4 cores + L2 cache) enters the low power state. C6SP state has package scope - when all modules in the package enter C6S, the package enters the power state mode. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3b846d4f8707..b4390822edad 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1300,6 +1300,43 @@ static struct cpuidle_state grr_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state srf_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 10, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6S", + .desc = "MWAIT 0x22", + .flags = MWAIT2flg(0x22) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 270, + .target_residency = 700, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6SP", + .desc = "MWAIT 0x23", + .flags = MWAIT2flg(0x23) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 310, + .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1455,6 +1492,12 @@ static const struct idle_cpu idle_cpu_grr __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_srf __initconst = { + .state_table = srf_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &idle_cpu_nhx), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &idle_cpu_nehalem), @@ -1502,6 +1545,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &idle_cpu_dnv), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_snr), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &idle_cpu_grr), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &idle_cpu_srf), {} }; From 489c693bd04a2308865dc50f37bd0b5f6ad52deb Mon Sep 17 00:00:00 2001 From: Chen Haonan Date: Tue, 19 Dec 2023 21:06:25 +0800 Subject: [PATCH 0985/1562] PM: hibernate: Use kmap_local_page() in copy_data_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). kmap_atomic() disables page-faults and preemption (the latter only for !PREEMPT_RT kernels).The code between the mapping and un-mapping in this patch does not depend on the above-mentioned side effects.So simply replaced kmap_atomic() with kmap_local_page(). Signed-off-by: Chen Haonan [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index e3e8f1c6e75f..5c96ff067c64 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page); - dst = kmap_atomic(d_page); + src = kmap_local_page(s_page); + dst = kmap_local_page(d_page); zeros_only = do_copy_page(dst, src); - kunmap_atomic(dst); - kunmap_atomic(src); + kunmap_local(dst); + kunmap_local(src); } else { if (PageHighMem(d_page)) { /* @@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ zeros_only = safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page); + dst = kmap_local_page(d_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); } else { zeros_only = safe_copy_page(page_address(d_page), s_page); } From 4bbf0b6a64455c95586caf130e374586caef9986 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 12 Dec 2023 22:00:43 +0800 Subject: [PATCH 0986/1562] Documentation: PM: Adjust freezing-of-tasks.rst to the freezer changes The core freezer logic has been modified by commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), so adjust the documentation to reflect the new code. The main changes include: - Drop references to PF_FROZEN and PF_FREEZER_SKIP - Describe TASK_FROZEN, TASK_FREEZABLE and __TASK_FREEZABLE_UNSAFE - Replace system_freezing_cnt with freezer_active - Use a different example for the loop of a freezable kernel thread, since the old code is gone gone Signed-off-by: Kevin Hao [ rjw: Subject and changelog edits, doc text adjustments ] Signed-off-by: Rafael J. Wysocki --- Documentation/power/freezing-of-tasks.rst | 81 +++++++++++++---------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst index 53b6a56c4635..df9755bfbd94 100644 --- a/Documentation/power/freezing-of-tasks.rst +++ b/Documentation/power/freezing-of-tasks.rst @@ -14,27 +14,28 @@ architectures). II. How does it work? ===================== -There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN -and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have -PF_NOFREEZE unset (all user space processes and some kernel threads) are -regarded as 'freezable' and treated in a special way before the system enters a -suspend state as well as before a hibernation image is created (in what follows -we only consider hibernation, but the description also applies to suspend). +There is one per-task flag (PF_NOFREEZE) and three per-task states +(TASK_FROZEN, TASK_FREEZABLE and __TASK_FREEZABLE_UNSAFE) used for that. +The tasks that have PF_NOFREEZE unset (all user space tasks and some kernel +threads) are regarded as 'freezable' and treated in a special way before the +system enters a sleep state as well as before a hibernation image is created +(hibernation is directly covered by what follows, but the description applies +to system-wide suspend too). Namely, as the first step of the hibernation procedure the function freeze_processes() (defined in kernel/power/process.c) is called. A system-wide -variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate -whether the system is to undergo a freezing operation. And freeze_processes() -sets this variable. After this, it executes try_to_freeze_tasks() that sends a -fake signal to all user space processes, and wakes up all the kernel threads. -All freezable tasks must react to that by calling try_to_freeze(), which -results in a call to __refrigerator() (defined in kernel/freezer.c), which sets -the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes -it loop until PF_FROZEN is cleared for it. Then, we say that the task is -'frozen' and therefore the set of functions handling this mechanism is referred -to as 'the freezer' (these functions are defined in kernel/power/process.c, -kernel/freezer.c & include/linux/freezer.h). User space processes are generally -frozen before kernel threads. +static key freezer_active (as opposed to a per-task flag or state) is used to +indicate whether the system is to undergo a freezing operation. And +freeze_processes() sets this static key. After this, it executes +try_to_freeze_tasks() that sends a fake signal to all user space processes, and +wakes up all the kernel threads. All freezable tasks must react to that by +calling try_to_freeze(), which results in a call to __refrigerator() (defined +in kernel/freezer.c), which changes the task's state to TASK_FROZEN, and makes +it loop until it is woken by an explicit TASK_FROZEN wakeup. Then, that task +is regarded as 'frozen' and so the set of functions handling this mechanism is +referred to as 'the freezer' (these functions are defined in +kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h). User space +tasks are generally frozen before kernel threads. __refrigerator() must not be called directly. Instead, use the try_to_freeze() function (defined in include/linux/freezer.h), that checks @@ -43,31 +44,40 @@ if the task is to be frozen and makes the task enter __refrigerator(). For user space processes try_to_freeze() is called automatically from the signal-handling code, but the freezable kernel threads need to call it explicitly in suitable places or use the wait_event_freezable() or -wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) -that combine interruptible sleep with checking if the task is to be frozen and -calling try_to_freeze(). The main loop of a freezable kernel thread may look +wait_event_freezable_timeout() macros (defined in include/linux/wait.h) +that put the task to sleep (TASK_INTERRUPTIBLE) or freeze it (TASK_FROZEN) if +freezer_active is set. The main loop of a freezable kernel thread may look like the following one:: set_freezable(); - do { - hub_events(); - wait_event_freezable(khubd_wait, - !list_empty(&hub_event_list) || - kthread_should_stop()); - } while (!kthread_should_stop() || !list_empty(&hub_event_list)); -(from drivers/usb/core/hub.c::hub_thread()). + while (true) { + struct task_struct *tsk = NULL; -If a freezable kernel thread fails to call try_to_freeze() after the freezer has -initiated a freezing operation, the freezing of tasks will fail and the entire -hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere or use one of the + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock_irq(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock_irq(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + +(from mm/oom_kill.c::oom_reaper()). + +If a freezable kernel thread is not put to the frozen state after the freezer +has initiated a freezing operation, the freezing of tasks will fail and the +entire system-wide transition will be cancelled. For this reason, freezable +kernel threads must call try_to_freeze() somewhere or use one of the wait_event_freezable() and wait_event_freezable_timeout() macros. After the system memory state has been restored from a hibernation image and devices have been reinitialized, the function thaw_processes() is called in -order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave __refrigerator() and continue running. +order to wake up each frozen task. Then, the tasks that have been frozen leave +__refrigerator() and continue running. Rationale behind the functions dealing with freezing and thawing of tasks @@ -96,7 +106,8 @@ III. Which kernel threads are freezable? Kernel threads are not freezable by default. However, a kernel thread may clear PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE directly is not allowed). From this point it is regarded as freezable -and must call try_to_freeze() in a suitable place. +and must call try_to_freeze() or variants of wait_event_freezable() in a +suitable place. IV. Why do we do that? ====================== From e95013156ad88e6a1e1db6545881f49183e2ee0a Mon Sep 17 00:00:00 2001 From: Zhenguo Yao Date: Wed, 13 Dec 2023 18:28:08 +0800 Subject: [PATCH 0987/1562] cpufreq: intel_pstate: Add Emerald Rapids support in no-HWP mode Users may disable HWP in firmware, in which case intel_pstate will give up unless the CPU model is explicitly supported. See also the following past commits: - commit df51f287b5de ("cpufreq: intel_pstate: Add Sapphire Rapids support in no-HWP mode") - commit d8de7a44e11f ("cpufreq: intel_pstate: Add Skylake servers support") - commit 706c5328851d ("cpufreq: intel_pstate: Add Cometlake support in no-HWP mode") - commit fbdc21e9b038 ("cpufreq: intel_pstate: Add Icelake servers support in no-HWP mode") - commit 71bb5c82aaae ("cpufreq: intel_pstate: Add Tigerlake support in no-HWP mode") Signed-off-by: Zhenguo Yao Acked-by: Srinivas Pandruvada [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index dd6d23e389f1..3c69040920b8 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2406,6 +2406,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { X86_MATCH(ICELAKE_X, core_funcs), X86_MATCH(TIGERLAKE, core_funcs), X86_MATCH(SAPPHIRERAPIDS_X, core_funcs), + X86_MATCH(EMERALDRAPIDS_X, core_funcs), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); From af2792abd4555b676105fe3073a39cb0ed3e8bfa Mon Sep 17 00:00:00 2001 From: JaimeLiao Date: Tue, 19 Dec 2023 18:21:03 +0800 Subject: [PATCH 0988/1562] mtd: spi-nor: sfdp: get the 1-1-8 and 1-8-8 protocol from SFDP BFPT 17th DWORD contains the information about 1-1-8 and 1-8-8. Parse BFPT DWORD[17] instruction to determine whether flash supports 1-1-8 and 1-8-8, and set its dummy cycles accordingly. Validated only the 1-1-8 read using a macronix flash with Xilinx board zynq-picozed. Signed-off-by: JaimeLiao Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231219102103.92738-2-jaimeliao.tw@gmail.com [ta: update commit message, get rid of extra dereference] Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/sfdp.c | 29 +++++++++++++++++++++++++++++ drivers/mtd/spi-nor/sfdp.h | 7 +++++++ 2 files changed, 36 insertions(+) diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c index b3b11dfed789..57713de32832 100644 --- a/drivers/mtd/spi-nor/sfdp.c +++ b/drivers/mtd/spi-nor/sfdp.c @@ -446,6 +446,7 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, u32 dword; u16 half; u8 erase_mask; + u8 wait_states, mode_clocks, opcode; /* JESD216 Basic Flash Parameter Table length is at least 9 DWORDs. */ if (bfpt_header->length < BFPT_DWORD_MAX_JESD216) @@ -631,6 +632,32 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, if (bfpt_header->length == BFPT_DWORD_MAX_JESD216B) return spi_nor_post_bfpt_fixups(nor, bfpt_header, &bfpt); + /* Parse 1-1-8 read instruction */ + opcode = FIELD_GET(BFPT_DWORD17_RD_1_1_8_CMD, bfpt.dwords[SFDP_DWORD(17)]); + if (opcode) { + mode_clocks = FIELD_GET(BFPT_DWORD17_RD_1_1_8_MODE_CLOCKS, + bfpt.dwords[SFDP_DWORD(17)]); + wait_states = FIELD_GET(BFPT_DWORD17_RD_1_1_8_WAIT_STATES, + bfpt.dwords[SFDP_DWORD(17)]); + params->hwcaps.mask |= SNOR_HWCAPS_READ_1_1_8; + spi_nor_set_read_settings(¶ms->reads[SNOR_CMD_READ_1_1_8], + mode_clocks, wait_states, opcode, + SNOR_PROTO_1_1_8); + } + + /* Parse 1-8-8 read instruction */ + opcode = FIELD_GET(BFPT_DWORD17_RD_1_8_8_CMD, bfpt.dwords[SFDP_DWORD(17)]); + if (opcode) { + mode_clocks = FIELD_GET(BFPT_DWORD17_RD_1_8_8_MODE_CLOCKS, + bfpt.dwords[SFDP_DWORD(17)]); + wait_states = FIELD_GET(BFPT_DWORD17_RD_1_8_8_WAIT_STATES, + bfpt.dwords[SFDP_DWORD(17)]); + params->hwcaps.mask |= SNOR_HWCAPS_READ_1_8_8; + spi_nor_set_read_settings(¶ms->reads[SNOR_CMD_READ_1_8_8], + mode_clocks, wait_states, opcode, + SNOR_PROTO_1_8_8); + } + /* 8D-8D-8D command extension. */ switch (bfpt.dwords[SFDP_DWORD(18)] & BFPT_DWORD18_CMD_EXT_MASK) { case BFPT_DWORD18_CMD_EXT_REP: @@ -968,6 +995,8 @@ static int spi_nor_parse_4bait(struct spi_nor *nor, { SNOR_HWCAPS_READ_1_1_1_DTR, BIT(13) }, { SNOR_HWCAPS_READ_1_2_2_DTR, BIT(14) }, { SNOR_HWCAPS_READ_1_4_4_DTR, BIT(15) }, + { SNOR_HWCAPS_READ_1_1_8, BIT(20) }, + { SNOR_HWCAPS_READ_1_8_8, BIT(21) }, }; static const struct sfdp_4bait programs[] = { { SNOR_HWCAPS_PP, BIT(6) }, diff --git a/drivers/mtd/spi-nor/sfdp.h b/drivers/mtd/spi-nor/sfdp.h index 6eb99e1cdd61..da0fe5aa9bb0 100644 --- a/drivers/mtd/spi-nor/sfdp.h +++ b/drivers/mtd/spi-nor/sfdp.h @@ -118,6 +118,13 @@ struct sfdp_bfpt { (BFPT_DWORD16_EN4B_EN4B | BFPT_DWORD16_EX4B_EX4B) #define BFPT_DWORD16_SWRST_EN_RST BIT(12) +#define BFPT_DWORD17_RD_1_1_8_CMD GENMASK(31, 24) +#define BFPT_DWORD17_RD_1_1_8_MODE_CLOCKS GENMASK(23, 21) +#define BFPT_DWORD17_RD_1_1_8_WAIT_STATES GENMASK(20, 16) +#define BFPT_DWORD17_RD_1_8_8_CMD GENMASK(15, 8) +#define BFPT_DWORD17_RD_1_8_8_MODE_CLOCKS GENMASK(7, 5) +#define BFPT_DWORD17_RD_1_8_8_WAIT_STATES GENMASK(4, 0) + #define BFPT_DWORD18_CMD_EXT_MASK GENMASK(30, 29) #define BFPT_DWORD18_CMD_EXT_REP (0x0UL << 29) /* Repeat */ #define BFPT_DWORD18_CMD_EXT_INV (0x1UL << 29) /* Invert */ From 3c0e1dfa703cd2a16fbfb1290b0970b61add3cde Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 19 Dec 2023 10:12:18 +0100 Subject: [PATCH 0989/1562] MAINTAINERS: change my mail to the kernel.org one As I'm doing more and more work professionally, move away from my private mail address. Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20231219091218.2846297-1-michael@walle.cc Signed-off-by: Tudor Ambarus --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 79e7d727022a..c4c493293e9e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9045,7 +9045,7 @@ F: drivers/gpio/gpio-mockup.c F: tools/testing/selftests/gpio/ GPIO REGMAP -M: Michael Walle +M: Michael Walle S: Maintained F: drivers/gpio/gpio-regmap.c F: include/linux/gpio/regmap.h @@ -19901,7 +19901,7 @@ W: http://www.winischhofer.at/linuxsisusbvga.shtml F: drivers/usb/misc/sisusbvga/ SL28 CPLD MFD DRIVER -M: Michael Walle +M: Michael Walle S: Maintained F: Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml F: Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml @@ -19916,7 +19916,7 @@ F: drivers/pwm/pwm-sl28cpld.c F: drivers/watchdog/sl28cpld_wdt.c SL28 VPD NVMEM LAYOUT DRIVER -M: Michael Walle +M: Michael Walle S: Maintained F: Documentation/devicetree/bindings/nvmem/layouts/kontron,sl28-vpd.yaml F: drivers/nvmem/layouts/sl28vpd.c @@ -20426,7 +20426,7 @@ F: drivers/pinctrl/spear/ SPI NOR SUBSYSTEM M: Tudor Ambarus M: Pratyush Yadav -M: Michael Walle +M: Michael Walle L: linux-mtd@lists.infradead.org S: Maintained W: http://www.linux-mtd.infradead.org/ From 023e6aad7e5e7f2e086c399abd0675589c123728 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Dec 2023 20:41:46 -0800 Subject: [PATCH 0990/1562] mtd: rawnand: s3c2410: fix Excess struct member description kernel-doc warnings Delete 2 lines to prevent warnings from scripts/kernel-doc: s3c2410.c:117: warning: Excess struct member 'mtd' description in 's3c2410_nand_mtd' s3c2410.c:168: warning: Excess struct member 'freq_transition' description in 's3c2410_nand_info' Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312150611.EZBAQYqf-lkp@intel.com/ Cc: Arnd Bergmann Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: linux-mtd@lists.infradead.org Cc: Krzysztof Kozlowski Cc: Alim Akhtar Cc: linux-arm-kernel@lists.infradead.org Cc: linux-samsung-soc@vger.kernel.org Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231216044146.18645-1-rdunlap@infradead.org --- drivers/mtd/nand/raw/s3c2410.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c index 3d3d5c9814ff..48c1d0eb66ca 100644 --- a/drivers/mtd/nand/raw/s3c2410.c +++ b/drivers/mtd/nand/raw/s3c2410.c @@ -105,7 +105,6 @@ struct s3c2410_nand_info; /** * struct s3c2410_nand_mtd - driver MTD structure - * @mtd: The MTD instance to pass to the MTD layer. * @chip: The NAND chip information. * @set: The platform information supplied for this set of NAND chips. * @info: Link back to the hardware information. @@ -145,7 +144,6 @@ enum s3c_nand_clk_state { * @clk_rate: The clock rate from @clk. * @clk_state: The current clock state. * @cpu_type: The exact type of this controller. - * @freq_transition: CPUFreq notifier block */ struct s3c2410_nand_info { /* mtd info */ From 34d72246437155299dd08fd29277e6fa31081ea0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Dec 2023 05:21:44 +0000 Subject: [PATCH 0991/1562] gfs2: d_obtain_alias(ERR_PTR(...)) will do the right thing Signed-off-by: Al Viro Signed-off-by: Andreas Gruenbacher --- fs/gfs2/export.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index cf40895233f5..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,8 +138,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino, GFS2_BLKST_DINODE); - if (IS_ERR(inode)) - return ERR_CAST(inode); return d_obtain_alias(inode); } From 34d63b8162b7b93e616212bb1026fdc51a35ee21 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Dec 2023 05:31:57 +0000 Subject: [PATCH 0992/1562] gfs2: use is_subdir() ... instead of reimplementing it with misguiding name (is_ancestor(x, y) would normally imply "x is an ancestor of y", not the other way round). With races, while we are at it... Signed-off-by: Al Viro Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d21c04a22d73..b5c75c8a8d62 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1065,16 +1065,6 @@ static int gfs2_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -static int is_ancestor(const struct dentry *d1, const struct dentry *d2) -{ - do { - if (d1 == d2) - return 1; - d1 = d1->d_parent; - } while (!IS_ROOT(d1)); - return 0; -} - /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure @@ -1096,7 +1086,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) statfs_slow = sdp->sd_tune.gt_statfs_slow; spin_unlock(&sdp->sd_tune.gt_spin); - if (is_ancestor(root, sdp->sd_master_dir)) + if (is_subdir(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_show_option(s, "lockproto", args->ar_lockproto); From 40ca4ee3136d2d09977d1cab8c0c0e1582c3359d Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 12 Dec 2023 06:12:43 -0500 Subject: [PATCH 0993/1562] evm: don't copy up 'security.evm' xattr The security.evm HMAC and the original file signatures contain filesystem specific data. As a result, the HMAC and signature are not the same on the stacked and backing filesystems. Don't copy up 'security.evm'. Reviewed-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Mimi Zohar --- include/linux/evm.h | 6 ++++++ security/integrity/evm/evm_main.c | 7 +++++++ security/security.c | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/evm.h b/include/linux/evm.h index 01fc495a83e2..36ec884320d9 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -31,6 +31,7 @@ extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); +extern int evm_inode_copy_up_xattr(const char *name); extern int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, @@ -117,6 +118,11 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry, return; } +static inline int evm_inode_copy_up_xattr(const char *name) +{ + return 0; +} + static inline int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 894570fe39bc..02adba635b02 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -863,6 +863,13 @@ void evm_inode_post_setattr(struct dentry *dentry, int ia_valid) evm_update_evmxattr(dentry, NULL, NULL, 0); } +int evm_inode_copy_up_xattr(const char *name) +{ + if (strcmp(name, XATTR_NAME_EVM) == 0) + return 1; /* Discard */ + return -EOPNOTSUPP; +} + /* * evm_inode_init_security - initializes security.evm HMAC value */ diff --git a/security/security.c b/security/security.c index dcb3e7014f9b..f00ec4d988b8 100644 --- a/security/security.c +++ b/security/security.c @@ -2539,7 +2539,7 @@ int security_inode_copy_up_xattr(const char *name) return rc; } - return LSM_RET_DEFAULT(inode_copy_up_xattr); + return evm_inode_copy_up_xattr(name); } EXPORT_SYMBOL(security_inode_copy_up_xattr); From cd708c938f055c9eb5a366ec1c8edcefa28afc28 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 18 Dec 2023 08:06:40 -0500 Subject: [PATCH 0994/1562] evm: add support to disable EVM on unsupported filesystems Identify EVM unsupported filesystems by defining a new flag SB_I_EVM_UNSUPPORTED. Don't verify, write, remove or update 'security.evm' on unsupported filesystems. Acked-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Mimi Zohar --- include/linux/fs.h | 1 + security/integrity/evm/evm_main.c | 35 ++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..1474f36e9b38 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 02adba635b02..cc7956d7878b 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -151,6 +151,17 @@ static int evm_find_protected_xattrs(struct dentry *dentry) return count; } +static int is_unsupported_fs(struct dentry *dentry) +{ + struct inode *inode = d_backing_inode(dentry); + + if (inode->i_sb->s_iflags & SB_I_EVM_UNSUPPORTED) { + pr_info_once("%s not supported\n", inode->i_sb->s_type->name); + return 1; + } + return 0; +} + /* * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr * @@ -181,6 +192,9 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, iint->evm_status == INTEGRITY_PASS_IMMUTABLE)) return iint->evm_status; + if (is_unsupported_fs(dentry)) + return INTEGRITY_UNKNOWN; + /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ @@ -408,6 +422,9 @@ enum integrity_status evm_verifyxattr(struct dentry *dentry, if (!evm_key_loaded() || !evm_protected_xattr(xattr_name)) return INTEGRITY_UNKNOWN; + if (is_unsupported_fs(dentry)) + return INTEGRITY_UNKNOWN; + if (!iint) { iint = integrity_iint_find(d_backing_inode(dentry)); if (!iint) @@ -491,15 +508,21 @@ static int evm_protect_xattr(struct mnt_idmap *idmap, if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (is_unsupported_fs(dentry)) + return -EPERM; } else if (!evm_protected_xattr(xattr_name)) { if (!posix_xattr_acl(xattr_name)) return 0; + if (is_unsupported_fs(dentry)) + return 0; + evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; goto out; - } + } else if (is_unsupported_fs(dentry)) + return 0; evm_status = evm_verify_current_integrity(dentry); if (evm_status == INTEGRITY_NOXATTRS) { @@ -750,6 +773,9 @@ void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, if (!(evm_initialized & EVM_INIT_HMAC)) return; + if (is_unsupported_fs(dentry)) + return; + evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } @@ -814,8 +840,12 @@ int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; + if (is_unsupported_fs(dentry)) + return 0; + if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return 0; + evm_status = evm_verify_current_integrity(dentry); /* * Writing attrs is safe for portable signatures, as portable signatures @@ -859,6 +889,9 @@ void evm_inode_post_setattr(struct dentry *dentry, int ia_valid) if (!(evm_initialized & EVM_INIT_HMAC)) return; + if (is_unsupported_fs(dentry)) + return; + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) evm_update_evmxattr(dentry, NULL, NULL, 0); } From c00f94b3a5be428837868c0f2cdaa3fa5b4b1995 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 Dec 2023 10:11:25 -0500 Subject: [PATCH 0995/1562] overlay: disable EVM Until a complete solution is developed, update 'sb->s_iflags' to disable EVM. Acked-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Mimi Zohar --- fs/overlayfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a0967bb25003..e3d9c6c80a47 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1454,6 +1454,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) * lead to unexpected results. */ sb->s_iflags |= SB_I_NOUMASK; + sb->s_iflags |= SB_I_EVM_UNSUPPORTED; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); From fe22944cf05ede8e6f841cfecdb7093a53a3d9b3 Mon Sep 17 00:00:00 2001 From: xiaoming Wang Date: Tue, 19 Dec 2023 11:34:11 +0800 Subject: [PATCH 0996/1562] cpu/hotplug: Increase the number of dynamic states The dynamically allocatable hotplug state space can be exhausted by the existing drivers and infrastructure which install CPU hotplug states dynamically. That prevents new drivers and infrastructure from installing dynamically allocated states. Increase the size of the CPUHP_AP_ONLINE_DYN state by 10 to make room. Signed-off-by: Xiaoming Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231219033411.816100-1-xiaoming.wang@intel.com --- include/linux/cpuhotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index af6c21aab985..8bd454dfe453 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -239,7 +239,7 @@ enum cpuhp_state { CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, - CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, From cb665db94fc61512c9c94ed1d42af67e7bf6ce01 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:23 +0100 Subject: [PATCH 0997/1562] tick-sched: Fix function names in comments When referencing functions in comments, it might be helpful to use full function names (including the prefix) to be able to find it when grepping. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-2-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be77b021e5d6..ff25fdff6b7c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -920,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } /* - * nohz_stop_sched_tick() can be called several times before - * nohz_restart_sched_tick() is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick(). + * tick_nohz_stop_tick() can be called several times before + * tick_nohz_restart_sched_tick() is called. This happens when + * interrupts arrive which do not cause a reschedule. In the first + * call we save the current tick time, so we can restart the + * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!ts->tick_stopped) { calc_load_nohz_start(); From 318050671affa92fd166d988d08d4041c7b113c4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:24 +0100 Subject: [PATCH 0998/1562] tick/sched: Cleanup confusing variables tick_nohz_stop_tick() contains the expires (u64 variable) and tick (ktime_t) variable. In the beginning the value of expires is written to tick. Afterwards none of the variables is changed. They are only used for checks. Drop the not required variable tick and use always expires instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-3-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ff25fdff6b7c..fce3c6f0e4a6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -887,7 +887,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; - ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; @@ -910,7 +909,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) /* Skip reprogram of event if it's not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); @@ -935,7 +934,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) trace_tick_stop(1, TICK_DEP_MASK_NONE); } - ts->next_tick = tick; + ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop @@ -950,11 +949,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - hrtimer_set_expires(&ts->sched_timer, tick); - tick_program_event(tick, 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } } From cbf04a22026100dceeceec67fcbf1973383eb32f Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:25 +0100 Subject: [PATCH 0999/1562] tick-sched: Warn when next tick seems to be in the past When the next tick is in the past, the delta between basemono and the next tick gets negativ. But the next tick should never be in the past. The negative effect of a wrong next tick might be a stop of the tick and timers might expire late. To prevent expensive debugging when changing underlying code, add a WARN_ON_ONCE into this code path. To prevent complete misbehaviour, also reset next_tick to basemono in this case. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-4-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fce3c6f0e4a6..a17d26002831 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) ts->next_timer = next_tick; } + /* Make sure next_tick is never before basemono! */ + if (WARN_ON_ONCE(basemono > next_tick)) + next_tick = basemono; + /* * If the tick is due in the next period, keep it ticking or * force prod the timer. From dbcdcb62b59db2cf6a24113873b90da15c6f0b19 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:26 +0100 Subject: [PATCH 1000/1562] tracing/timers: Enhance timer_start tracepoint For starting a timer, the timer is enqueued into a bucket of the timer wheel. The bucket expiry is the defacto expiry of the timer but it is not equal the timer expiry because of increasing granularity when bucket is in a higher level of the wheel. To be able to figure out in a trace whether a timer expired in time or not, the bucket expiry time is required as well. Add bucket expiry time to the timer_start tracepoint and thereby simplify the arguments. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-5-anna-maria@linutronix.de --- include/trace/events/timer.h | 20 ++++++++++---------- kernel/time/timer.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index b4bc2828fa09..99ada928d445 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -46,22 +46,21 @@ DEFINE_EVENT(timer_class, timer_init, /** * timer_start - called when the timer is started - * @timer: pointer to struct timer_list - * @expires: the timers expiry time - * @flags: the timers flags + * @timer: pointer to struct timer_list + * @bucket_expiry: the bucket expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, - unsigned long expires, - unsigned int flags), + unsigned long bucket_expiry), - TP_ARGS(timer, expires, flags), + TP_ARGS(timer, bucket_expiry), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) + __field( unsigned long, bucket_expiry ) __field( unsigned long, now ) __field( unsigned int, flags ) ), @@ -69,15 +68,16 @@ TRACE_EVENT(timer_start, TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; - __entry->expires = expires; + __entry->expires = timer->expires; + __entry->bucket_expiry = bucket_expiry; __entry->now = jiffies; - __entry->flags = flags; + __entry->flags = timer->flags; ), - TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", + TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, - __entry->flags & TIMER_CPUMASK, + __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 63a8ce7177dd..a81d793a43d0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -606,7 +606,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); - trace_timer_start(timer, timer->expires, timer->flags); + trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The From b573c73101d8786446535b2ab28cbc8907bda9a9 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:27 +0100 Subject: [PATCH 1001/1562] tracing/timers: Add tracepoint for tracking timer base is_idle flag When debugging timer code the timer tracepoints are very important. There is no tracepoint when the is_idle flag of the timer base changes. Instead of always adding manually trace_printk(), add tracepoints which can be easily enabled whenever required. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-6-anna-maria@linutronix.de --- include/trace/events/timer.h | 20 ++++++++++++++++++++ kernel/time/timer.c | 14 +++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 99ada928d445..1ef58a04fc57 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel, TP_ARGS(timer) ); +TRACE_EVENT(timer_base_idle, + + TP_PROTO(bool is_idle, unsigned int cpu), + + TP_ARGS(is_idle, cpu), + + TP_STRUCT__entry( + __field( bool, is_idle ) + __field( unsigned int, cpu ) + ), + + TP_fast_assign( + __entry->is_idle = is_idle; + __entry->cpu = cpu; + ), + + TP_printk("is_idle=%d cpu=%d", + __entry->is_idle, __entry->cpu) +); + #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a81d793a43d0..ed8d6063d9ef 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1950,7 +1950,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (time_before_eq(nextevt, basej)) { expires = basem; - base->is_idle = false; + if (base->is_idle) { + base->is_idle = false; + trace_timer_base_idle(false, base->cpu); + } } else { if (base->timers_pending) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; @@ -1961,8 +1964,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC && !base->is_idle) { base->is_idle = true; + trace_timer_base_idle(true, base->cpu); + } } raw_spin_unlock(&base->lock); @@ -1984,7 +1989,10 @@ void timer_clear_idle(void) * sending the IPI a few instructions smaller for the cost of taking * the lock in the exit from idle path. */ - base->is_idle = false; + if (base->is_idle) { + base->is_idle = false; + trace_timer_base_idle(false, smp_processor_id()); + } } #endif From d124c3393e798b1fb142ee728d5c8976d11e722d Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:28 +0100 Subject: [PATCH 1002/1562] timers: Do not IPI for deferrable timers Deferrable timers do not prevent CPU from going idle and are not taken into account on idle path. Sending an IPI to a remote CPU when a new first deferrable timer was enqueued will wake up the remote CPU but nothing will be done regarding the deferrable timers. Drop IPI completely when a new first deferrable timer was enqueued. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-7-anna-maria@linutronix.de --- kernel/time/timer.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ed8d6063d9ef..91882059bf3d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!is_timers_nohz_active()) - return; - /* - * TODO: This wants some optimizing similar to the code below, but we - * will do that when we switch from push to pull for deferrable timers. + * Deferrable timers do not prevent the CPU from entering dynticks and + * are not taken into account on the idle/nohz_full path. An IPI when a + * new deferrable timer is enqueued will wake up the remote CPU but + * nothing will be done with the deferrable timer base. Therefore skip + * the remote IPI for deferrable timers completely. */ - if (timer->flags & TIMER_DEFERRABLE) { - if (tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; - } /* * We might have to IPI the remote CPU if the base is idle and the From b5e6f59888c7bde3c05f61b3ce06b78a86713fc0 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:29 +0100 Subject: [PATCH 1003/1562] timers: Move store of next event into __next_timer_interrupt() Both call sites of __next_timer_interrupt() store the return value directly in base->next_expiry. Move the store into __next_timer_interrupt() and to make its purpose more clear, rename the function to next_expiry_recalc(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-8-anna-maria@linutronix.de --- kernel/time/timer.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 91882059bf3d..490ff8e66fc2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1800,8 +1800,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. + * + * Store next expiry time in base->next_expiry. */ -static unsigned long __next_timer_interrupt(struct timer_base *base) +static void next_expiry_recalc(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@ -1867,10 +1869,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk += adj; } + base->next_expiry = next; base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); - - return next; } #ifdef CONFIG_NO_HZ_COMMON @@ -1930,7 +1931,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) raw_spin_lock(&base->lock); if (base->next_expiry_recalc) - base->next_expiry = __next_timer_interrupt(base); + next_expiry_recalc(base); nextevt = base->next_expiry; /* @@ -2021,7 +2022,7 @@ static inline void __run_timers(struct timer_base *base) WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); base->clk++; - base->next_expiry = __next_timer_interrupt(base); + next_expiry_recalc(base); while (levels--) expire_timers(base, heads + levels); From 8a2c9c7e7848d7f63d38b698209148b5bb4ba7f3 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:30 +0100 Subject: [PATCH 1004/1562] timers: Clarify check in forward_timer_base() The current check whether a forward of the timer base is required can be simplified by using an already existing comparison function which is easier to read. The related comment is outdated and was not updated when the check changed in commit 36cd28a4cdd0 ("timers: Lower base clock forwarding threshold"). Use time_before_eq() for the check and replace the comment by copying the comment from the same check inside get_next_timer_interrupt(). Move the precious information of the outdated comment to the proper place in __run_timers(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-9-anna-maria@linutronix.de --- kernel/time/timer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 490ff8e66fc2..f75f932b128e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,11 +944,10 @@ static inline void forward_timer_base(struct timer_base *base) unsigned long jnow = READ_ONCE(jiffies); /* - * No need to forward if we are close enough below jiffies. - * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jiffies. + * Check whether we can forward the base. We can only do that when + * @basej is past base->clk otherwise we might rewind base->clk. */ - if ((long)(jnow - base->clk) < 1) + if (time_before_eq(jnow, base->clk)) return; /* @@ -2021,6 +2020,10 @@ static inline void __run_timers(struct timer_base *base) */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); + /* + * While executing timers, base->clk is set 1 offset ahead of + * jiffies to avoid endless requeuing to current jiffies. + */ base->clk++; next_expiry_recalc(base); From 1e490484aa3af42d4eeffabf96d6a02be69d586b Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:31 +0100 Subject: [PATCH 1005/1562] timers: Split out forward timer base functionality Forwarding timer base is done when the next expiry value is calculated and when a new timer is enqueued. When the next expiry value is calculated the jiffies value is already available and does not need to be reread a second time. Splitting out the forward timer base functionality to make it executable via both contextes - those where jiffies are already known and those, where jiffies need to be read. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-10-anna-maria@linutronix.de --- kernel/time/timer.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f75f932b128e..5b02e169ab23 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -939,30 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags) return get_timer_this_cpu_base(tflags); } -static inline void forward_timer_base(struct timer_base *base) +static inline void __forward_timer_base(struct timer_base *base, + unsigned long basej) { - unsigned long jnow = READ_ONCE(jiffies); - /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ - if (time_before_eq(jnow, base->clk)) + if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) { - base->clk = jnow; + if (time_after(base->next_expiry, basej)) { + base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } + } +static inline void forward_timer_base(struct timer_base *base) +{ + __forward_timer_base(base, READ_ONCE(jiffies)); +} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means From 7a39a5080ef0e3cf233d92165f6a778f08a08244 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:32 +0100 Subject: [PATCH 1006/1562] timers: Use already existing function for forwarding timer base There is an already existing function for forwarding the timer base. Forwarding the timer base is implemented directly in get_next_timer_interrupt() as well. Remove the code duplication and invoke __forward_timer_base() instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-11-anna-maria@linutronix.de --- kernel/time/timer.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5b02e169ab23..1a73d396101b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1939,15 +1939,9 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) /* * We have a fresh next event. Check whether we can forward the - * base. We can only do that when @basej is past base->clk - * otherwise we might rewind base->clk. + * base. */ - if (time_after(basej, base->clk)) { - if (time_after(nextevt, basej)) - base->clk = basej; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; - } + __forward_timer_base(base, basej); if (time_before_eq(nextevt, basej)) { expires = basem; From bb8caad5083f8fbba70faf41f1d3bab7cf09da6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Dec 2023 10:26:33 +0100 Subject: [PATCH 1007/1562] timers: Rework idle logic To improve readability of the code, split base->idle calculation and expires calculation into separate parts. While at it, update the comment about timer base idle marking. Thereby the following subtle change happens if the next event is just one jiffy ahead and the tick was already stopped: Originally base->is_idle remains true in this situation. Now base->is_idle turns to false. This may spare an IPI if a timer is enqueued remotely to an idle CPU that is going to tick on the next jiffy. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-12-anna-maria@linutronix.de --- kernel/time/timer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1a73d396101b..cf51655add64 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1924,6 +1924,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool was_idle; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1943,27 +1944,26 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) */ __forward_timer_base(base, basej); - if (time_before_eq(nextevt, basej)) { - expires = basem; - if (base->is_idle) { - base->is_idle = false; - trace_timer_base_idle(false, base->cpu); - } - } else { - if (base->timers_pending) - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; - /* - * If we expect to sleep more than a tick, mark the base idle. - * Also the tick is stopped so any added timer must forward - * the base clk itself to keep granularity small. This idle - * logic is only maintained for the BASE_STD base, deferrable - * timers may still see large granularity skew (by design). - */ - if ((expires - basem) > TICK_NSEC && !base->is_idle) { - base->is_idle = true; - trace_timer_base_idle(true, base->cpu); - } + if (base->timers_pending) { + /* If we missed a tick already, force 0 delta */ + if (time_before(nextevt, basej)) + nextevt = basej; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; } + + /* + * Base is idle if the next event is more than a tick away. + * + * If the base is marked idle then any timer add operation must forward + * the base clk itself to keep granularity small. This idle logic is + * only maintained for the BASE_STD base, deferrable timers may still + * see large granularity skew (by design). + */ + was_idle = base->is_idle; + base->is_idle = time_after(nextevt, basej + 1); + if (was_idle != base->is_idle) + trace_timer_base_idle(base->is_idle, base->cpu); + raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); From da65f29dada7f7cbbf0d6375b88a0316f5f7d6f5 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:34 +0100 Subject: [PATCH 1008/1562] timers: Fix nextevt calculation when no timers are pending When no timer is queued into an empty timer base, the next_expiry will not be updated. It was originally calculated as base->clk + NEXT_TIMER_MAX_DELTA When the timer base stays empty long enough (> NEXT_TIMER_MAX_DELTA), the next_expiry value of the empty base suggests that there is a timer pending soon. This might be more a kind of a theoretical problem, but the fix doesn't hurt. Use only base->next_expiry value as nextevt when timers are pending. Otherwise nextevt will be jiffies + NEXT_TIMER_MAX_DELTA. As all information is in place, update base->next_expiry value of the empty timer base as well. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-13-anna-maria@linutronix.de --- kernel/time/timer.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index cf51655add64..352b161113cd 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1922,8 +1922,8 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA; u64 expires = KTIME_MAX; - unsigned long nextevt; bool was_idle; /* @@ -1936,7 +1936,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) raw_spin_lock(&base->lock); if (base->next_expiry_recalc) next_expiry_recalc(base); - nextevt = base->next_expiry; /* * We have a fresh next event. Check whether we can forward the @@ -1945,10 +1944,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) __forward_timer_base(base, basej); if (base->timers_pending) { + nextevt = base->next_expiry; + /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; expires = basem + (u64)(nextevt - basej) * TICK_NSEC; + } else { + /* + * Move next_expiry for the empty base into the future to + * prevent a unnecessary raise of the timer softirq when the + * next_expiry value will be reached even if there is no timer + * pending. + */ + base->next_expiry = nextevt; } /* From 3b201c9af7c0cad2e8311d96c0c1b399606c70fa Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 20 Dec 2023 20:58:19 +0300 Subject: [PATCH 1009/1562] regmap: fix kcalloc() arguments order When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed a bunch of four similar warnings like: drivers/base/regmap/regmap-ram.c: In function '__regmap_init_ram': drivers/base/regmap/regmap-ram.c:68:37: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 68 | data->read = kcalloc(sizeof(bool), config->max_register + 1, | ^~~~ Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Signed-off-by: Dmitry Antipov Link: https://msgid.link/r/20231220175829.533700-1-dmantipov@yandex.ru Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-ram.c | 4 ++-- drivers/base/regmap/regmap-raw-ram.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/regmap/regmap-ram.c b/drivers/base/regmap/regmap-ram.c index 85f34a5dee04..192d6b131dff 100644 --- a/drivers/base/regmap/regmap-ram.c +++ b/drivers/base/regmap/regmap-ram.c @@ -65,12 +65,12 @@ struct regmap *__regmap_init_ram(const struct regmap_config *config, return ERR_PTR(-EINVAL); } - data->read = kcalloc(sizeof(bool), config->max_register + 1, + data->read = kcalloc(config->max_register + 1, sizeof(bool), GFP_KERNEL); if (!data->read) return ERR_PTR(-ENOMEM); - data->written = kcalloc(sizeof(bool), config->max_register + 1, + data->written = kcalloc(config->max_register + 1, sizeof(bool), GFP_KERNEL); if (!data->written) return ERR_PTR(-ENOMEM); diff --git a/drivers/base/regmap/regmap-raw-ram.c b/drivers/base/regmap/regmap-raw-ram.c index 463adafa9532..93ae07b503fd 100644 --- a/drivers/base/regmap/regmap-raw-ram.c +++ b/drivers/base/regmap/regmap-raw-ram.c @@ -122,12 +122,12 @@ struct regmap *__regmap_init_raw_ram(const struct regmap_config *config, return ERR_PTR(-EINVAL); } - data->read = kcalloc(sizeof(bool), config->max_register + 1, + data->read = kcalloc(config->max_register + 1, sizeof(bool), GFP_KERNEL); if (!data->read) return ERR_PTR(-ENOMEM); - data->written = kcalloc(sizeof(bool), config->max_register + 1, + data->written = kcalloc(config->max_register + 1, sizeof(bool), GFP_KERNEL); if (!data->written) return ERR_PTR(-ENOMEM); From e0f4bd26e29bf6162cdc9dc6fb7522bde7b74d07 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 20 Dec 2023 08:35:35 +0800 Subject: [PATCH 1010/1562] PM: sleep: Remove obsolete comment from unlock_system_sleep() With the freezer changes introduced by commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), the comment in unlock_system_sleep() has become obsolete, there is no need to retain it. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/power/main.c b/kernel/power/main.c index f6425ae3e8b0..b1ae9b677d03 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { - /* - * Don't use freezer_count() because we don't want the call to - * try_to_freeze() here. - * - * Reason: - * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the - * system_transition_mutex lock, since the freezer always works with - * system_transition_mutex held. - * - * More importantly, in the case of hibernation, - * unlock_system_sleep() gets called in snapshot_read() and - * snapshot_write() when the freezing condition is still in effect. - * Which means, if we use try_to_freeze() here, it would make them - * enter the refrigerator, thus causing hibernation to lockup. - */ if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); From dadce3fbaf10250b35d540caff475ff93b259de0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 22:02:46 -0800 Subject: [PATCH 1011/1562] PM: hibernate: Repair excess function parameter description warning Function swsusp_close() does not have any parameters, so remove the description of parameter @exclusive to prevent this warning. swap.c:1573: warning: Excess function parameter 'exclusive' description in 'swsusp_close' Signed-off-by: Randy Dunlap [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 975e7195573b..6053ddddaf65 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1566,7 +1566,6 @@ put: /** * swsusp_close - close resume device. - * @exclusive: Close the resume device which is exclusively opened. */ void swsusp_close(void) From 4710642807ac46942b08e9bcc39ae6fd91e947fa Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 20 Dec 2023 16:38:21 +0100 Subject: [PATCH 1012/1562] gfs2: Minor gfs2_ail1_empty cleanup Change gfs2_ail1_empty() to return %true when the ail1 list is empty. Based on that, make the loop in empty_ail1_list() more obvious. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e5271ae87d1c..61cd52a579d9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -352,14 +352,15 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, * @sdp: The superblock * @max_revokes: If non-zero, add revokes where appropriate * - * Tries to empty the ail1 lists, starting with the oldest first + * Tries to empty the ail1 lists, starting with the oldest first. + * Returns %true if the ail1 list is now empty. */ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) +static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) { struct gfs2_trans *tr, *s; int oldest_tr = 1; - int ret; + bool empty; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { @@ -369,7 +370,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) oldest_tr = 0; } gfs2_log_update_flush_tail(sdp); - ret = list_empty(&sdp->sd_ail1_list); + empty = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) { @@ -377,7 +378,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) gfs2_withdraw(sdp); } - return ret; + return empty; } static void gfs2_ail1_wait(struct gfs2_sbd *sdp) @@ -974,8 +975,9 @@ void gfs2_ail_drain(struct gfs2_sbd *sdp) static void empty_ail1_list(struct gfs2_sbd *sdp) { unsigned long start = jiffies; + bool empty = false; - for (;;) { + while (!empty) { if (time_after(jiffies, start + (HZ * 600))) { fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", __func__, current->journal_info ? 1 : 0); @@ -984,8 +986,7 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp, 0)) - return; + empty = gfs2_ail1_empty(sdp, 0); } } From 015af1af44003fff797f8632e940824c07d282bf Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 20 Dec 2023 17:05:26 +0100 Subject: [PATCH 1013/1562] gfs2: Mark withdraws as unlikely Mark the gfs2_withdrawn(), gfs2_withdrawing(), and gfs2_withdraw_in_prog() inline functions as likely to return %false. This allows to get rid of likely() and unlikely() annotations at the call sites of those functions. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 4 ++-- fs/gfs2/meta_io.c | 6 +++--- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/super.c | 2 +- fs/gfs2/trans.c | 2 +- fs/gfs2/util.h | 10 +++++----- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9611bfceda4b..4482a5a9bdc9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -462,7 +462,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) error = mpage_read_folio(folio, gfs2_block_map); } - if (unlikely(gfs2_withdrawn(sdp))) + if (gfs2_withdrawn(sdp)) return -EIO; return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4b66efc1a82a..03902e780935 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,7 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (unlikely(gfs2_withdrawn(sdp))) { + if (gfs2_withdrawn(sdp)) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2cb65f76eec8..b71dd7c8f65e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -156,7 +156,7 @@ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (likely(!gfs2_withdrawn(sdp))) + if (!gfs2_withdrawn(sdp)) return false; if (gl->gl_ops->go_flags & GLOF_NONDISK) return false; @@ -774,7 +774,7 @@ skip_inval: * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. */ - if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) + if (unlikely(sdp->sd_log_error) && !gfs2_withdrawn(sdp)) gfs2_withdraw_delayed(sdp); if (glock_blocked_by_withdraw(gl) && (target != LM_ST_UNLOCKED || diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 25ceb0805df2..299ae67ae85e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -252,7 +252,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) { + if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) { *bhp = NULL; return -EIO; } @@ -310,7 +310,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) return -EIO; wait_on_buffer(bh); @@ -321,7 +321,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..c6ec08909c69 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1073,7 +1073,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) + if (!gfs2_withdrawn(sdp) && lm->lm_unmount) lm->lm_unmount(sdp); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b5c75c8a8d62..85c77dd327ec 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -499,7 +499,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } - if (unlikely(gfs2_withdrawn(sdp))) + if (gfs2_withdrawn(sdp)) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 7e835be7032d..1487fbb62d84 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -268,7 +268,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (unlikely(gfs2_withdrawn(sdp))) { + if (gfs2_withdrawn(sdp)) { fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); goto out_unlock; diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 11c9d59b6889..76acf0b39814 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -203,8 +203,8 @@ static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp) */ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || - test_bit(SDF_WITHDRAWING, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || + test_bit(SDF_WITHDRAWING, &sdp->sd_flags)); } /** @@ -213,13 +213,13 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) */ static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && - !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && + !test_bit(SDF_WITHDRAWN, &sdp->sd_flags)); } static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags)); } #define gfs2_tune_get(sdp, field) \ From 4d927b03a68846e4e791ccde6b4c274df02f11e9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 20 Dec 2023 17:16:29 +0100 Subject: [PATCH 1014/1562] gfs2: Rename gfs2_withdrawn to gfs2_withdrawing_or_withdrawn This function checks whether the filesystem has been been marked to be withdrawn eventually or has been withdrawn already. Rename this function to avoid confusing code like checking for gfs2_withdrawing() when gfs2_withdrawn() has already returned true. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 8 ++++---- fs/gfs2/glops.c | 2 +- fs/gfs2/lock_dlm.c | 8 ++++---- fs/gfs2/log.c | 21 +++++++++++---------- fs/gfs2/meta_io.c | 9 ++++++--- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/quota.c | 8 ++++---- fs/gfs2/recovery.c | 2 +- fs/gfs2/super.c | 10 +++++----- fs/gfs2/sys.c | 2 +- fs/gfs2/trans.c | 2 +- fs/gfs2/util.c | 4 ++-- fs/gfs2/util.h | 5 +++-- 15 files changed, 46 insertions(+), 41 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4482a5a9bdc9..d551b9c94935 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -462,7 +462,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) error = mpage_read_folio(folio, gfs2_block_map); } - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 03902e780935..992ca4effb50 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,7 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b71dd7c8f65e..45f5c88d2622 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -156,7 +156,7 @@ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (!gfs2_withdrawn(sdp)) + if (!gfs2_withdrawing_or_withdrawn(sdp)) return false; if (gl->gl_ops->go_flags & GLOF_NONDISK) return false; @@ -278,7 +278,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); - if (!gfs2_withdrawn(sdp)) + if (!gfs2_withdrawing_or_withdrawn(sdp)) GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); @@ -774,7 +774,7 @@ skip_inval: * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. */ - if (unlikely(sdp->sd_log_error) && !gfs2_withdrawn(sdp)) + if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp)) gfs2_withdraw_delayed(sdp); if (glock_blocked_by_withdraw(gl) && (target != LM_ST_UNLOCKED || @@ -811,7 +811,7 @@ skip_inval: gfs2_glock_queue_work(gl, 0); } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); + GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 15d0e653fd2b..45653cbc8a87 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -174,7 +174,7 @@ static int gfs2_rgrp_metasync(struct gfs2_glock *gl) filemap_fdatawrite_range(metamapping, start, end); error = filemap_fdatawait_range(metamapping, start, end); - WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); + WARN_ON_ONCE(error && !gfs2_withdrawing_or_withdrawn(sdp)); mapping_set_error(metamapping, error); if (error) gfs2_io_error(sdp); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 59ab18c79889..d1ac5d0679ea 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1122,7 +1122,7 @@ static void gdlm_recover_prep(void *arg) struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_prep ignored due to withdraw.\n"); return; } @@ -1148,7 +1148,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int jid = slot->slot - 1; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n", jid); return; @@ -1177,7 +1177,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_done ignored due to withdraw.\n"); return; } @@ -1208,7 +1208,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n", jid); return; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 61cd52a579d9..8691839104b7 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -126,7 +126,7 @@ __acquires(&sdp->sd_ail_lock) } } - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { gfs2_remove_from_ail(bd); continue; } @@ -842,7 +842,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, struct super_block *sb = sdp->sd_vfs; u64 dblock; - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; page = mempool_alloc(gfs2_page_pool, GFP_NOIO); @@ -1048,7 +1048,8 @@ repeat: * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() */ - if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + if (gfs2_withdrawing_or_withdrawn(sdp) || + !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) goto out; /* Log might have been flushed while we waited for the flush lock */ @@ -1097,13 +1098,13 @@ repeat: goto out_withdraw; gfs2_ordered_write(sdp); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; lops_before_commit(sdp, tr); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { @@ -1111,7 +1112,7 @@ repeat: } else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) { log_write_header(sdp, flags); } - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; lops_after_commit(sdp, tr); @@ -1129,7 +1130,7 @@ repeat: if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { if (!sdp->sd_log_idle) { empty_ail1_list(sdp); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; log_write_header(sdp, flags); } @@ -1299,7 +1300,7 @@ int gfs2_logd(void *data) unsigned long t = 1; while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) break; /* Check for errors writing to the journal */ @@ -1338,7 +1339,7 @@ int gfs2_logd(void *data) gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp) || sdp->sd_log_error || - gfs2_withdrawn(sdp) || + gfs2_withdrawing_or_withdrawn(sdp) || kthread_should_stop(), t); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 299ae67ae85e..f814054c8cd0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -252,7 +252,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) { *bhp = NULL; return -EIO; } @@ -310,7 +311,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) return -EIO; wait_on_buffer(bh); @@ -321,7 +323,8 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (gfs2_withdrawn(sdp) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c6ec08909c69..9c6f1a8fb5fb 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1073,7 +1073,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (!gfs2_withdrawn(sdp) && lm->lm_unmount) + if (!gfs2_withdrawing_or_withdrawn(sdp) && lm->lm_unmount) lm->lm_unmount(sdp); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f139ce8cf5ce..9ade69f8d338 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -128,7 +128,7 @@ static void gfs2_qd_dispose(struct gfs2_quota_data *qd) hlist_bl_del_rcu(&qd->qd_hlist); spin_unlock_bucket(qd->qd_hash); - if (!gfs2_withdrawn(sdp)) { + if (!gfs2_withdrawing_or_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -1540,7 +1540,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!gfs2_withdrawn(sdp)) { + if (!gfs2_withdrawing_or_withdrawn(sdp)) { if (!cmpxchg(&sdp->sd_log_error, 0, error)) fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); wake_up(&sdp->sd_logd_waitq); @@ -1584,7 +1584,7 @@ int gfs2_quotad(void *data) unsigned long t = 0; while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) break; /* Update the master statfs file */ @@ -1608,7 +1608,7 @@ int gfs2_quotad(void *data) t = wait_event_interruptible_timeout(sdp->sd_quota_wait, sdp->sd_statfs_force_sync || - gfs2_withdrawn(sdp) || + gfs2_withdrawing_or_withdrawn(sdp) || kthread_should_stop(), t); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 5aae02669a40..f4fe7039f725 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -411,7 +411,7 @@ void gfs2_recover_func(struct work_struct *work) int error = 0; int jlocked = 0; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", jd->jd_jid); goto fail; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 85c77dd327ec..cf3431486fd4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -134,7 +134,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) int error; j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); @@ -153,7 +153,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (!error && gfs2_withdrawn(sdp)) + if (!error && gfs2_withdrawing_or_withdrawn(sdp)) error = -EIO; if (!error) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); @@ -499,7 +499,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -605,7 +605,7 @@ restart: if (!sb_rdonly(sb)) gfs2_make_fs_ro(sdp); else { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) gfs2_destroy_threads(sdp); gfs2_quota_cleanup(sdp); @@ -685,7 +685,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | GFS2_LFC_FREEZE_GO_SYNC); - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) return error; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 60a0206890c5..250f340cb44d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -193,7 +193,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int b = gfs2_withdrawn(sdp); + unsigned int b = gfs2_withdrawing_or_withdrawn(sdp); return snprintf(buf, PAGE_SIZE, "%u\n", b); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 1487fbb62d84..192213c7359a 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -268,7 +268,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); goto out_unlock; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index da29fafb6272..f52141ce9485 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -372,7 +372,7 @@ void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, const char *function, char *file, unsigned int line, bool delayed) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; fs_err(sdp, @@ -548,7 +548,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; fs_err(sdp, "fatal: I/O error\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 76acf0b39814..ba071998461f 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -198,10 +198,11 @@ static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp) } /** - * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn + * gfs2_withdrawing_or_withdrawn - test whether the file system is withdrawing + * or withdrawn * @sdp: the superblock */ -static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) +static inline bool gfs2_withdrawing_or_withdrawn(struct gfs2_sbd *sdp) { return unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || test_bit(SDF_WITHDRAWING, &sdp->sd_flags)); From e0f1f021782d6a2e719a451218554a8198c77120 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 20 Dec 2023 18:09:22 +0100 Subject: [PATCH 1015/1562] gfs2: Lift withdraw check out of gfs2_ail1_empty Lift the check for the SDF_WITHDRAWING flag out of gfs2_ail1_empty() and into its callers. This is needed so that gfs2_flush_revokes() can drop the sd_log_lock spinlock before triggering a withdraw if necessary. Instead of checking for the SDF_WITHDRAWING flag, use gfs2_withdrawing(). Also, the low-level code triggering the delayed withdraw reports when there is a problem, so there is no need to report that again. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 8691839104b7..fdef6bc77c54 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -373,11 +373,6 @@ static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) empty = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); - if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) { - gfs2_lm(sdp, "fatal: I/O error(s)\n"); - gfs2_withdraw(sdp); - } - return empty; } @@ -815,6 +810,9 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp) gfs2_log_lock(sdp); gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); + + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); } /** @@ -987,7 +985,13 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); empty = gfs2_ail1_empty(sdp, 0); + + if (gfs2_withdrawing_or_withdrawn(sdp)) + break; } + + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); } /** @@ -1344,6 +1348,9 @@ int gfs2_logd(void *data) t); } + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + return 0; } From ff7a85af5a5bdda04756a8cdbdc0dd9a7a8ea468 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 6 Dec 2023 19:58:06 +0000 Subject: [PATCH 1016/1562] gfs2: Remove use of error flag in journal reads Conventionally, we use the uptodate bit to signal whether a read encountered an error or not. Use folio_end_read() to set the uptodate bit on success. Also use filemap_set_wb_err() to communicate the errno instead of the more heavy-weight mapping_set_error(). Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andreas Gruenbacher --- fs/gfs2/lops.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 483f69807062..314ec2a70167 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -391,22 +391,15 @@ static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) * Simply unlock the pages in the bio. The main thread will wait on them and * process them in order as necessary. */ - static void gfs2_end_log_read(struct bio *bio) { - struct page *page; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); + bio_for_each_folio_all(fi, bio) { + /* We're abusing wb_err to get the error to gfs2_find_jhead */ + filemap_set_wb_err(fi.folio->mapping, error); + folio_end_read(fi.folio, !error); } bio_put(bio); @@ -475,7 +468,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, folio = filemap_get_folio(jd->jd_inode->i_mapping, index); folio_wait_locked(folio); - if (folio_test_error(folio)) + if (!folio_test_uptodate(folio)) *done = true; if (!*done) From 367e753d5c54a414d82610eb709fe71fda6cf1c3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2023 14:38:58 -0500 Subject: [PATCH 1017/1562] dlm: fix format seq ops type 4 This patch fixes to set the type 4 format ops in case of table_open4(). It got accidentially changed by commit 541adb0d4d10 ("fs: dlm: debugfs for queued callbacks") and since them toss debug dumps the same format as format 5 that are the queued ast callbacks for lkbs. Fixes: 541adb0d4d10 ("fs: dlm: debugfs for queued callbacks") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 42f332f46359..c587bfadeff4 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -748,7 +748,7 @@ static int table_open4(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &format5_seq_ops); + ret = seq_open(file, &format4_seq_ops); if (ret) return ret; From 5beebc1dda47719dac85830c53bca1a0ab497d96 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2023 14:38:59 -0500 Subject: [PATCH 1018/1562] dlm: update format header reflect current format Over the time the dlm debugfs format string has been changed but the header wasn't updated. This patch changes the first line dump header and their meaning to reflect the current formats. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index c587bfadeff4..4fa11d9ddbb6 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -443,14 +443,14 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) break; case 3: if (ri->header) { - seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + seq_puts(seq, "rsb ptr nodeid first_lkid flags !root_list_empty !recover_list_empty recover_locks_count len\n"); ri->header = 0; } print_format3(ri->rsb, seq); break; case 4: if (ri->header) { - seq_puts(seq, "version 4 rsb 2\n"); + seq_puts(seq, "rsb ptr nodeid master_nodeid dir_nodeid our_nodeid toss_time flags len str|hex name\n"); ri->header = 0; } print_format4(ri->rsb, seq); From 5143eecd2af2b5424f7b96d53f17bb4718e46bd3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 13 Dec 2023 12:59:49 -0800 Subject: [PATCH 1019/1562] lib/maple_tree.c: fix build error due to hotfix alteration Commit 0de56e38b307 ("maple_tree: use maple state end for write operations") was broken by a later patch "maple_tree: do not preallocate nodes for slot stores". But the later patch was scheduled ahead of 0de56e38b307, for 6.7-rc. This fixlet undoes the damage. Fixes: 0de56e38b307 ("maple_tree: use maple state end for write operations") Cc: Liam R. Howlett Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d30815cbab80..4a69148963e0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5477,7 +5477,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) node_size = mas_wr_new_end(&wr_mas); /* Slot store, does not require additional nodes */ - if (node_size == wr_mas.node_end) { + if (node_size == mas->end) { /* reuse node */ if (!mt_in_rcu(mas->tree)) return 0; From 4a3bfbd1699e2306731809d50d480634012ed4de Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Dec 2023 07:32:58 -0800 Subject: [PATCH 1020/1562] mm/list_lru.c: remove unused list_lru_from_kmem() Fixes: 0a97c01cd20bb ("list_lru: allow explicit memcg and NUMA node selection) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312141318.q8b5yrAq-lkp@intel.com/ Cc: Nhat Pham Cc: Johannes Weiner Cc: Johannes Weiner Cc: Bagas Sanjaya Signed-off-by: Andrew Morton --- mm/list_lru.c | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index fcca67ac26ec..35b0147542a9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -59,28 +59,6 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } return &lru->node[nid].lru; } - -static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, - struct mem_cgroup **memcg_ptr) -{ - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l = &nlru->lru; - struct mem_cgroup *memcg = NULL; - - if (!list_lru_memcg_aware(lru)) - goto out; - - memcg = mem_cgroup_from_slab_obj(ptr); - if (!memcg) - goto out; - - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); -out: - if (memcg_ptr) - *memcg_ptr = memcg; - return l; -} #else static void list_lru_register(struct list_lru *lru) { @@ -105,15 +83,6 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } - -static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, - struct mem_cgroup **memcg_ptr) -{ - if (memcg_ptr) - *memcg_ptr = NULL; - return &lru->node[nid].lru; -} #endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, From 508bed884767a8eb394640bae9edcdf082816c43 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:49 +0000 Subject: [PATCH 1021/1562] mm: memcg: change flush_next_time to flush_last_time Patch series "mm: memcg: subtree stats flushing and thresholds", v4. This series attempts to address shortages in today's approach for memcg stats flushing, namely occasionally stale or expensive stat reads. The series does so by changing the threshold that we use to decide whether to trigger a flush to be per memcg instead of global (patch 3), and then changing flushing to be per memcg (i.e. subtree flushes) instead of global (patch 5). This patch (of 5): flush_next_time is an inaccurate name. It's not the next time that periodic flushing will happen, it's rather the next time that ratelimited flushing can happen if the periodic flusher is late. Simplify its semantics by just storing the timestamp of the last flush instead, flush_last_time. Move the 2*FLUSH_TIME addition to mem_cgroup_flush_stats_ratelimited(), and add a comment explaining it. This way, all the ratelimiting semantics live in one place. No functional change intended. Link: https://lkml.kernel.org/r/20231129032154.3710765-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20231129032154.3710765-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Acked-by: Chris Li (Google) Tested-by: Bagas Sanjaya Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memcontrol.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69b0ad455242..0dc53e0b5e5c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -593,7 +593,7 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static u64 flush_next_time; +static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) @@ -653,7 +653,7 @@ static void do_flush_stats(void) atomic_xchg(&stats_flush_ongoing, 1)) return; - WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); + WRITE_ONCE(flush_last_time, jiffies_64); cgroup_rstat_flush(root_mem_cgroup->css.cgroup); @@ -669,7 +669,8 @@ void mem_cgroup_flush_stats(void) void mem_cgroup_flush_stats_ratelimited(void) { - if (time_after64(jiffies_64, READ_ONCE(flush_next_time))) + /* Only flush if the periodic flusher is one full cycle late */ + if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) mem_cgroup_flush_stats(); } From e0bf1dc859fdd08ef738824710770a30a8069433 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:50 +0000 Subject: [PATCH 1022/1562] mm: memcg: move vmstats structs definition above flushing code The following patch will make use of those structs in the flushing code, so move their definitions (and a few other dependencies) a little bit up to reduce the diff noise in the following patch. No functional change intended. Link: https://lkml.kernel.org/r/20231129032154.3710765-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memcontrol.c | 148 ++++++++++++++++++++++++------------------------ 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0dc53e0b5e5c..03c503def835 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -573,6 +573,80 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, + ZSWPWB, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, + THP_SWPOUT, + THP_SWPOUT_FALLBACK, +#endif +}; + +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_MEMCG_EVENTS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Non-hierarchical (CPU aggregated) page state & events */ + long state_local[MEMCG_NR_STAT]; + unsigned long events_local[NR_MEMCG_EVENTS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_MEMCG_EVENTS]; +}; + /* * memcg and lruvec stats flushing * @@ -684,80 +758,6 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGPGIN, - PGPGOUT, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, - ZSWPWB, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, - THP_SWPOUT, - THP_SWPOUT_FALLBACK, -#endif -}; - -#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; - -static void init_memcg_events(void) -{ - int i; - - for (i = 0; i < NR_MEMCG_EVENTS; ++i) - mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; -} - -static inline int memcg_events_index(enum vm_event_item idx) -{ - return mem_cgroup_events_index[idx] - 1; -} - -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_MEMCG_EVENTS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_MEMCG_EVENTS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_MEMCG_EVENTS]; - - /* Non-hierarchical (CPU aggregated) page state & events */ - long state_local[MEMCG_NR_STAT]; - unsigned long events_local[NR_MEMCG_EVENTS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_MEMCG_EVENTS]; -}; - unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { long x = READ_ONCE(memcg->vmstats->state[idx]); From 8d59d2214c2362e7a9d185d80b613e632581af7b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:51 +0000 Subject: [PATCH 1023/1562] mm: memcg: make stats flushing threshold per-memcg A global counter for the magnitude of memcg stats update is maintained on the memcg side to avoid invoking rstat flushes when the pending updates are not significant. This avoids unnecessary flushes, which are not very cheap even if there isn't a lot of stats to flush. It also avoids unnecessary lock contention on the underlying global rstat lock. Make this threshold per-memcg. The scheme is followed where percpu (now also per-memcg) counters are incremented in the update path, and only propagated to per-memcg atomics when they exceed a certain threshold. This provides two benefits: (a) On large machines with a lot of memcgs, the global threshold can be reached relatively fast, so guarding the underlying lock becomes less effective. Making the threshold per-memcg avoids this. (b) Having a global threshold makes it hard to do subtree flushes, as we cannot reset the global counter except for a full flush. Per-memcg counters removes this as a blocker from doing subtree flushes, which helps avoid unnecessary work when the stats of a small subtree are needed. Nothing is free, of course. This comes at a cost: (a) A new per-cpu counter per memcg, consuming NR_CPUS * NR_MEMCGS * 4 bytes. The extra memory usage is insigificant. (b) More work on the update side, although in the common case it will only be percpu counter updates. The amount of work scales with the number of ancestors (i.e. tree depth). This is not a new concept, adding a cgroup to the rstat tree involves a parent loop, so is charging. Testing results below show no significant regressions. (c) The error margin in the stats for the system as a whole increases from NR_CPUS * MEMCG_CHARGE_BATCH to NR_CPUS * MEMCG_CHARGE_BATCH * NR_MEMCGS. This is probably fine because we have a similar per-memcg error in charges coming from percpu stocks, and we have a periodic flusher that makes sure we always flush all the stats every 2s anyway. This patch was tested to make sure no significant regressions are introduced on the update path as follows. The following benchmarks were ran in a cgroup that is 2 levels deep (/sys/fs/cgroup/a/b/): (1) Running 22 instances of netperf on a 44 cpu machine with hyperthreading disabled. All instances are run in a level 2 cgroup, as well as netserver: # netserver -6 # netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K Averaging 20 runs, the numbers are as follows: Base: 40198.0 mbps Patched: 38629.7 mbps (-3.9%) The regression is minimal, especially for 22 instances in the same cgroup sharing all ancestors (so updating the same atomics). (2) will-it-scale page_fault tests. These tests (specifically per_process_ops in page_fault3 test) detected a 25.9% regression before for a change in the stats update path [1]. These are the numbers from 10 runs (+ is good) on a machine with 256 cpus: LABEL | MEAN | MEDIAN | STDDEV | ------------------------------+-------------+-------------+------------- page_fault1_per_process_ops | | | | (A) base | 270249.164 | 265437.000 | 13451.836 | (B) patched | 261368.709 | 255725.000 | 13394.767 | | -3.29% | -3.66% | | page_fault1_per_thread_ops | | | | (A) base | 242111.345 | 239737.000 | 10026.031 | (B) patched | 237057.109 | 235305.000 | 9769.687 | | -2.09% | -1.85% | | page_fault1_scalability | | | (A) base | 0.034387 | 0.035168 | 0.0018283 | (B) patched | 0.033988 | 0.034573 | 0.0018056 | | -1.16% | -1.69% | | page_fault2_per_process_ops | | | (A) base | 203561.836 | 203301.000 | 2550.764 | (B) patched | 197195.945 | 197746.000 | 2264.263 | | -3.13% | -2.73% | | page_fault2_per_thread_ops | | | (A) base | 171046.473 | 170776.000 | 1509.679 | (B) patched | 166626.327 | 166406.000 | 768.753 | | -2.58% | -2.56% | | page_fault2_scalability | | | (A) base | 0.054026 | 0.053821 | 0.00062121 | (B) patched | 0.053329 | 0.05306 | 0.00048394 | | -1.29% | -1.41% | | page_fault3_per_process_ops | | | (A) base | 1295807.782 | 1297550.000 | 5907.585 | (B) patched | 1275579.873 | 1273359.000 | 8759.160 | | -1.56% | -1.86% | | page_fault3_per_thread_ops | | | (A) base | 391234.164 | 390860.000 | 1760.720 | (B) patched | 377231.273 | 376369.000 | 1874.971 | | -3.58% | -3.71% | | page_fault3_scalability | | | (A) base | 0.60369 | 0.60072 | 0.0083029 | (B) patched | 0.61733 | 0.61544 | 0.009855 | | +2.26% | +2.45% | | All regressions seem to be minimal, and within the normal variance for the benchmark. The fix for [1] assumes that 3% is noise -- and there were no further practical complaints), so hopefully this means that such variations in these microbenchmarks do not reflect on practical workloads. (3) I also ran stress-ng in a nested cgroup and did not observe any obvious regressions. [1]https://lore.kernel.org/all/20190520063534.GB19312@shao2-debian/ Link: https://lkml.kernel.org/r/20231129032154.3710765-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Johannes Weiner Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memcontrol.c | 50 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03c503def835..c5aa0c2cb68b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -631,6 +631,9 @@ struct memcg_vmstats_percpu { /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; + + /* Stats updates since the last flush */ + unsigned int stats_updates; }; struct memcg_vmstats { @@ -645,6 +648,9 @@ struct memcg_vmstats { /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; unsigned long events_pending[NR_MEMCG_EVENTS]; + + /* Stats updates since the last flush */ + atomic64_t stats_updates; }; /* @@ -664,9 +670,7 @@ struct memcg_vmstats { */ static void flush_memcg_stats_dwork(struct work_struct *w); static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); -static atomic_t stats_flush_threshold = ATOMIC_INIT(0); static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) @@ -693,26 +697,37 @@ static void memcg_stats_unlock(void) preempt_enable_nested(); } + +static bool memcg_should_flush_stats(struct mem_cgroup *memcg) +{ + return atomic64_read(&memcg->vmstats->stats_updates) > + MEMCG_CHARGE_BATCH * num_online_cpus(); +} + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + int cpu = smp_processor_id(); unsigned int x; if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + cgroup_rstat_updated(memcg->css.cgroup, cpu); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, + abs(val)); + + if (x < MEMCG_CHARGE_BATCH) + continue; - x = __this_cpu_add_return(stats_updates, abs(val)); - if (x > MEMCG_CHARGE_BATCH) { /* - * If stats_flush_threshold exceeds the threshold - * (>num_online_cpus()), cgroup stats update will be triggered - * in __mem_cgroup_flush_stats(). Increasing this var further - * is redundant and simply adds overhead in atomic update. + * If @memcg is already flush-able, increasing stats_updates is + * redundant. Avoid the overhead of the atomic update. */ - if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) - atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); - __this_cpu_write(stats_updates, 0); + if (!memcg_should_flush_stats(memcg)) + atomic64_add(x, &memcg->vmstats->stats_updates); + __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); } } @@ -731,13 +746,12 @@ static void do_flush_stats(void) cgroup_rstat_flush(root_mem_cgroup->css.cgroup); - atomic_set(&stats_flush_threshold, 0); atomic_set(&stats_flush_ongoing, 0); } void mem_cgroup_flush_stats(void) { - if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + if (memcg_should_flush_stats(root_mem_cgroup)) do_flush_stats(); } @@ -751,8 +765,8 @@ void mem_cgroup_flush_stats_ratelimited(void) static void flush_memcg_stats_dwork(struct work_struct *w) { /* - * Always flush here so that flushing in latency-sensitive paths is - * as cheap as possible. + * Deliberately ignore memcg_should_flush_stats() here so that flushing + * in latency-sensitive paths is as cheap as possible. */ do_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); @@ -5788,6 +5802,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } } } + statc->stats_updates = 0; + /* We are in a per-cpu loop here, only do the atomic write once */ + if (atomic64_read(&memcg->vmstats->stats_updates)) + atomic64_set(&memcg->vmstats->stats_updates, 0); } #ifdef CONFIG_MMU From b006847222623ac3cda8589d15379eac86a2bcb7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:52 +0000 Subject: [PATCH 1024/1562] mm: workingset: move the stats flush into workingset_test_recent() The workingset code flushes the stats in workingset_refault() to get accurate stats of the eviction memcg. In preparation for more scoped flushed and passing the eviction memcg to the flush call, move the call to workingset_test_recent() where we have a pointer to the eviction memcg. The flush call is sleepable, and cannot be made in an rcu read section. Hence, minimize the rcu read section by also moving it into workingset_test_recent(). Furthermore, instead of holding the rcu read lock throughout workingset_test_recent(), only hold it briefly to get a ref on the eviction memcg. This allows us to make the flush call after we get the eviction memcg. As for workingset_refault(), nothing else there appears to be protected by rcu. The memcg of the faulted folio (which is not necessarily the same as the eviction memcg) is protected by the folio lock, which is held from all callsites. Add a VM_BUG_ON() to make sure this doesn't change from under us. No functional change intended. Link: https://lkml.kernel.org/r/20231129032154.3710765-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/workingset.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index 3ea2ccb8e57d..6b9871f5a2e8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -425,8 +425,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) struct pglist_data *pgdat; unsigned long eviction; - if (lru_gen_enabled()) - return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); + rcu_read_lock(); + + if (lru_gen_enabled()) { + bool recent = lru_gen_test_recent(shadow, file, + &eviction_lruvec, &eviction, workingset); + + rcu_read_unlock(); + return recent; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -448,8 +456,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !eviction_memcg) + if (!mem_cgroup_disabled() && + (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { + rcu_read_unlock(); return false; + } + + rcu_read_unlock(); + + /* Flush stats (and potentially sleep) outside the RCU read section */ + mem_cgroup_flush_stats_ratelimited(); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -493,6 +509,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) } } + mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } @@ -519,19 +536,16 @@ void workingset_refault(struct folio *folio, void *shadow) return; } - /* Flush stats (and potentially sleep) before holding RCU read lock */ - mem_cgroup_flush_stats_ratelimited(); - - rcu_read_lock(); - /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that - * is actually experiencing the refault event. + * is actually experiencing the refault event. Make sure the folio is + * locked to guarantee folio_memcg() stability throughout. */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); @@ -540,7 +554,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset)) - goto out; + return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); @@ -556,8 +570,6 @@ void workingset_refault(struct folio *folio, void *shadow) lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } -out: - rcu_read_unlock(); } /** From 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:53 +0000 Subject: [PATCH 1025/1562] mm: memcg: restore subtree stats flushing Stats flushing for memcg currently follows the following rules: - Always flush the entire memcg hierarchy (i.e. flush the root). - Only one flusher is allowed at a time. If someone else tries to flush concurrently, they skip and return immediately. - A periodic flusher flushes all the stats every 2 seconds. The reason this approach is followed is because all flushes are serialized by a global rstat spinlock. On the memcg side, flushing is invoked from userspace reads as well as in-kernel flushers (e.g. reclaim, refault, etc). This approach aims to avoid serializing all flushers on the global lock, which can cause a significant performance hit under high concurrency. This approach has the following problems: - Occasionally a userspace read of the stats of a non-root cgroup will be too expensive as it has to flush the entire hierarchy [1]. - Sometimes the stats accuracy are compromised if there is an ongoing flush, and we skip and return before the subtree of interest is actually flushed, yielding stale stats (by up to 2s due to periodic flushing). This is more visible when reading stats from userspace, but can also affect in-kernel flushers. The latter problem is particulary a concern when userspace reads stats after an event occurs, but gets stats from before the event. Examples: - When memory usage / pressure spikes, a userspace OOM handler may look at the stats of different memcgs to select a victim based on various heuristics (e.g. how much private memory will be freed by killing this). Reading stale stats from before the usage spike in this case may cause a wrongful OOM kill. - A proactive reclaimer may read the stats after writing to memory.reclaim to measure the success of the reclaim operation. Stale stats from before reclaim may give a false negative. - Reading the stats of a parent and a child memcg may be inconsistent (child larger than parent), if the flush doesn't happen when the parent is read, but happens when the child is read. As for in-kernel flushers, they will occasionally get stale stats. No regressions are currently known from this, but if there are regressions, they would be very difficult to debug and link to the source of the problem. This patch aims to fix these problems by restoring subtree flushing, and removing the unified/coalesced flushing logic that skips flushing if there is an ongoing flush. This change would introduce a significant regression with global stats flushing thresholds. With per-memcg stats flushing thresholds, this seems to perform really well. The thresholds protect the underlying lock from unnecessary contention. This patch was tested in two ways to ensure the latency of flushing is up to par, on a machine with 384 cpus: - A synthetic test with 5000 concurrent workers in 500 cgroups doing allocations and reclaim, as well as 1000 readers for memory.stat (variation of [2]). No regressions were noticed in the total runtime. Note that significant regressions in this test are observed with global stats thresholds, but not with per-memcg thresholds. - A synthetic stress test for concurrently reading memcg stats while memory allocation/freeing workers are running in the background, provided by Wei Xu [3]. With 250k threads reading the stats every 100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01% of reads take more than 1ms, and no reads take more than 100ms. [1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/ [2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/ [akpm@linux-foundation.org: fix mm/zswap.c] [yosryahmed@google.com: remove stats flushing mutex] Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++--- mm/memcontrol.c | 70 ++++++++++++++++++++++---------------- mm/vmscan.c | 2 +- mm/workingset.c | 10 ++++-- mm/zswap.c | 2 +- 5 files changed, 53 insertions(+), 39 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a308c8eacf20..43b77363ab8e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1051,8 +1051,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } -void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats(struct mem_cgroup *memcg); +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void mem_cgroup_flush_stats(void) +static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5aa0c2cb68b..b08b9cd4a3a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -670,7 +670,6 @@ struct memcg_vmstats { */ static void flush_memcg_stats_dwork(struct work_struct *w); static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) @@ -731,35 +730,40 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(void) +static void do_flush_stats(struct mem_cgroup *memcg) { - /* - * We always flush the entire tree, so concurrent flushers can just - * skip. This avoids a thundering herd problem on the rstat global lock - * from memcg flushers (e.g. reclaim, refault, etc). - */ - if (atomic_read(&stats_flush_ongoing) || - atomic_xchg(&stats_flush_ongoing, 1)) + if (mem_cgroup_is_root(memcg)) + WRITE_ONCE(flush_last_time, jiffies_64); + + cgroup_rstat_flush(memcg->css.cgroup); +} + +/* + * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree + * @memcg: root of the subtree to flush + * + * Flushing is serialized by the underlying global rstat lock. There is also a + * minimum amount of work to be done even if there are no stat updates to flush. + * Hence, we only flush the stats if the updates delta exceeds a threshold. This + * avoids unnecessary work and contention on the underlying lock. + */ +void mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) return; - WRITE_ONCE(flush_last_time, jiffies_64); + if (!memcg) + memcg = root_mem_cgroup; - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); - - atomic_set(&stats_flush_ongoing, 0); + if (memcg_should_flush_stats(memcg)) + do_flush_stats(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (memcg_should_flush_stats(root_mem_cgroup)) - do_flush_stats(); -} - -void mem_cgroup_flush_stats_ratelimited(void) +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { /* Only flush if the periodic flusher is one full cycle late */ if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); } static void flush_memcg_stats_dwork(struct work_struct *w) @@ -768,7 +772,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Deliberately ignore memcg_should_flush_stats() here so that flushing * in latency-sensitive paths is as cheap as possible. */ - do_flush_stats(); + do_flush_stats(root_mem_cgroup); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } @@ -1643,7 +1647,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) * * Current memory state: */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -4193,7 +4197,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4274,7 +4278,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4770,7 +4774,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -6865,7 +6869,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -8096,7 +8100,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) break; } - cgroup_rstat_flush(memcg->css.cgroup); + /* + * mem_cgroup_flush_stats() ignores small changes. Use + * do_flush_stats() directly to get accurate stats for charging. + */ + do_flush_stats(memcg); pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; @@ -8161,8 +8169,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - cgroup_rstat_flush(css->cgroup); - return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_flush_stats(memcg); + return memcg_page_state(memcg, MEMCG_ZSWAP_B); } static int zswap_max_show(struct seq_file *m, void *v) diff --git a/mm/vmscan.c b/mm/vmscan.c index f0eba9ef3332..b4ca3563bcf4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2226,7 +2226,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) * Flush the memory cgroup stats, so that we read accurate per-memcg * lruvec stats for heuristics. */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. diff --git a/mm/workingset.c b/mm/workingset.c index 6b9871f5a2e8..2a2a34234df9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) rcu_read_unlock(); - /* Flush stats (and potentially sleep) outside the RCU read section */ - mem_cgroup_flush_stats_ratelimited(); + /* + * Flush stats (and potentially sleep) outside the RCU read section. + * XXX: With per-memcg flushing and thresholding, is ratelimiting + * still needed here? + */ + mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, diff --git a/mm/zswap.c b/mm/zswap.c index 015425ed9003..ac31fec176e9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -641,7 +641,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, return 0; #ifdef CONFIG_MEMCG_KMEM - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else From 7dc7c5ef6463111991002f24c0aea08afe86f2cc Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:02 +0000 Subject: [PATCH 1026/1562] mm: allow deferred splitting of arbitrary anon large folios Patch series "Multi-size THP for anonymous memory", v9. A series to implement multi-size THP (mTHP) for anonymous memory (previously called "small-sized THP" and "large anonymous folios"). The objective of this is to improve performance by allocating larger chunks of memory during anonymous page faults: 1) Since SW (the kernel) is dealing with larger chunks of memory than base pages, there are efficiency savings to be had; fewer page faults, batched PTE and RMAP manipulation, reduced lru list, etc. In short, we reduce kernel overhead. This should benefit all architectures. 2) Since we are now mapping physically contiguous chunks of memory, we can take advantage of HW TLB compression techniques. A reduction in TLB pressure speeds up kernel and user space. arm64 systems have 2 mechanisms to coalesce TLB entries; "the contiguous bit" (architectural) and HPA (uarch). This version incorporates David's feedback on the core patches (#3, #4) and adds some RB and TB tags (see change log for details). By default, the existing behaviour (and performance) is maintained. The user must explicitly enable multi-size THP to see the performance benefit. This is done via a new sysfs interface (as recommended by David Hildenbrand - thanks to David for the suggestion)! This interface is inspired by the existing per-hugepage-size sysfs interface used by hugetlb, provides full backwards compatibility with the existing PMD-size THP interface, and provides a base for future extensibility. See [9] for detailed discussion of the interface. This series is based on mm-unstable (715b67adf4c8). Prerequisites ============= I'm removing this section on the basis that I don't believe what we were previously calling prerequisites are really prerequisites anymore. We originally defined them when mTHP was a compile-time feature. There is now a runtime control to opt-in to mTHP; when disabled, correctness and performance are as before. When enabled, the code is still correct/robust, but in the absence of the one remaining item (compaction) there may be a performance impact in some corners. See the old list in the v8 cover letter at [8]. And a longer explanation of my thinking here [10]. SUMMARY: I don't think we should hold this series up, waiting for the items on the prerequisites list. I believe this series should be ready now so hopefully can be added to mm-unstable for some testing, then fingers crossed for v6.8. Testing ======= The series includes patches for mm selftests to enlighten the cow and khugepaged tests to explicitly test with multi-size THP, in the same way that PMD-sized THP is tested. The new tests all pass, and no regressions are observed in the mm selftest suite. I've also run my usual kernel compilation and java script benchmarks without any issues. Refer to my performance numbers posted with v6 [6]. (These are for multi-size THP only - they do not include the arm64 contpte follow-on series). John Hubbard at Nvidia has indicated dramatic 10x performance improvements for some workloads at [11]. (Observed using v6 of this series as well as the arm64 contpte series). Kefeng Wang at Huawei has also indicated he sees improvements at [12] although there are some latency regressions also. I've also checked that there is no regression in the write fault path when mTHP is disabled using a microbenchmark. I ran it for a baseline kernel, as well as v8 and v9. I repeated on Ampere Altra (bare metal) and Apple M2 (VM): | | m2 vm | altra | |--------------|---------------------|---------------------| | kernel | mean | std_rel | mean | std_rel | |--------------|----------|----------|----------|----------| | baseline | 0.000% | 0.341% | 0.000% | 3.581% | | anonfolio-v8 | 0.005% | 0.272% | 5.068% | 1.128% | | anonfolio-v9 | -0.013% | 0.442% | 0.107% | 1.788% | There is no measurable difference on M2, but altra has a slow down in v8 which is fixed in v9 by moving the THP order check to be inline within thp_vma_allowable_orders(), as suggested by David. This patch (of 10): In preparation for the introduction of anonymous multi-size THP, we would like to be able to split them when they have unmapped subpages, in order to free those unused pages under memory pressure. So remove the artificial requirement that the large folio needed to be at least PMD-sized. Link: https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Yu Zhao Reviewed-by: Yin Fengwei Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7a27a2b41802..49e4d86a4f70 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1488,11 +1488,11 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, idx, -nr); /* - * Queue anon THP for deferred split if at least one + * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. */ - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) + if (folio_test_large(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) deferred_split_folio(folio); } From 372cbd4d5a0665bf7e181c72f5e40e1bf59b0b08 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:03 +0000 Subject: [PATCH 1027/1562] mm: non-pmd-mappable, large folios for folio_add_new_anon_rmap() In preparation for supporting anonymous multi-size THP, improve folio_add_new_anon_rmap() to allow a non-pmd-mappable, large folio to be passed to it. In this case, all contained pages are accounted using the order-0 folio (or base page) scheme. Link: https://lkml.kernel.org/r/20231207161211.2374093-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Yu Zhao Reviewed-by: Yin Fengwei Reviewed-by: David Hildenbrand Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/rmap.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 49e4d86a4f70..b086dc957b0c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1305,32 +1305,44 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, * This means the inc-and-test can be bypassed. * The folio does not have to be locked. * - * If the folio is large, it is accounted as a THP. As the folio + * If the folio is pmd-mappable, it is accounted as a THP. As the folio * is new, it's assumed to be mapped exclusively by a single process. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - int nr; + int nr = folio_nr_pages(folio); - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + VM_BUG_ON_VMA(address < vma->vm_start || + address + (nr << PAGE_SHIFT) > vma->vm_end, vma); __folio_set_swapbacked(folio); + __folio_set_anon(folio, vma, address, true); - if (likely(!folio_test_pmd_mappable(folio))) { + if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); - nr = 1; + SetPageAnonExclusive(&folio->page); + } else if (!folio_test_pmd_mappable(folio)) { + int i; + + for (i = 0; i < nr; i++) { + struct page *page = folio_page(folio, i); + + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + SetPageAnonExclusive(page); + } + + atomic_set(&folio->_nr_pages_mapped, nr); } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); - nr = folio_nr_pages(folio); + SetPageAnonExclusive(&folio->page); __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - __folio_set_anon(folio, vma, address, true); - SetPageAnonExclusive(&folio->page); } /** From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:04 +0000 Subject: [PATCH 1028/1562] mm: thp: introduce multi-size THP sysfs interface In preparation for adding support for anonymous multi-size THP, introduce new sysfs structure that will be used to control the new behaviours. A new directory is added under transparent_hugepage for each supported THP size, and contains an `enabled` file, which can be set to "inherit" (to inherit the global setting), "always", "madvise" or "never". For now, the kernel still only supports PMD-sized anonymous THP, so only 1 directory is populated. The first half of the change converts transhuge_vma_suitable() and hugepage_vma_check() so that they take a bitfield of orders for which the user wants to determine support, and the functions filter out all the orders that can't be supported, given the current sysfs configuration and the VMA dimensions. The resulting functions are renamed to thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively. Convenience functions that take a single, unencoded order and return a boolean are also defined as thp_vma_suitable_order() and thp_vma_allowable_order(). The second half of the change implements the new sysfs interface. It has been done so that each supported THP size has a `struct thpsize`, which describes the relevant metadata and is itself a kobject. This is pretty minimal for now, but should make it easy to add new per-thpsize files to the interface if needed in future (e.g. per-size defrag). Rather than keep the `enabled` state directly in the struct thpsize, I've elected to directly encode it into huge_anon_orders_[always|madvise|inherit] bitfields since this reduces the amount of work required in thp_vma_allowable_orders() which is called for every page fault. See Documentation/admin-guide/mm/transhuge.rst, as modified by this commit, for details of how the new sysfs interface works. [ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled] Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 97 +++++++-- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 3 +- include/linux/huge_mm.h | 181 +++++++++++++--- mm/huge_memory.c | 229 ++++++++++++++++++--- mm/khugepaged.c | 20 +- mm/memory.c | 6 +- mm/page_vma_mapped.c | 3 +- 8 files changed, 458 insertions(+), 87 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index b0cc8243e093..04eb45a2f940 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -45,10 +45,25 @@ components: the two is using hugepages just because of the fact the TLB miss is going to run faster. +Modern kernels support "multi-size THP" (mTHP), which introduces the +ability to allocate memory in blocks that are bigger than a base page +but smaller than traditional PMD-size (as described above), in +increments of a power-of-2 number of pages. mTHP can back anonymous +memory (for example 16K, 32K, 64K, etc). These THPs continue to be +PTE-mapped, but in many cases can still provide similar benefits to +those outlined above: Page faults are significantly reduced (by a +factor of e.g. 4, 8, 16, etc), but latency spikes are much less +prominent because the size of each page isn't as huge as the PMD-sized +variant and there is less memory to clear in each page fault. Some +architectures also employ TLB compression mechanisms to squeeze more +entries in when a set of PTEs are virtually and physically contiguous +and approporiately aligned. In this case, TLB misses will occur less +often. + THP can be enabled system wide or restricted to certain tasks or even memory ranges inside task's address space. Unless THP is completely disabled, there is ``khugepaged`` daemon that scans memory and -collapses sequences of basic pages into huge pages. +collapses sequences of basic pages into PMD-sized huge pages. The THP behaviour is controlled via :ref:`sysfs ` interface and using madvise(2) and prctl(2) system calls. @@ -95,12 +110,40 @@ Global THP controls Transparent Hugepage Support for anonymous memory can be entirely disabled (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: +system wide. This can be achieved per-supported-THP-size with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + +where is the hugepage size being addressed, the available sizes +for which vary by system. + +For example:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +Alternatively it is possible to specify that a given hugepage size +will inherit the top-level "enabled" value:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-kB/enabled + +For example:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +The top-level setting (for use with "inherit") can be set by issuing +one of the following commands:: echo always >/sys/kernel/mm/transparent_hugepage/enabled echo madvise >/sys/kernel/mm/transparent_hugepage/enabled echo never >/sys/kernel/mm/transparent_hugepage/enabled +By default, PMD-sized hugepages have enabled="inherit" and all other +hugepage sizes have enabled="never". If enabling multiple hugepage +sizes, the kernel will select the most appropriate enabled size for a +given allocation. + It's also possible to limit defrag efforts in the VM to generate anonymous hugepages in case they're not immediately free to madvise regions or to never try to defrag memory and simply fallback to regular @@ -146,25 +189,34 @@ madvise never should be self-explanatory. -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: +By default kernel tries to use huge, PMD-mappable zero page on read +page fault to anonymous mapping. It's possible to disable huge zero +page by writing 0 or enable it back by writing 1:: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: +Some userspace (such as a test program, or an optimized memory +allocation library) may want to know the size (in bytes) of a +PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". +khugepaged will be automatically started when one or more hugepage +sizes are enabled (either by directly setting "always" or "madvise", +or by setting "inherit" while the top-level enabled is set to "always" +or "madvise"), and it'll be automatically shutdown when the last +hugepage size is disabled (either by directly setting "never", or by +setting "inherit" while the top-level enabled is set to "never"). Khugepaged controls ------------------- +.. note:: + khugepaged currently only searches for opportunities to collapse to + PMD-sized THP and no attempt is made to collapse to other THP + sizes. + khugepaged runs usually at low frequency so while one may not want to invoke defrag algorithms synchronously during the page faults, it should be worth invoking defrag at least in khugepaged. However it's @@ -282,19 +334,26 @@ force Need of application restart =========================== -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. +The transparent_hugepage/enabled and +transparent_hugepage/hugepages-kB/enabled values and tmpfs mount +option only affect future behavior. So to make them effective you need +to restart any application that could have been using hugepages. This +also applies to the regions registered in khugepaged. Monitoring usage ================ -The number of anonymous transparent huge pages currently used by the +.. note:: + Currently the below counters only record events relating to + PMD-sized THP. Events relating to other THP sizes are not included. + +The number of PMD-sized anonymous transparent huge pages currently used by the system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. +To identify what applications are using PMD-sized anonymous transparent huge +pages, it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages +fields for each mapping. (Note that AnonHugePages only applies to traditional +PMD-sized THP for historical reasons and should have been called +AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. @@ -413,7 +472,7 @@ for huge pages. Optimizing the applications =========================== -To be guaranteed that the kernel will map a 2M page immediately in any +To be guaranteed that the kernel will map a THP immediately in any memory region, the mmap region has to be hugepage naturally aligned. posix_memalign() can provide that guarantee. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 49ef12df631b..104c6d047d9b 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -528,9 +528,9 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. -"THPeligible" indicates whether the mapping is eligible for allocating THP -pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise. -It just shows the current status. +"THPeligible" indicates whether the mapping is eligible for allocating +naturally aligned THP pages of any currently enabled size. 1 if true, 0 +otherwise. "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d19924bf0a39..79855e1c5b57 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -865,7 +865,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, + true, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa0350b0812a..609c153bae57 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) + hpage_size >> PAGE_SHIFT)) return false; } - haddr = addr & HPAGE_PMD_MASK; + haddr = ALIGN_DOWN(addr, hpage_size); - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } +/* + * Filter the bitfield of input orders to the ones suitable for use in the vma. + * See thp_vma_suitable_order(). + * All orders that pass the checks are returned as a bitfield. + */ +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) +{ + int order; + + /* + * Iterate over orders, highest to lowest, removing orders that don't + * meet alignment requirements from the set. Exit loop at first order + * that meets requirements, since all lower orders must also meet + * requirements. + */ + + order = highest_order(orders); + + while (orders) { + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + return orders; +} + static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs); +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders); + +/** + * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma + * @vma: the vm area to check + * @vm_flags: use these vm_flags instead of vma->vm_flags + * @smaps: whether answer will be used for smaps file + * @in_pf: whether answer will be used by page fault handler + * @enforce_sysfs: whether sysfs config should be taken into account + * @orders: bitfield of all orders to consider + * + * Calculates the intersection of the requested hugepage orders and the allowed + * hugepage orders for the provided vma. Permitted orders are encoded as a set + * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 + * corresponds to order-3, etc). Order-0 is never considered a hugepage order. + * + * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage + * orders are allowed. + */ +static inline +unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Optimization to check if required orders are enabled early. */ + if (enforce_sysfs && vma_is_anonymous(vma)) { + unsigned long mask = READ_ONCE(huge_anon_orders_always); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_anon_orders_madvise); + if (hugepage_global_always() || + ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) + mask |= READ_ONCE(huge_anon_orders_inherit); + + orders &= mask; + if (!orders) + return 0; + } + + return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, + enforce_sysfs, orders); +} #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { return false; } -static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs) +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) { - return false; + return 0; +} + +static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + return 0; } static inline void folio_prep_large_rmappable(struct folio *folio) {} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c848ea97ab02..387b030c7f15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -74,12 +74,23 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; +unsigned long huge_anon_orders_always __read_mostly; +unsigned long huge_anon_orders_madvise __read_mostly; +unsigned long huge_anon_orders_inherit __read_mostly; -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs) +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) { + /* Check the intersection of requested and supported orders. */ + orders &= vma_is_anonymous(vma) ? + THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (!orders) + return 0; + if (!vma->vm_mm) /* vdso */ - return false; + return 0; /* * Explicitly disabled through madvise or prctl, or some @@ -88,16 +99,16 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; + return 0; /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) - return false; + return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) - return in_pf; + return in_pf ? orders : 0; /* * khugepaged special VMA and hugetlb VMA. @@ -105,17 +116,29 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * VM_MIXEDMAP set. */ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) - return false; + return 0; /* - * Check alignment for file vma and size for both file and anon vma. + * Check alignment for file vma and size for both file and anon vma by + * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault - * handlers. And this check is not suitable for huge PUD fault. + * handlers. */ - if (!in_pf && - !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; + if (!in_pf) { + int order = highest_order(orders); + unsigned long addr; + + while (orders) { + addr = vma->vm_end - (PAGE_SIZE << order); + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + if (!orders) + return 0; + } /* * Enabled via shmem mount options or sysfs settings. @@ -124,29 +147,33 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, */ if (!in_pf && shmem_file(vma->vm_file)) return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); - - /* Enforce sysfs THP requirements as necessary */ - if (enforce_sysfs && - (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && - !hugepage_flags_always()))) - return false; + !enforce_sysfs, vma->vm_mm, vm_flags) + ? orders : 0; if (!vma_is_anonymous(vma)) { + /* + * Enforce sysfs THP requirements as necessary. Anonymous vmas + * were already handled in thp_vma_allowable_orders(). + */ + if (enforce_sysfs && + (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_global_always()))) + return 0; + /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. */ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) - return true; + return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) - return true; - return false; + return orders; + return 0; } if (vma_is_temporary_stack(vma)) - return false; + return 0; /* * THPeligible bit of smaps should show 1 for proper VMAs even @@ -156,9 +183,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * the first page fault. */ if (!vma->anon_vma) - return (smaps || in_pf); + return (smaps || in_pf) ? orders : 0; - return true; + return orders; } static bool get_huge_zero_page(void) @@ -412,9 +439,136 @@ static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; +static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); +static void thpsize_release(struct kobject *kobj); +static DEFINE_SPINLOCK(huge_anon_orders_lock); +static LIST_HEAD(thpsize_list); + +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + +static ssize_t thpsize_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_anon_orders_always)) + output = "[always] inherit madvise never"; + else if (test_bit(order, &huge_anon_orders_inherit)) + output = "always [inherit] madvise never"; + else if (test_bit(order, &huge_anon_orders_madvise)) + output = "always inherit [madvise] never"; + else + output = "always inherit madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_always); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_inherit); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + set_bit(order, &huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else + ret = -EINVAL; + + return ret; +} + +static struct kobj_attribute thpsize_enabled_attr = + __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); + +static struct attribute *thpsize_attrs[] = { + &thpsize_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group thpsize_attr_group = { + .attrs = thpsize_attrs, +}; + +static const struct kobj_type thpsize_ktype = { + .release = &thpsize_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static struct thpsize *thpsize_create(int order, struct kobject *parent) +{ + unsigned long size = (PAGE_SIZE << order) / SZ_1K; + struct thpsize *thpsize; + int ret; + + thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); + if (!thpsize) + return ERR_PTR(-ENOMEM); + + ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, + "hugepages-%lukB", size); + if (ret) { + kfree(thpsize); + return ERR_PTR(ret); + } + + ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + + thpsize->order = order; + return thpsize; +} + +static void thpsize_release(struct kobject *kobj) +{ + kfree(to_thpsize(kobj)); +} + static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; + struct thpsize *thpsize; + unsigned long orders; + int order; + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time + * constant so we have to do this here. + */ + huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -434,8 +588,24 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } + orders = THP_ORDERS_ALL_ANON; + order = highest_order(orders); + while (orders) { + thpsize = thpsize_create(order, *hugepage_kobj); + if (IS_ERR(thpsize)) { + pr_err("failed to create thpsize for order %d\n", order); + err = PTR_ERR(thpsize); + goto remove_all; + } + list_add(&thpsize->node, &thpsize_list); + order = next_order(&orders, order); + } + return 0; +remove_all: + hugepage_exit_sysfs(*hugepage_kobj); + return err; remove_hp_group: sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); delete_obj: @@ -445,6 +615,13 @@ delete_obj: static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) { + struct thpsize *thpsize, *tmp; + + list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { + list_del(&thpsize->node); + kobject_put(&thpsize->kobj); + } + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); kobject_put(hugepage_kobj); @@ -811,7 +988,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 064654717843..d72aecd3624a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -446,7 +446,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false, true)) + if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -922,16 +923,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - if (!transhuge_vma_suitable(vma, address)) + if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, - cc->is_khugepaged)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + cc->is_khugepaged, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * - * hugepage_vma_check may return true for qualified file + * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) @@ -1503,7 +1504,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2368,7 +2370,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + true, PMD_ORDER)) { skip: progress++; continue; @@ -2705,7 +2708,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 99582b188ed2..8ab2d994d997 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4322,7 +4322,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; page = compound_head(page); @@ -5116,7 +5116,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5150,7 +5150,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e0b368e545ed..74d2de15fb5e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -268,7 +268,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - transhuge_vma_suitable(vma, pvmw->address) && + thp_vma_suitable_order(vma, pvmw->address, + PMD_ORDER) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:05 +0000 Subject: [PATCH 1029/1562] mm: thp: support allocation of anonymous multi-size THP Introduce the logic to allow THP to be configured (through the new sysfs interface we just added) to allocate large folios to back anonymous memory, which are larger than the base page size but smaller than PMD-size. We call this new THP extension "multi-size THP" (mTHP). mTHP continues to be PTE-mapped, but in many cases can still provide similar benefits to traditional PMD-sized THP: Page faults are significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on the configured order), but latency spikes are much less prominent because the size of each page isn't as huge as the PMD-sized variant and there is less memory to clear in each page fault. The number of per-page operations (e.g. ref counting, rmap management, lru list management) are also significantly reduced since those ops now become per-folio. Some architectures also employ TLB compression mechanisms to squeeze more entries in when a set of PTEs are virtually and physically contiguous and approporiately aligned. In this case, TLB misses will occur less often. The new behaviour is disabled by default, but can be enabled at runtime by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see documentation in previous commit). The long term aim is to change the default to include suitable lower orders, but there are some risks around internal fragmentation that need to be better understood first. [ryan.roberts@arm.com: resolve some multi-size THP review nits] Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++- mm/memory.c | 109 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 11 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 609c153bae57..fa7a38a30fc6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * for this vma. Then filter out the orders that can't be allocated over + * the faulting address and still be fully contained in the vma. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, + BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); + if (!pte) + return ERR_PTR(-EAGAIN); + + /* + * Find the highest order where the aligned range is completely + * pte_none(). Note that all remaining orders will be completely + * pte_none(). + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (pte_range_none(pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap(pte); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + clear_huge_page(&folio->page, vmf->address, 1 << order); + return folio; + } + order = next_order(&orders, order); + } + +fallback: +#endif + return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4134,9 +4212,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; + int nr_pages = 1; pte_t entry; + int i; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) @@ -4176,10 +4257,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ + folio = alloc_anon_folio(vmf); + if (IS_ERR(folio)) + return 0; if (!folio) goto oom; + nr_pages = folio_nr_pages(folio); + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; folio_throttle_swaprate(folio, GFP_KERNEL); @@ -4196,12 +4283,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; - if (vmf_pte_changed(vmf)) { - update_mmu_tlb(vma, vmf->address, vmf->pte); + if (nr_pages == 1 && vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, addr, vmf->pte); + goto release; + } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { + for (i = 0; i < nr_pages; i++) + update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); goto release; } @@ -4216,16 +4306,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_ref_add(folio, nr_pages - 1); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: if (uffd_wp) entry = pte_mkuffd_wp(entry); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); From b6aab3384cafba151c53d3b5f7e1f8d073aadf03 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:06 +0000 Subject: [PATCH 1030/1562] selftests/mm/kugepaged: restore thp settings at exit Previously, the saved thp settings would be restored upon a signal or at the natural end of the test suite. But there are some tests that directly call exit() upon failure. In this case, the thp settings were not being restored, which could then influence other tests. Fix this by installing an atexit() handler to do the actual restore. The signal handler can now just call exit() and the atexit handler is invoked. Link: https://lkml.kernel.org/r/20231207161211.2374093-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 030667cb5533..fc47a1c4944c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -374,18 +374,22 @@ static void pop_settings(void) write_settings(current_settings()); } -static void restore_settings(int sig) +static void restore_settings_atexit(void) { if (skip_settings_restore) - goto out; + return; printf("Restore THP and khugepaged settings..."); write_settings(&saved_settings); success("OK"); - if (sig) - exit(EXIT_FAILURE); -out: - exit(exit_status); + + skip_settings_restore = true; +} + +static void restore_settings(int sig) +{ + /* exit() will invoke the restore_settings_atexit handler. */ + exit(sig ? EXIT_FAILURE : exit_status); } static void save_settings(void) @@ -415,6 +419,7 @@ static void save_settings(void) success("OK"); + atexit(restore_settings_atexit); signal(SIGTERM, restore_settings); signal(SIGINT, restore_settings); signal(SIGHUP, restore_settings); From 00679a183ac6d2584723cfc2a2c07c8285f802dc Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:07 +0000 Subject: [PATCH 1031/1562] selftests/mm: factor out thp settings management The khugepaged test has a useful framework for save/restore/pop/push of all thp settings via the sysfs interface. This will be useful to explicitly control multi-size THP settings in other tests, so let's move it out of khugepaged and into its own thp_settings.[c|h] utility. Link: https://lkml.kernel.org/r/20231207161211.2374093-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 +- tools/testing/selftests/mm/khugepaged.c | 346 ++-------------------- tools/testing/selftests/mm/thp_settings.c | 296 ++++++++++++++++++ tools/testing/selftests/mm/thp_settings.h | 71 +++++ 4 files changed, 391 insertions(+), 326 deletions(-) create mode 100644 tools/testing/selftests/mm/thp_settings.c create mode 100644 tools/testing/selftests/mm/thp_settings.h diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index dede0bcf97a3..2453add65d12 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -117,8 +117,8 @@ TEST_FILES += va_high_addr_switch.sh include ../lib.mk -$(TEST_GEN_PROGS): vm_util.c -$(TEST_GEN_FILES): vm_util.c +$(TEST_GEN_PROGS): vm_util.c thp_settings.c +$(TEST_GEN_FILES): vm_util.c thp_settings.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index fc47a1c4944c..b15e7fd70176 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -22,13 +22,13 @@ #include "linux/magic.h" #include "vm_util.h" +#include "thp_settings.h" #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; static unsigned long page_size; static int hpage_pmd_nr; -#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" #define TEST_FILE "collapse_test_file" @@ -71,78 +71,7 @@ struct file_info { }; static struct file_info finfo; - -enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, - THP_NEVER, -}; - -static const char *thp_enabled_strings[] = { - "always", - "madvise", - "never", - NULL -}; - -enum thp_defrag { - THP_DEFRAG_ALWAYS, - THP_DEFRAG_DEFER, - THP_DEFRAG_DEFER_MADVISE, - THP_DEFRAG_MADVISE, - THP_DEFRAG_NEVER, -}; - -static const char *thp_defrag_strings[] = { - "always", - "defer", - "defer+madvise", - "madvise", - "never", - NULL -}; - -enum shmem_enabled { - SHMEM_ALWAYS, - SHMEM_WITHIN_SIZE, - SHMEM_ADVISE, - SHMEM_NEVER, - SHMEM_DENY, - SHMEM_FORCE, -}; - -static const char *shmem_enabled_strings[] = { - "always", - "within_size", - "advise", - "never", - "deny", - "force", - NULL -}; - -struct khugepaged_settings { - bool defrag; - unsigned int alloc_sleep_millisecs; - unsigned int scan_sleep_millisecs; - unsigned int max_ptes_none; - unsigned int max_ptes_swap; - unsigned int max_ptes_shared; - unsigned long pages_to_scan; -}; - -struct settings { - enum thp_enabled thp_enabled; - enum thp_defrag thp_defrag; - enum shmem_enabled shmem_enabled; - bool use_zero_page; - struct khugepaged_settings khugepaged; - unsigned long read_ahead_kb; -}; - -static struct settings saved_settings; static bool skip_settings_restore; - static int exit_status; static void success(const char *msg) @@ -161,226 +90,13 @@ static void skip(const char *msg) printf(" \e[33m%s\e[0m\n", msg); } -static int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) { - printf("open(%s)\n", path); - exit(EXIT_FAILURE); - return 0; - } - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) { - printf("write(%s)\n", buf); - exit(EXIT_FAILURE); - return 0; - } - - return (unsigned int) numwritten; -} - -static int read_string(const char *name, const char *strings[]) -{ - char path[PATH_MAX]; - char buf[256]; - char *c; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!read_file(path, buf, sizeof(buf))) { - perror(path); - exit(EXIT_FAILURE); - } - - c = strchr(buf, '['); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - - c++; - memmove(buf, c, sizeof(buf) - (c - buf)); - - c = strchr(buf, ']'); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - *c = '\0'; - - ret = 0; - while (strings[ret]) { - if (!strcmp(strings[ret], buf)) - return ret; - ret++; - } - - printf("Failed to parse %s\n", name); - exit(EXIT_FAILURE); -} - -static void write_string(const char *name, const char *val) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(path, val, strlen(val) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static const unsigned long _read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -static const unsigned long read_num(const char *name) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return _read_num(path); -} - -static void _write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - if (!write_file(path, buf, strlen(buf) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static void write_num(const char *name, unsigned long num) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - _write_num(path, num); -} - -static void write_settings(struct settings *settings) -{ - struct khugepaged_settings *khugepaged = &settings->khugepaged; - - write_string("enabled", thp_enabled_strings[settings->thp_enabled]); - write_string("defrag", thp_defrag_strings[settings->thp_defrag]); - write_string("shmem_enabled", - shmem_enabled_strings[settings->shmem_enabled]); - write_num("use_zero_page", settings->use_zero_page); - - write_num("khugepaged/defrag", khugepaged->defrag); - write_num("khugepaged/alloc_sleep_millisecs", - khugepaged->alloc_sleep_millisecs); - write_num("khugepaged/scan_sleep_millisecs", - khugepaged->scan_sleep_millisecs); - write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); - write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); - write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); - write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); - - if (file_ops && finfo.type == VMA_FILE) - _write_num(finfo.dev_queue_read_ahead_path, - settings->read_ahead_kb); -} - -#define MAX_SETTINGS_DEPTH 4 -static struct settings settings_stack[MAX_SETTINGS_DEPTH]; -static int settings_index; - -static struct settings *current_settings(void) -{ - if (!settings_index) { - printf("Fail: No settings set"); - exit(EXIT_FAILURE); - } - return settings_stack + settings_index - 1; -} - -static void push_settings(struct settings *settings) -{ - if (settings_index >= MAX_SETTINGS_DEPTH) { - printf("Fail: Settings stack exceeded"); - exit(EXIT_FAILURE); - } - settings_stack[settings_index++] = *settings; - write_settings(current_settings()); -} - -static void pop_settings(void) -{ - if (settings_index <= 0) { - printf("Fail: Settings stack empty"); - exit(EXIT_FAILURE); - } - --settings_index; - write_settings(current_settings()); -} - static void restore_settings_atexit(void) { if (skip_settings_restore) return; printf("Restore THP and khugepaged settings..."); - write_settings(&saved_settings); + thp_restore_settings(); success("OK"); skip_settings_restore = true; @@ -395,27 +111,9 @@ static void restore_settings(int sig) static void save_settings(void) { printf("Save THP and khugepaged settings..."); - saved_settings = (struct settings) { - .thp_enabled = read_string("enabled", thp_enabled_strings), - .thp_defrag = read_string("defrag", thp_defrag_strings), - .shmem_enabled = - read_string("shmem_enabled", shmem_enabled_strings), - .use_zero_page = read_num("use_zero_page"), - }; - saved_settings.khugepaged = (struct khugepaged_settings) { - .defrag = read_num("khugepaged/defrag"), - .alloc_sleep_millisecs = - read_num("khugepaged/alloc_sleep_millisecs"), - .scan_sleep_millisecs = - read_num("khugepaged/scan_sleep_millisecs"), - .max_ptes_none = read_num("khugepaged/max_ptes_none"), - .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), - .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), - .pages_to_scan = read_num("khugepaged/pages_to_scan"), - }; if (file_ops && finfo.type == VMA_FILE) - saved_settings.read_ahead_kb = - _read_num(finfo.dev_queue_read_ahead_path); + thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); + thp_save_settings(); success("OK"); @@ -798,7 +496,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { int ret; - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); printf("%s...", msg); @@ -808,7 +506,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, */ settings.thp_enabled = THP_NEVER; settings.shmem_enabled = SHMEM_NEVER; - push_settings(&settings); + thp_push_settings(&settings); /* Clear VM_NOHUGEPAGE */ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); @@ -820,7 +518,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, else success("OK"); - pop_settings(); + thp_pop_settings(); } static void madvise_collapse(const char *msg, char *p, int nr_hpages, @@ -850,13 +548,13 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages, madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ - full_scans = read_num("khugepaged/full_scans") + 2; + full_scans = thp_read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { if (ops->check_huge(p, nr_hpages)) break; - if (read_num("khugepaged/full_scans") >= full_scans) + if (thp_read_num("khugepaged/full_scans") >= full_scans) break; printf("."); usleep(TICK); @@ -911,11 +609,11 @@ static bool is_tmpfs(struct mem_ops *ops) static void alloc_at_fault(void) { - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - push_settings(&settings); + thp_push_settings(&settings); p = alloc_mapping(1); *p = 1; @@ -925,7 +623,7 @@ static void alloc_at_fault(void) else fail("Fail"); - pop_settings(); + thp_pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); @@ -973,11 +671,11 @@ static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - push_settings(&settings); + thp_push_settings(&settings); p = ops->setup_area(1); @@ -1002,7 +700,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o } skip: ops->cleanup_area(p, hpage_pmd_size); - pop_settings(); + thp_pop_settings(); } static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) @@ -1033,7 +731,7 @@ out: static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { - int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"); void *p; p = ops->setup_area(1); @@ -1250,11 +948,11 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o fail("Fail"); ops->fault(p, 0, page_size); - write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", p, 1, ops, true); - write_num("khugepaged/max_ptes_shared", - current_settings()->khugepaged.max_ptes_shared); + thp_write_num("khugepaged/max_ptes_shared", + thp_current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); @@ -1275,7 +973,7 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { - int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; @@ -1443,7 +1141,7 @@ static void parse_test_type(int argc, const char **argv) int main(int argc, const char **argv) { - struct settings default_settings = { + struct thp_settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, .shmem_enabled = SHMEM_ADVISE, @@ -1484,7 +1182,7 @@ int main(int argc, const char **argv) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - push_settings(&default_settings); + thp_push_settings(&default_settings); alloc_at_fault(); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c new file mode 100644 index 000000000000..5e8ec792cac7 --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include "thp_settings.h" + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define MAX_SETTINGS_DEPTH 4 +static struct thp_settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; +static struct thp_settings saved_settings; +static char dev_queue_read_ahead_path[PATH_MAX]; + +static const char * const thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +static const char * const thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +static const char * const shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +const unsigned long read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file()"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +void write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +int thp_read_string(const char *name, const char * const strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +void thp_write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +const unsigned long thp_read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return read_num(path); +} + +void thp_write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + write_num(path, num); +} + +void thp_read_settings(struct thp_settings *settings) +{ + *settings = (struct thp_settings) { + .thp_enabled = thp_read_string("enabled", thp_enabled_strings), + .thp_defrag = thp_read_string("defrag", thp_defrag_strings), + .shmem_enabled = + thp_read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = thp_read_num("use_zero_page"), + }; + settings->khugepaged = (struct khugepaged_settings) { + .defrag = thp_read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + thp_read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + thp_read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = thp_read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = thp_read_num("khugepaged/pages_to_scan"), + }; + if (dev_queue_read_ahead_path[0]) + settings->read_ahead_kb = read_num(dev_queue_read_ahead_path); +} + +void thp_write_settings(struct thp_settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + thp_write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + thp_write_num("use_zero_page", settings->use_zero_page); + + thp_write_num("khugepaged/defrag", khugepaged->defrag); + thp_write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + thp_write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + thp_write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + thp_write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + thp_write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + thp_write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (dev_queue_read_ahead_path[0]) + write_num(dev_queue_read_ahead_path, settings->read_ahead_kb); +} + +struct thp_settings *thp_current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +void thp_push_settings(struct thp_settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + thp_write_settings(thp_current_settings()); +} + +void thp_pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + thp_write_settings(thp_current_settings()); +} + +void thp_restore_settings(void) +{ + thp_write_settings(&saved_settings); +} + +void thp_save_settings(void) +{ + thp_read_settings(&saved_settings); +} + +void thp_set_read_ahead_path(char *path) +{ + if (!path) { + dev_queue_read_ahead_path[0] = '\0'; + return; + } + + strncpy(dev_queue_read_ahead_path, path, + sizeof(dev_queue_read_ahead_path)); + dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h new file mode 100644 index 000000000000..ff3d98c30617 --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __THP_SETTINGS_H__ +#define __THP_SETTINGS_H__ + +#include +#include +#include + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct thp_settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool use_zero_page; + struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; +}; + +int read_file(const char *path, char *buf, size_t buflen); +int write_file(const char *path, const char *buf, size_t buflen); +const unsigned long read_num(const char *path); +void write_num(const char *path, unsigned long num); + +int thp_read_string(const char *name, const char * const strings[]); +void thp_write_string(const char *name, const char *val); +const unsigned long thp_read_num(const char *name); +void thp_write_num(const char *name, unsigned long num); + +void thp_write_settings(struct thp_settings *settings); +void thp_read_settings(struct thp_settings *settings); +struct thp_settings *thp_current_settings(void); +void thp_push_settings(struct thp_settings *settings); +void thp_pop_settings(void); +void thp_restore_settings(void); +void thp_save_settings(void); + +void thp_set_read_ahead_path(char *path); + +#endif /* __THP_SETTINGS_H__ */ From 4f5070a5e40db2e9dbf5fff4ec678d6fbb338d5c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:08 +0000 Subject: [PATCH 1032/1562] selftests/mm: support multi-size THP interface in thp_settings Save and restore the new per-size hugepage enabled setting, if available on the running kernel. Since the number of per-size directories is not fixed, solve this as simply as possible by catering for a maximum number in the thp_settings struct (20). Each array index is the order. The value of THP_NEVER is changed to 0 so that all of these new settings default to THP_NEVER and the user only needs to fill in the ones they want to enable. Link: https://lkml.kernel.org/r/20231207161211.2374093-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 3 ++ tools/testing/selftests/mm/thp_settings.c | 57 ++++++++++++++++++++++- tools/testing/selftests/mm/thp_settings.h | 13 +++++- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index b15e7fd70176..7bd3baa9d34b 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1141,6 +1141,7 @@ static void parse_test_type(int argc, const char **argv) int main(int argc, const char **argv) { + int hpage_pmd_order; struct thp_settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, @@ -1175,11 +1176,13 @@ int main(int argc, const char **argv) exit(EXIT_FAILURE); } hpage_pmd_nr = hpage_pmd_size / page_size; + hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index 5e8ec792cac7..a4163438108e 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -16,9 +16,10 @@ static struct thp_settings saved_settings; static char dev_queue_read_ahead_path[PATH_MAX]; static const char * const thp_enabled_strings[] = { - "always", - "madvise", "never", + "always", + "inherit", + "madvise", NULL }; @@ -198,6 +199,10 @@ void thp_write_num(const char *name, unsigned long num) void thp_read_settings(struct thp_settings *settings) { + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int i; + *settings = (struct thp_settings) { .thp_enabled = thp_read_string("enabled", thp_enabled_strings), .thp_defrag = thp_read_string("defrag", thp_defrag_strings), @@ -218,11 +223,26 @@ void thp_read_settings(struct thp_settings *settings) }; if (dev_queue_read_ahead_path[0]) settings->read_ahead_kb = read_num(dev_queue_read_ahead_path); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) { + settings->hugepages[i].enabled = THP_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + settings->hugepages[i].enabled = + thp_read_string(path, thp_enabled_strings); + } } void thp_write_settings(struct thp_settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int enabled; + int i; thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]); thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]); @@ -242,6 +262,15 @@ void thp_write_settings(struct thp_settings *settings) if (dev_queue_read_ahead_path[0]) write_num(dev_queue_read_ahead_path, settings->read_ahead_kb); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + enabled = settings->hugepages[i].enabled; + thp_write_string(path, thp_enabled_strings[enabled]); + } } struct thp_settings *thp_current_settings(void) @@ -294,3 +323,27 @@ void thp_set_read_ahead_path(char *path) sizeof(dev_queue_read_ahead_path)); dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; } + +unsigned long thp_supported_orders(void) +{ + unsigned long orders = 0; + char path[PATH_MAX]; + char buf[256]; + int ret; + int i; + + for (i = 0; i < NR_ORDERS; i++) { + ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (ret) + orders |= 1UL << i; + } + + return orders; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index ff3d98c30617..71cbff05f4c7 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -7,9 +7,10 @@ #include enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, THP_NEVER, + THP_ALWAYS, + THP_INHERIT, + THP_MADVISE, }; enum thp_defrag { @@ -29,6 +30,12 @@ enum shmem_enabled { SHMEM_FORCE, }; +#define NR_ORDERS 20 + +struct hugepages_settings { + enum thp_enabled enabled; +}; + struct khugepaged_settings { bool defrag; unsigned int alloc_sleep_millisecs; @@ -46,6 +53,7 @@ struct thp_settings { bool use_zero_page; struct khugepaged_settings khugepaged; unsigned long read_ahead_kb; + struct hugepages_settings hugepages[NR_ORDERS]; }; int read_file(const char *path, char *buf, size_t buflen); @@ -67,5 +75,6 @@ void thp_restore_settings(void); void thp_save_settings(void); void thp_set_read_ahead_path(char *path); +unsigned long thp_supported_orders(void); #endif /* __THP_SETTINGS_H__ */ From 9f0704eae8a4edc8dca9c8a297f798d505a4103a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:09 +0000 Subject: [PATCH 1033/1562] selftests/mm/khugepaged: enlighten for multi-size THP The `collapse_max_ptes_none` test was previously failing when a THP size less than PMD-size had enabled="always". The root cause is because the test faults in 1 page less than the threshold it set for collapsing. But when THP is enabled always, we "over allocate" and therefore the threshold is passed, and collapse unexpectedly succeeds. Solve this by enlightening khugepaged selftest. Add a command line option to pass in the desired THP size that should be used for all anonymous allocations. The harness will then explicitly configure a THP size as requested and modify the `collapse_max_ptes_none` test so that it faults in the threshold minus the number of pages in the configured THP size. If no command line option is provided, default to order 0, as per previous behaviour. I chose to use an order in the command line interface, since this makes the interface agnostic of base page size, making it easier to invoke from run_vmtests.sh. Link: https://lkml.kernel.org/r/20231207161211.2374093-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 48 +++++++++++++++++------ tools/testing/selftests/mm/run_vmtests.sh | 2 + 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 7bd3baa9d34b..829320a519e7 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -28,6 +28,7 @@ static unsigned long hpage_pmd_size; static unsigned long page_size; static int hpage_pmd_nr; +static int anon_order; #define PID_SMAPS "/proc/self/smaps" #define TEST_FILE "collapse_test_file" @@ -607,6 +608,11 @@ static bool is_tmpfs(struct mem_ops *ops) return ops == &__file_ops && finfo.type == VMA_SHMEM; } +static bool is_anon(struct mem_ops *ops) +{ + return ops == &__anon_ops; +} + static void alloc_at_fault(void) { struct thp_settings settings = *thp_current_settings(); @@ -673,6 +679,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o int max_ptes_none = hpage_pmd_nr / 2; struct thp_settings settings = *thp_current_settings(); void *p; + int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1; settings.khugepaged.max_ptes_none = max_ptes_none; thp_push_settings(&settings); @@ -686,10 +693,10 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o goto skip; } - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, ops, !c->enforce_pte_scan_limits); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); if (c->enforce_pte_scan_limits) { ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); @@ -1076,7 +1083,7 @@ static void madvise_retracted_page_tables(struct collapse_context *c, static void usage(void) { - fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] [dir]\n\n"); fprintf(stderr, "\t\t: :\n"); fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); @@ -1085,15 +1092,34 @@ static void usage(void) fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + fprintf(stderr, "\n\tSupported Options:\n"); + fprintf(stderr, "\t\t-h: This help message.\n"); + fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); + fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); exit(1); } -static void parse_test_type(int argc, const char **argv) +static void parse_test_type(int argc, char **argv) { + int opt; char *buf; const char *token; - if (argc == 1) { + while ((opt = getopt(argc, argv, "s:h")) != -1) { + switch (opt) { + case 's': + anon_order = atoi(optarg); + break; + case 'h': + default: + usage(); + } + } + + argv += optind; + argc -= optind; + + if (argc == 0) { /* Backwards compatibility */ khugepaged_context = &__khugepaged_context; madvise_context = &__madvise_context; @@ -1101,7 +1127,7 @@ static void parse_test_type(int argc, const char **argv) return; } - buf = strdup(argv[1]); + buf = strdup(argv[0]); token = strsep(&buf, ":"); if (!strcmp(token, "all")) { @@ -1135,11 +1161,13 @@ static void parse_test_type(int argc, const char **argv) if (!file_ops) return; - if (argc != 3) + if (argc != 2) usage(); + + get_finfo(argv[1]); } -int main(int argc, const char **argv) +int main(int argc, char **argv) { int hpage_pmd_order; struct thp_settings default_settings = { @@ -1164,9 +1192,6 @@ int main(int argc, const char **argv) parse_test_type(argc, argv); - if (file_ops) - get_finfo(argv[2]); - setbuf(stdout, NULL); page_size = getpagesize(); @@ -1183,6 +1208,7 @@ int main(int argc, const char **argv) default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; + default_settings.hugepages[anon_order].enabled = THP_ALWAYS; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index c0212258b852..87f513f5cf91 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -357,6 +357,8 @@ CATEGORY="cow" run_test ./cow CATEGORY="thp" run_test ./khugepaged +CATEGORY="thp" run_test ./khugepaged -s 2 + CATEGORY="thp" run_test ./transhuge-stress -d 20 CATEGORY="thp" run_test ./split_huge_page_test From 12dc16b38463a671bc91dc2df10f3a014a27ff3b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:10 +0000 Subject: [PATCH 1034/1562] selftests/mm/cow: generalize do_run_with_thp() helper do_run_with_thp() prepares (PMD-sized) THP memory into different states before running tests. With the introduction of multi-size THP, we would like to reuse this logic to also test those smaller THP sizes. So let's add a thpsize parameter which tells the function what size THP it should operate on. A separate commit will utilize this change to add new tests for multi-size THP, where available. Link: https://lkml.kernel.org/r/20231207161211.2374093-10-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 121 +++++++++++++++++-------------- 1 file changed, 67 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 6f2f83990441..a284918b1172 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -32,7 +32,7 @@ static size_t pagesize; static int pagemap_fd; -static size_t thpsize; +static size_t pmdsize; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; @@ -734,7 +734,7 @@ enum thp_run { THP_RUN_PARTIAL_SHARED, }; -static void do_run_with_thp(test_fn fn, enum thp_run thp_run) +static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) { char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; size_t size, mmap_size, mremap_size; @@ -759,11 +759,11 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run) } /* - * Try to populate a THP. Touch the first sub-page and test if we get - * another sub-page populated automatically. + * Try to populate a THP. Touch the first sub-page and test if + * we get the last sub-page populated automatically. */ mem[0] = 0; - if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { ksft_test_result_skip("Did not get a THP populated\n"); goto munmap; } @@ -773,12 +773,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run) switch (thp_run) { case THP_RUN_PMD: case THP_RUN_PMD_SWAPOUT: + assert(thpsize == pmdsize); break; case THP_RUN_PTE: case THP_RUN_PTE_SWAPOUT: /* * Trigger PTE-mapping the THP by temporarily mapping a single - * subpage R/O. + * subpage R/O. This is a noop if the THP is not pmdsize (and + * therefore already PTE-mapped). */ ret = mprotect(mem + pagesize, pagesize, PROT_READ); if (ret) { @@ -875,52 +877,60 @@ munmap: munmap(mremap_mem, mremap_size); } -static void run_with_thp(test_fn fn, const char *desc) +static void run_with_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD); + ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD, size); } -static void run_with_thp_swap(test_fn fn, const char *desc) +static void run_with_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); + ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size); } -static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE); + ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE, size); } -static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size); } -static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE); + ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size); } -static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size); } -static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size); } -static void run_with_partial_shared_thp(test_fn fn, const char *desc) +static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); + ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size); } static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) @@ -1091,15 +1101,15 @@ static void run_anon_test_case(struct test_case const *test_case) run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); - if (thpsize) { - run_with_thp(test_case->fn, test_case->desc); - run_with_thp_swap(test_case->fn, test_case->desc); - run_with_pte_mapped_thp(test_case->fn, test_case->desc); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); - run_with_single_pte_of_thp(test_case->fn, test_case->desc); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); - run_with_partial_mremap_thp(test_case->fn, test_case->desc); - run_with_partial_shared_thp(test_case->fn, test_case->desc); + if (pmdsize) { + run_with_thp(test_case->fn, test_case->desc, pmdsize); + run_with_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_pte_mapped_thp(test_case->fn, test_case->desc, pmdsize); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_single_pte_of_thp(test_case->fn, test_case->desc, pmdsize); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_partial_mremap_thp(test_case->fn, test_case->desc, pmdsize); + run_with_partial_shared_thp(test_case->fn, test_case->desc, pmdsize); } for (i = 0; i < nr_hugetlbsizes; i++) run_with_hugetlb(test_case->fn, test_case->desc, @@ -1120,7 +1130,7 @@ static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; - if (thpsize) + if (pmdsize) tests += 8; return tests; } @@ -1329,7 +1339,7 @@ static void run_anon_thp_test_cases(void) { int i; - if (!thpsize) + if (!pmdsize) return; ksft_print_msg("[INFO] Anonymous THP tests\n"); @@ -1338,13 +1348,13 @@ static void run_anon_thp_test_cases(void) struct test_case const *test_case = &anon_thp_test_cases[i]; ksft_print_msg("[RUN] %s\n", test_case->desc); - do_run_with_thp(test_case->fn, THP_RUN_PMD); + do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize); } } static int tests_per_anon_thp_test_case(void) { - return thpsize ? 1 : 0; + return pmdsize ? 1 : 0; } typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); @@ -1419,7 +1429,7 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) } /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; + mmap_size = 2 * pmdsize; mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mmap_mem == MAP_FAILED) { @@ -1434,11 +1444,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) } /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); + mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); + smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1)); - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - ret |= madvise(smem, thpsize, MADV_HUGEPAGE); + ret = madvise(mem, pmdsize, MADV_HUGEPAGE); + ret |= madvise(smem, pmdsize, MADV_HUGEPAGE); if (ret) { ksft_test_result_fail("MADV_HUGEPAGE failed\n"); goto munmap; @@ -1457,7 +1467,7 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) goto munmap; } - fn(mem, smem, thpsize); + fn(mem, smem, pmdsize); munmap: munmap(mmap_mem, mmap_size); if (mmap_smem != MAP_FAILED) @@ -1650,7 +1660,7 @@ static void run_non_anon_test_case(struct non_anon_test_case const *test_case) run_with_zeropage(test_case->fn, test_case->desc); run_with_memfd(test_case->fn, test_case->desc); run_with_tmpfile(test_case->fn, test_case->desc); - if (thpsize) + if (pmdsize) run_with_huge_zeropage(test_case->fn, test_case->desc); for (i = 0; i < nr_hugetlbsizes; i++) run_with_memfd_hugetlb(test_case->fn, test_case->desc, @@ -1671,7 +1681,7 @@ static int tests_per_non_anon_test_case(void) { int tests = 3 + nr_hugetlbsizes; - if (thpsize) + if (pmdsize) tests += 1; return tests; } @@ -1683,10 +1693,13 @@ int main(int argc, char **argv) ksft_print_header(); pagesize = getpagesize(); - thpsize = read_pmd_pagesize(); - if (thpsize) + pmdsize = read_pmd_pagesize(); + if (pmdsize) { + ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", + pmdsize / 1024); ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); + pmdsize / 1024); + } nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); detect_huge_zeropage(); From c0f79103322c322ea9342d52c2d81528b7b56232 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:11 +0000 Subject: [PATCH 1035/1562] selftests/mm/cow: add tests for anonymous multi-size THP Add tests similar to the existing PMD-sized THP tests, but which operate on memory backed by (PTE-mapped) multi-size THP. This reuses all the existing infrastructure. If the test suite detects that multi-size THP is not supported by the kernel, the new tests are skipped. Link: https://lkml.kernel.org/r/20231207161211.2374093-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 82 +++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index a284918b1172..363bf5f801be 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -29,15 +29,49 @@ #include "../../../../mm/gup_test.h" #include "../kselftest.h" #include "vm_util.h" +#include "thp_settings.h" static size_t pagesize; static int pagemap_fd; static size_t pmdsize; +static int nr_thpsizes; +static size_t thpsizes[20]; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; +static int sz2ord(size_t size) +{ + return __builtin_ctzll(size / pagesize); +} + +static int detect_thp_sizes(size_t sizes[], int max) +{ + int count = 0; + unsigned long orders; + size_t kb; + int i; + + /* thp not supported at all. */ + if (!pmdsize) + return 0; + + orders = 1UL << sz2ord(pmdsize); + orders |= thp_supported_orders(); + + for (i = 0; orders && count < max; i++) { + if (!(orders & (1UL << i))) + continue; + orders &= ~(1UL << i); + kb = (pagesize >> 10) << i; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); + } + + return count; +} + static void detect_huge_zeropage(void) { int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", @@ -1101,15 +1135,27 @@ static void run_anon_test_case(struct test_case const *test_case) run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); - if (pmdsize) { - run_with_thp(test_case->fn, test_case->desc, pmdsize); - run_with_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_pte_mapped_thp(test_case->fn, test_case->desc, pmdsize); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_single_pte_of_thp(test_case->fn, test_case->desc, pmdsize); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_partial_mremap_thp(test_case->fn, test_case->desc, pmdsize); - run_with_partial_shared_thp(test_case->fn, test_case->desc, pmdsize); + for (i = 0; i < nr_thpsizes; i++) { + size_t size = thpsizes[i]; + struct thp_settings settings = *thp_current_settings(); + + settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; + settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + thp_push_settings(&settings); + + if (size == pmdsize) { + run_with_thp(test_case->fn, test_case->desc, size); + run_with_thp_swap(test_case->fn, test_case->desc, size); + } + + run_with_pte_mapped_thp(test_case->fn, test_case->desc, size); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size); + run_with_partial_mremap_thp(test_case->fn, test_case->desc, size); + run_with_partial_shared_thp(test_case->fn, test_case->desc, size); + + thp_pop_settings(); } for (i = 0; i < nr_hugetlbsizes; i++) run_with_hugetlb(test_case->fn, test_case->desc, @@ -1130,8 +1176,9 @@ static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; + tests += 6 * nr_thpsizes; if (pmdsize) - tests += 8; + tests += 2; return tests; } @@ -1689,16 +1736,22 @@ static int tests_per_non_anon_test_case(void) int main(int argc, char **argv) { int err; + struct thp_settings default_settings; ksft_print_header(); pagesize = getpagesize(); pmdsize = read_pmd_pagesize(); if (pmdsize) { + /* Only if THP is supported. */ + thp_read_settings(&default_settings); + default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; + thp_save_settings(); + thp_push_settings(&default_settings); + ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", pmdsize / 1024); - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - pmdsize / 1024); + nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); } nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); @@ -1717,6 +1770,11 @@ int main(int argc, char **argv) run_anon_thp_test_cases(); run_non_anon_test_cases(); + if (pmdsize) { + /* Only if THP is supported. */ + thp_restore_settings(); + } + err = ksft_get_fail_cnt(); if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", From 03d69d49da496e31246f41a017b32b68b9d2362e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 8 Dec 2023 10:04:50 +0800 Subject: [PATCH 1036/1562] maple_tree: fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. ./tools/testing/radix-tree/maple.c:34142:15-16: WARNING comparing pointer to 0. Link: https://lkml.kernel.org/r/20231208020450.7003-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7696 Signed-off-by: Jiapeng Chong Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 35cc8c2a10f4..f1caf4bcf937 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -34139,7 +34139,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set27, ARRAY_SIZE(set27)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 140415537422336)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 140415537422336)); mt_set_non_kernel(0); mt_validate(mt); mtree_destroy(mt); @@ -34263,7 +34263,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set37, ARRAY_SIZE(set37)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 94637033459712)); mt_validate(mt); mtree_destroy(mt); @@ -34271,7 +34271,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set38, ARRAY_SIZE(set38)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 94637033459712)); mt_validate(mt); mtree_destroy(mt); From d5f6057cf0018dc8863239fc3142b8509b9221cf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 9 Dec 2023 22:38:39 -0800 Subject: [PATCH 1037/1562] maple_tree: fix typos/spellos etc Fix typos/grammar and spellos in documentation. Link: https://lkml.kernel.org/r/20231210063839.29967-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Matthew Wilcox (Oracle) Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4a69148963e0..c9a970ea20dd 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -16,8 +16,8 @@ * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to - * indicate that the tree is specifying ranges, Pivots may appear in the - * subtree with an entry attached to the value where as keys are unique to a + * indicate that the tree is specifying ranges. Pivots may appear in the + * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * @@ -2500,7 +2500,7 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, } /* - * mas_topiary_node() - Dispose of a singe node + * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @enode: The encoded maple node * @in_rcu: If the tree is in rcu mode @@ -5492,7 +5492,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) goto ask_now; } - /* New root needs a singe node */ + /* New root needs a single node */ if (unlikely(mte_is_root(mas->node))) goto ask_now; From 306abb63a8cab566bf80860c5430b1fa316646b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:06 +0000 Subject: [PATCH 1038/1562] selftests/damon: implement a python module for test-purpose DAMON sysfs controls Patch series "selftests/damon: add Python-written DAMON functionality tests", v2. DAMON exports most of its functionality via its sysfs interface. Hence most DAMON functionality tests could be implemented using the interface. However, because the interfaces require simple but multiple operations for many controls, writing all such tests from the scratch could be repetitive and time consuming. Implement a minimum DAMON sysfs control module, and a couple of DAMON functionality tests using the control module. The first test is for ensuring minimum accuracy of data access monitoring, and the second test is for finding if a previously found and fixed bug is introduced again. Note that the DAMON sysfs control module is only for avoiding duplicating code in tests. For convenient and general control of DAMON, users should use DAMON user-space tools that developed for the purpose, such as damo[1]. [1] https://github.com/damonitor/damo Patches Sequence ---------------- This patchset is constructed with five patches. The first three patches implement a Python-written test implementation-purpose DAMON sysfs control module. The implementation is incrementally done in the sequence of the basic data structure (first patch) first, kdamonds start command (second patch) next, and finally DAMOS tried bytes update command (third patch). Then two patches for implementing selftests using the module follows. The fourth patch implements a basic functionality test of DAMON for working set estimation accuracy. Finally, the fifth patch implements a corner case test for a previously found bug. This patch (of 5): Implement a python module for DAMON sysfs controls. The module is aimed to be useful for writing DAMON functionality tests in future. Nonetheless, this module is only representing a subset of DAMON sysfs files. Following commits will implement more DAMON sysfs controls. Link: https://lkml.kernel.org/r/20231212194810.54457-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231212194810.54457-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 tools/testing/selftests/damon/_damon_sysfs.py diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py new file mode 100644 index 000000000000..78101846ab66 --- /dev/null +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0 + +class DamosAccessPattern: + size = None + nr_accesses = None + age = None + scheme = None + + def __init__(self, size=None, nr_accesses=None, age=None): + self.size = size + self.nr_accesses = nr_accesses + self.age = age + + if self.size == None: + self.size = [0, 2**64 - 1] + if self.nr_accesses == None: + self.nr_accesses = [0, 2**64 - 1] + if self.age == None: + self.age = [0, 2**64 - 1] + +class Damos: + action = None + access_pattern = None + # todo: Support quotas, watermarks, stats, tried_regions + idx = None + context = None + + def __init__(self, action='stat', access_pattern=DamosAccessPattern()): + self.action = action + self.access_pattern = access_pattern + self.access_pattern.scheme = self + +class DamonTarget: + pid = None + # todo: Support target regions if test is made + idx = None + context = None + + def __init__(self, pid): + self.pid = pid + +class DamonAttrs: + sample_us = None + aggr_us = None + update_us = None + min_nr_regions = None + max_nr_regions = None + context = None + + def __init__(self, sample_us=5000, aggr_us=100000, update_us=1000000, + min_nr_regions=10, max_nr_regions=1000): + self.sample_us = sample_us + self.aggr_us = aggr_us + self.update_us = update_us + self.min_nr_regions = min_nr_regions + self.max_nr_regions = max_nr_regions + +class DamonCtx: + ops = None + monitoring_attrs = None + targets = None + schemes = None + kdamond = None + idx = None + + def __init__(self, ops='paddr', monitoring_attrs=DamonAttrs(), targets=[], + schemes=[]): + self.ops = ops + self.monitoring_attrs = monitoring_attrs + self.monitoring_attrs.context = self + + self.targets = targets + for idx, target in enumerate(self.targets): + target.idx = idx + target.context = self + + self.schemes = schemes + for idx, scheme in enumerate(self.schemes): + scheme.idx = idx + scheme.context = self + +class Kdamond: + state = None + pid = None + contexts = None + idx = None # index of this kdamond between siblings + kdamonds = None # parent + + def __init__(self, contexts=[]): + self.contexts = contexts + for idx, context in enumerate(self.contexts): + context.idx = idx + context.kdamond = self + +class Kdamonds: + kdamonds = [] + + def __init__(self, kdamonds=[]): + self.kdamonds = kdamonds + for idx, kdamond in enumerate(self.kdamonds): + kdamond.idx = idx + kdamond.kdamonds = self From f5f0e5a2bef9e46f7a674b71d7f2a4c4b7e6bc5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:07 +0000 Subject: [PATCH 1039/1562] selftests/damon/_damon_sysfs: implement kdamonds start function Extend the tests-writing-purpose DAMON sysfs control module to support the kdamonds start functionality. Link: https://lkml.kernel.org/r/20231212194810.54457-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 78101846ab66..6b99f87a5f1e 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -1,5 +1,28 @@ # SPDX-License-Identifier: GPL-2.0 +import os + +sysfs_root = '/sys/kernel/mm/damon/admin' + +def write_file(path, string): + "Returns error string if failed, or None otherwise" + string = '%s' % string + try: + with open(path, 'w') as f: + f.write(string) + except Exception as e: + return '%s' % e + return None + +def read_file(path): + '''Returns the read content and error string. The read content is None if + the reading failed''' + try: + with open(path, 'r') as f: + return f.read(), None + except Exception as e: + return None, '%s' % e + class DamosAccessPattern: size = None nr_accesses = None @@ -18,6 +41,35 @@ class DamosAccessPattern: if self.age == None: self.age = [0, 2**64 - 1] + def sysfs_dir(self): + return os.path.join(self.scheme.sysfs_dir(), 'access_pattern') + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), + self.nr_accesses[0]) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), + self.nr_accesses[1]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'age', 'max'), self.age[1]) + if err != None: + return err + class Damos: action = None access_pattern = None @@ -30,6 +82,39 @@ class Damos: self.access_pattern = access_pattern self.access_pattern.scheme = self + def sysfs_dir(self): + return os.path.join( + self.context.sysfs_dir(), 'schemes', '%d' % self.idx) + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) + if err != None: + return err + err = self.access_pattern.stage() + if err != None: + return err + + # disable quotas + err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0') + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0') + if err != None: + return err + + # disable watermarks + err = write_file( + os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') + if err != None: + return err + + # disable filters + err = write_file( + os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') + if err != None: + return err + class DamonTarget: pid = None # todo: Support target regions if test is made @@ -39,6 +124,18 @@ class DamonTarget: def __init__(self, pid): self.pid = pid + def sysfs_dir(self): + return os.path.join( + self.context.sysfs_dir(), 'targets', '%d' % self.idx) + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') + if err != None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + class DamonAttrs: sample_us = None aggr_us = None @@ -55,6 +152,40 @@ class DamonAttrs: self.min_nr_regions = min_nr_regions self.max_nr_regions = max_nr_regions + def interval_sysfs_dir(self): + return os.path.join(self.context.sysfs_dir(), 'monitoring_attrs', + 'intervals') + + def nr_regions_range_sysfs_dir(self): + return os.path.join(self.context.sysfs_dir(), 'monitoring_attrs', + 'nr_regions') + + def stage(self): + err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), + self.sample_us) + if err != None: + return err + err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), + self.aggr_us) + if err != None: + return err + err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), + self.update_us) + if err != None: + return err + + err = write_file( + os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), + self.min_nr_regions) + if err != None: + return err + + err = write_file( + os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), + self.max_nr_regions) + if err != None: + return err + class DamonCtx: ops = None monitoring_attrs = None @@ -79,6 +210,46 @@ class DamonCtx: scheme.idx = idx scheme.context = self + def sysfs_dir(self): + return os.path.join(self.kdamond.sysfs_dir(), 'contexts', + '%d' % self.idx) + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'operations'), self.ops) + if err != None: + return err + err = self.monitoring_attrs.stage() + if err != None: + return err + + nr_targets_file = os.path.join( + self.sysfs_dir(), 'targets', 'nr_targets') + content, err = read_file(nr_targets_file) + if err != None: + return err + if int(content) != len(self.targets): + err = write_file(nr_targets_file, '%d' % len(self.targets)) + if err != None: + return err + for target in self.targets: + err = target.stage() + if err != None: + return err + + nr_schemes_file = os.path.join( + self.sysfs_dir(), 'schemes', 'nr_schemes') + content, err = read_file(nr_schemes_file) + if int(content) != len(self.schemes): + err = write_file(nr_schemes_file, '%d' % len(self.schemes)) + if err != None: + return err + for scheme in self.schemes: + err = scheme.stage() + if err != None: + return err + return None + class Kdamond: state = None pid = None @@ -92,6 +263,27 @@ class Kdamond: context.idx = idx context.kdamond = self + def sysfs_dir(self): + return os.path.join(self.kdamonds.sysfs_dir(), '%d' % self.idx) + + def start(self): + nr_contexts_file = os.path.join(self.sysfs_dir(), + 'contexts', 'nr_contexts') + content, err = read_file(nr_contexts_file) + if err != None: + return err + if int(content) != len(self.contexts): + err = write_file(nr_contexts_file, '%d' % len(self.contexts)) + if err != None: + return err + + for context in self.contexts: + err = context.stage() + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') + return err + class Kdamonds: kdamonds = [] @@ -100,3 +292,17 @@ class Kdamonds: for idx, kdamond in enumerate(self.kdamonds): kdamond.idx = idx kdamond.kdamonds = self + + def sysfs_dir(self): + return os.path.join(sysfs_root, 'kdamonds') + + def start(self): + err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), + '%s' % len(self.kdamonds)) + if err != None: + return err + for kdamond in self.kdamonds: + err = kdamond.start() + if err != None: + return err + return None From 3402c6ce398e33bf1733f619756dd068ca2e2aa5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:08 +0000 Subject: [PATCH 1040/1562] selftests/damon/_damon_sysfs: implement updat_schemes_tried_bytes command Implement update_schemes_tried_bytes command of DAMON sysfs interface in _damon_sysfs.py. It is not only making the update, but also read the updated value from the sysfs interface and store it in the Kdamond python objects so that the user of the module can easily get the value. Link: https://lkml.kernel.org/r/20231212194810.54457-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 6b99f87a5f1e..e98cf4b6a4b7 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -76,6 +76,7 @@ class Damos: # todo: Support quotas, watermarks, stats, tried_regions idx = None context = None + tried_bytes = None def __init__(self, action='stat', access_pattern=DamosAccessPattern()): self.action = action @@ -284,6 +285,19 @@ class Kdamond: err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err + def update_schemes_tried_bytes(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_tried_bytes') + if err != None: + return err + for context in self.contexts: + for scheme in context.schemes: + content, err = read_file(os.path.join(scheme.sysfs_dir(), + 'tried_regions', 'total_bytes')) + if err != None: + return err + scheme.tried_bytes = int(content) + class Kdamonds: kdamonds = [] From b5906f5f7359f561c5915dc146ced1bc2733401c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:09 +0000 Subject: [PATCH 1041/1562] selftests/damon: add a test for update_schemes_tried_regions sysfs command Add a selftest for verifying the accuracy of DAMON's access monitoring functionality. The test starts a program of artificial access pattern, monitor the access pattern using DAMON, and check if DAMON finds expected amount of hot data region (working set size) with only acceptable error rate. Note that the acceptable error rate is set with only naive assumptions and small number of tests. Hence failures of the test may not always mean DAMON is broken. Rather than that, those could be a signal to better understand the real accuracy level of DAMON in wider environments. Based on further finding, we could optimize DAMON or adjust the expectation of the test. Link: https://lkml.kernel.org/r/20231212194810.54457-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 + tools/testing/selftests/damon/access_memory.c | 41 ++++++++++++++ ...te_schemes_tried_regions_wss_estimation.py | 55 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 tools/testing/selftests/damon/access_memory.c create mode 100644 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index b71247ba7196..90ffafc42c5e 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -2,6 +2,7 @@ # Makefile for damon selftests TEST_GEN_FILES += huge_count_read_write +TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh @@ -9,6 +10,7 @@ TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c new file mode 100644 index 000000000000..585a2fa54329 --- /dev/null +++ b/tools/testing/selftests/damon/access_memory.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Artificial memory access program for testing DAMON. + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + char **regions; + clock_t start_clock; + int nr_regions; + int sz_region; + int access_time_ms; + int i; + + if (argc != 4) { + printf("Usage: %s