From 07bcd733f667fedd323217e7facdb1bd0b9ec6a4 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Fri, 20 Sep 2024 20:20:30 +0800 Subject: [PATCH 1/8] UPSTREAM: mm/page_alloc: add some detailed comments in can_steal_fallback mm/page_alloc: add some detailed comments in can_steal_fallback [akpm@linux-foundation.org: tweak grammar, fit to 80 cols] Link: https://lkml.kernel.org/r/20240920122030.159751-1-gxxa03070307@gmail.com Signed-off-by: gaoxiang17 Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit 6025ea5abbe5d813d6a41c78e6ea14259fb503f4) Change-Id: Ib4a77bf96edeba6ce2c6627c99aacaf148b07d92 Signed-off-by: yipeng xiang --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6298a40902fc..dbf1a1faa40d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2100,6 +2100,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. + */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || From 1332d864bb1800afcc3921d16a41bf1f55b22b95 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:24 -0500 Subject: [PATCH 2/8] UPSTREAM: mm: page_alloc: don't steal single pages from biggest buddy The fallback code searches for the biggest buddy first in an attempt to steal the whole block and encourage type grouping down the line. The approach used to be this: - Non-movable requests will split the largest buddy and steal the remainder. 
This splits up contiguity, but it allows subsequent requests of this type to fall back into adjacent space. - Movable requests go and look for the smallest buddy instead. The thinking is that movable requests can be compacted, so grouping is less important than retaining contiguity. c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") enforces freelist type hygiene, which restricts stealing to either claiming the whole block or just taking the requested chunk; no additional pages or buddy remainders can be stolen any more. The patch mishandled when to switch to finding the smallest buddy in that new reality. As a result, it may steal the exact request size, but from the biggest buddy. This causes fracturing for no good reason. Fix this by committing to the new behavior: either steal the whole block, or fall back to the smallest buddy. Remove single-page stealing from steal_suitable_fallback(). Rename it to try_to_steal_block() to make the intentions clear. If this fails, always fall back to the smallest buddy. The following is from 4 runs of mmtest's thpchallenge. "Pollute" is single page fallback, "steal" is conversion of a partially used block. The numbers for free block conversions (omitted) are comparable. vanilla patched @pollute[unmovable from reclaimable]: 27 106 @pollute[unmovable from movable]: 82 46 @pollute[reclaimable from unmovable]: 256 83 @pollute[reclaimable from movable]: 46 8 @pollute[movable from unmovable]: 4841 868 @pollute[movable from reclaimable]: 5278 12568 @steal[unmovable from reclaimable]: 11 12 @steal[unmovable from movable]: 113 49 @steal[reclaimable from unmovable]: 19 34 @steal[reclaimable from movable]: 47 21 @steal[movable from unmovable]: 250 183 @steal[movable from reclaimable]: 81 93 The allocator appears to do a better job at keeping stealing and polluting to the first fallback preference. 
As a result, the numbers for "from movable" - the least preferred fallback option, and most detrimental to compactability - are down across the board. Link: https://lkml.kernel.org/r/20250225001023.1494422-2-hannes@cmpxchg.org Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit c2f6ea38fc1b640aa7a2e155cc1c0410ff91afa2) Change-Id: I44a62580f1fcb53a2baff6ce3a8af08e9a20fdc0 Signed-off-by: yipeng xiang --- mm/page_alloc.c | 80 +++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbf1a1faa40d..854b035aef1b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2159,13 +2159,12 @@ static inline bool boost_watermark(struct zone *zone) * can claim the whole pageblock for the requested migratetype. If not, we check * the pageblock for constituent pages; if at least half of the pages are free * or compatible, we can still claim the whole block, so pages freed in the - * future will be put on the correct free list. Otherwise, we isolate exactly - * the order we need from the fallback block and leave its migratetype alone. + * future will be put on the correct free list. */ static struct page * -steal_suitable_fallback(struct zone *zone, struct page *page, - int current_order, int order, int start_type, - unsigned int alloc_flags, bool whole_block) +try_to_steal_block(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + unsigned int alloc_flags) { int free_pages, movable_pages, alike_pages; unsigned long start_pfn; @@ -2178,7 +2177,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, * highatomic accounting. 
*/ if (is_migrate_highatomic(block_type)) - goto single_page; + return NULL; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -2199,14 +2198,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page, if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - /* We are not allowed to try stealing from the whole block */ - if (!whole_block) - goto single_page; - /* moving whole block can fail due to zone boundary conditions */ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, &movable_pages)) - goto single_page; + return NULL; /* * Determine how many pages are compatible with our allocation. @@ -2239,9 +2234,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, return __rmqueue_smallest(zone, order, start_type); } -single_page: - page_del_and_expand(zone, page, order, current_order, block_type); - return page; + return NULL; } /* @@ -2433,14 +2426,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, } /* - * Try finding a free buddy page on the fallback list and put it on the free - * list of requested migratetype, possibly along with other pages from the same - * block, depending on fragmentation avoidance heuristics. Returns true if - * fallback was found so that __rmqueue_smallest() can grab it. + * Try finding a free buddy page on the fallback list. + * + * This will attempt to steal a whole pageblock for the requested type + * to ensure grouping of such requests in the future. + * + * If a whole block cannot be stolen, regress to __rmqueue_smallest() + * logic to at least break up as little contiguity as possible. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. + * + * Return the stolen page, or NULL if none can be found. 
*/ static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, @@ -2474,45 +2472,35 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, if (fallback_mt == -1) continue; - /* - * We cannot steal all free pages from the pageblock and the - * requested migratetype is movable. In that case it's better to - * steal and split the smallest available page instead of the - * largest available page, because even if the next movable - * allocation falls back into a different pageblock than this - * one, it won't cause permanent fragmentation. - */ - if (!can_steal && start_migratetype == MIGRATE_MOVABLE - && current_order > order) - goto find_smallest; + if (!can_steal) + break; - goto do_steal; + page = get_page_from_free_area(area, fallback_mt); + page = try_to_steal_block(zone, page, current_order, order, + start_migratetype, alloc_flags); + if (page) + goto got_one; } - return NULL; + if (alloc_flags & ALLOC_NOFRAGMENT) + return NULL; -find_smallest: + /* No luck stealing blocks. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal); - if (fallback_mt != -1) - break; + if (fallback_mt == -1) + continue; + + page = get_page_from_free_area(area, fallback_mt); + page_del_and_expand(zone, page, order, current_order, fallback_mt); + goto got_one; } - /* - * This should not happen - we already found a suitable fallback - * when looking for the largest page. 
- */ - VM_BUG_ON(current_order > MAX_PAGE_ORDER); - -do_steal: - page = get_page_from_free_area(area, fallback_mt); - - /* take off list, maybe claim block, expand remainder */ - page = steal_suitable_fallback(zone, page, current_order, order, - start_migratetype, alloc_flags, can_steal); + return NULL; +got_one: trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); From dcc9c46651f7301c9eeac2477341c84461fd9598 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:25 -0500 Subject: [PATCH 3/8] UPSTREAM: mm: page_alloc: remove remnants of unlocked migratetype updates The freelist hygiene patches made migratetype accesses fully protected under the zone->lock. Remove remnants of handling the race conditions that existed before from the MIGRATE_HIGHATOMIC code. Link: https://lkml.kernel.org/r/20250225001023.1494422-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit 020396a581dc69be2d30939fabde6c029d847034) Change-Id: Ia1266c34f09db1c404df7f37c1a9ff06d61c0cce Signed-off-by: yipeng xiang --- mm/page_alloc.c | 50 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 854b035aef1b..cbbe5b9d563c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2164,20 +2164,10 @@ static inline bool boost_watermark(struct zone *zone) static struct page * try_to_steal_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, - unsigned int alloc_flags) + int block_type, unsigned int alloc_flags) { int free_pages, movable_pages, alike_pages; unsigned long start_pfn; - int block_type; - - block_type = get_pageblock_migratetype(page); - - /* - * This can happen due to races and we want to prevent broken - * highatomic accounting. 
- */ - if (is_migrate_highatomic(block_type)) - return NULL; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -2362,33 +2352,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); - int mt; + unsigned long size; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; - mt = get_pageblock_migratetype(page); /* - * In page freeing path, migratetype change is racy so - * we can counter several free pages in a pageblock - * in this loop although we changed the pageblock type - * from highatomic to ac->migratetype. So we should - * adjust the count once. + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. */ - if (is_migrate_highatomic(mt)) { - unsigned long size; - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. - */ - size = max(pageblock_nr_pages, 1UL << order); - size = min(size, zone->nr_reserved_highatomic); - zone->nr_reserved_highatomic -= size; - } + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; /* * Convert to ac->migratetype and avoid the normal @@ -2400,10 +2379,12 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. 
*/ if (order < pageblock_order) - ret = move_freepages_block(zone, page, mt, + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, ac->migratetype); else { - move_to_free_list(page, zone, order, mt, + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, ac->migratetype); change_pageblock_range(page, order, ac->migratetype); @@ -2477,7 +2458,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = get_page_from_free_area(area, fallback_mt); page = try_to_steal_block(zone, page, current_order, order, - start_migratetype, alloc_flags); + start_migratetype, fallback_mt, + alloc_flags); if (page) goto got_one; } From 6154d6314c67c8d6f4db76b2fce43d94e7fc017a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:26 -0500 Subject: [PATCH 4/8] BACKPORT: mm: page_alloc: group fallback functions together The way the fallback rules are spread out makes them hard to follow. Move the functions next to each other at least. Link: https://lkml.kernel.org/r/20250225001023.1494422-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit a4138a2702a4428317ecdb115934554df4b788b4) [ 1. In the original patch of the find_suitable_fallback function, replace MIGRATE_PCPTYPES with MIGRATE_FALLBACKS; 2. Keep the hook calls in the reserve_highatomic_pageblock and unreserve_highatomic_pageblock functions.
] Change-Id: I069e8dd7f8b009c686daef4459f9f1452b3f4c2c Signed-off-by: yipeng xiang --- mm/page_alloc.c | 413 ++++++++++++++++++++++++------------------------ 1 file changed, 207 insertions(+), 206 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbbe5b9d563c..0098f32837f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2076,6 +2076,43 @@ static void change_pageblock_range(struct page *pageblock_page, } } +static inline bool boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return false; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return false; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return false; + + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); + + return true; +} + /* * When we are falling back to another migratetype during allocation, try to * steal extra free pages from the same pageblocks to satisfy further @@ -2117,41 +2154,38 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline bool boost_watermark(struct zone *zone) +/* + * Check whether there is a suitable fallback freepage with requested order. 
+ * If only_stealable is true, this function returns fallback_mt only if + * we can steal other freepages all together. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. + */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal) { - unsigned long max_boost; + int i; + int fallback_mt; - if (!watermark_boost_factor) - return false; - /* - * Don't bother in zones that are unlikely to produce results. - * On small machines, including kdump capture kernels running - * in a small area, boosting the watermark can cause an out of - * memory situation immediately. - */ - if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return false; + if (area->nr_free == 0) + return -1; - max_boost = mult_frac(zone->_watermark[WMARK_HIGH], - watermark_boost_factor, 10000); + *can_steal = false; + for (i = 0; i < MIGRATE_FALLBACKS - 1 ; i++) { + fallback_mt = fallbacks[migratetype][i]; + if (free_area_empty(area, fallback_mt)) + continue; - /* - * high watermark may be uninitialised if fragmentation occurs - * very early in boot so do not boost. We do not fall - * through and boost by pageblock_nr_pages as failing - * allocations that early means that reclaim is not going - * to help and it may even be impossible to reclaim the - * boosted watermark resulting in a hang. - */ - if (!max_boost) - return false; + if (can_steal_fallback(order, migratetype)) + *can_steal = true; - max_boost = max(pageblock_nr_pages, max_boost); + if (!only_stealable) + return fallback_mt; - zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, - max_boost); + if (*can_steal) + return fallback_mt; + } - return true; + return -1; } /* @@ -2227,184 +2261,6 @@ try_to_steal_block(struct zone *zone, struct page *page, return NULL; } -/* - * Check whether there is a suitable fallback freepage with requested order. 
- * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce - * fragmentation due to mixed migratetype pages in one pageblock. - */ -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) -{ - int i; - int fallback_mt; - - if (area->nr_free == 0) - return -1; - - *can_steal = false; - for (i = 0; i < MIGRATE_FALLBACKS - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; - - if (can_steal_fallback(order, migratetype)) - *can_steal = true; - - if (!only_stealable) - return fallback_mt; - - if (*can_steal) - return fallback_mt; - } - - return -1; -} - -/* - * Reserve the pageblock(s) surrounding an allocation request for - * exclusive use of high-order atomic allocations if there are no - * empty page blocks that contain a page with a suitable order - */ -static void reserve_highatomic_pageblock(struct page *page, int order, - struct zone *zone) -{ - int mt; - unsigned long max_managed, flags; - bool bypass = false; - - /* - * The number reserved as: minimum is 1 pageblock, maximum is - * roughly 1% of a zone. But if 1% of a zone falls below a - * pageblock size, then don't reserve any pageblocks. - * Check is race-prone but harmless. - */ - if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) - return; - max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); - if (zone->nr_reserved_highatomic >= max_managed) - return; - trace_android_vh_reserve_highatomic_bypass(page, &bypass); - if (bypass) - return; - - spin_lock_irqsave(&zone->lock, flags); - - /* Recheck the nr_reserved_highatomic limit under the lock */ - if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; - - /* Yoink! 
*/ - mt = get_pageblock_migratetype(page); - /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (!migratetype_is_mergeable(mt)) - goto out_unlock; - - if (order < pageblock_order) { - if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; - zone->nr_reserved_highatomic += pageblock_nr_pages; - } else { - change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); - zone->nr_reserved_highatomic += 1 << order; - } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); -} - -/* - * Used when an allocation is about to fail under memory pressure. This - * potentially hurts the reliability of high-order allocations when under - * intense memory pressure but failed atomic allocations should be easier - * to recover from than an OOM. - * - * If @force is true, try to unreserve pageblocks even though highatomic - * pageblock is exhausted. - */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, - bool force) -{ - struct zonelist *zonelist = ac->zonelist; - unsigned long flags; - struct zoneref *z; - struct zone *zone; - struct page *page; - int order; - int ret; - bool skip_unreserve_highatomic = false; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, - ac->nodemask) { - /* - * Preserve at least one pageblock unless memory pressure - * is really high. 
- */ - if (!force && zone->nr_reserved_highatomic <= - pageblock_nr_pages) - continue; - - trace_android_vh_unreserve_highatomic_bypass(force, zone, - &skip_unreserve_highatomic); - if (skip_unreserve_highatomic) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < NR_PAGE_ORDERS; order++) { - struct free_area *area = &(zone->free_area[order]); - unsigned long size; - - page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); - if (!page) - continue; - - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. - */ - size = max(pageblock_nr_pages, 1UL << order); - size = min(size, zone->nr_reserved_highatomic); - zone->nr_reserved_highatomic -= size; - - /* - * Convert to ac->migratetype and avoid the normal - * pageblock stealing heuristics. Minimally, the caller - * is doing the work and needs the pages. More - * importantly, if the block was always converted to - * MIGRATE_UNMOVABLE or another type then the number - * of pageblocks that cannot be completely freed - * may increase. - */ - if (order < pageblock_order) - ret = move_freepages_block(zone, page, - MIGRATE_HIGHATOMIC, - ac->migratetype); - else { - move_to_free_list(page, zone, order, - MIGRATE_HIGHATOMIC, - ac->migratetype); - change_pageblock_range(page, order, - ac->migratetype); - ret = 1; - } - /* - * Reserving the block(s) already succeeded, - * so this should not fail on zone boundaries. - */ - WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } - - return false; -} /* * Try finding a free buddy page on the fallback list. 
@@ -3397,6 +3253,151 @@ out: return page; } +/* + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) +{ + int mt; + unsigned long max_managed, flags; + bool bypass = false; + + /* + * The number reserved as: minimum is 1 pageblock, maximum is + * roughly 1% of a zone. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. + * Check is race-prone but harmless. + */ + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); + if (zone->nr_reserved_highatomic >= max_managed) + return; + trace_android_vh_reserve_highatomic_bypass(page, &bypass); + if (bypass) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. 
+ * + * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + int ret; + bool skip_unreserve_highatomic = false; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) + continue; + + trace_android_vh_unreserve_highatomic_bypass(force, zone, + &skip_unreserve_highatomic); + if (skip_unreserve_highatomic) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); + unsigned long size; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. 
+ */ + if (order < pageblock_order) + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, + ac->migratetype); + else { + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } + /* + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + } + + return false; +} + static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { From ebbdcd4a72d48e933d00ab70511136a02c8b4279 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 28 Feb 2025 09:52:17 +0000 Subject: [PATCH 5/8] UPSTREAM: mm/page_alloc: clarify terminology in migratetype fallback code Patch series "mm/page_alloc: Some clarifications for migratetype fallback", v4. A couple of patches to try and make the code easier to follow. This patch (of 2): This code is rather confusing because: 1. "Steal" is sometimes used to refer to the general concept of allocating from a block of a fallback migratetype (steal_suitable_fallback()) but sometimes it refers specifically to converting a whole block's migratetype (can_steal_fallback()). 2. can_steal_fallback() sounds as though it's answering the question "am I functionally permitted to allocate from that other type" but in fact it is encoding a heuristic preference. 3. The same piece of data has different names in different places: can_steal vs whole_block. This reinforces point 2 because it looks like the different names reflect a shift in intent from "am I allowed to steal" to "do I want to steal", but no such shift exists. Fix 1. by avoiding the term "steal" in ambiguous contexts. Start using the term "claim" to refer to the special case of stealing the entire block. Fix 2.
by using "should" instead of "can", and also rename its parameters and add some commentary to make it more explicit what they mean. Fix 3. by adopting the new "claim" terminology universally for this set of variables. Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-0-cb2ef1a4e610@google.com Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-1-cb2ef1a4e610@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Yosry Ahmed Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit e47f1f56dd82cc6d91f5c4d914a534aa03cd12ca) Change-Id: I8f1b57aebf308f378f50cd1381f31d249362078e Signed-off-by: yipeng xiang --- mm/compaction.c | 4 +-- mm/internal.h | 2 +- mm/page_alloc.c | 72 ++++++++++++++++++++++++------------------------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 263fd54821bc..41f511fbb685 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2359,7 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool can_steal; + bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2376,7 +2376,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) + true, &claim_block) != -1) /* * Movable pages are OK in any pageblock. 
If we are * stealing for a non-movable allocation, make sure diff --git a/mm/internal.h b/mm/internal.h index 19579278587a..9ef38b415c5b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -834,7 +834,7 @@ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal); + int migratetype, bool claim_only, bool *claim_block); static inline bool free_area_empty(struct free_area *area, int migratetype) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0098f32837f2..3e2714c4f515 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2115,22 +2115,22 @@ static inline bool boost_watermark(struct zone *zone) /* * When we are falling back to another migratetype during allocation, try to - * steal extra free pages from the same pageblocks to satisfy further - * allocations, instead of polluting multiple pageblocks. + * claim entire blocks to satisfy further allocations, instead of polluting + * multiple pageblocks. * - * If we are stealing a relatively large buddy page, it is likely there will - * be more free pages in the pageblock, so try to steal them all. For - * reclaimable and unmovable allocations, we steal regardless of page size, - * as fragmentation caused by those allocations polluting movable pageblocks - * is worse than movable allocations stealing from unmovable and reclaimable - * pageblocks. + * If we are stealing a relatively large buddy page, it is likely there will be + * more free pages in the pageblock, so try to claim the whole block. For + * reclaimable and unmovable allocations, we try to claim the whole block + * regardless of page size, as fragmentation caused by those allocations + * polluting movable pageblocks is worse than movable allocations stealing from + * unmovable and reclaimable pageblocks. 
*/ -static bool can_steal_fallback(unsigned int order, int start_mt) +static bool should_try_claim_block(unsigned int order, int start_mt) { /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that - * we can actually steal whole pageblock if this condition met, + * we can actually claim the whole pageblock if this condition met, * but, below check doesn't guarantee it and that is just heuristic * so could be changed anytime. */ @@ -2143,7 +2143,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * reclaimable pages that are closest to the request size. After a * while, memory compaction may occur to form large contiguous pages, * and the next movable allocation may not need to steal. Unmovable and - * reclaimable allocations need to actually steal pages. + * reclaimable allocations need to actually claim the whole block. */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || @@ -2156,12 +2156,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce + * Sets *claim_block to instruct the caller whether it should convert a whole + * pageblock to the returned migratetype. + * If only_claim is true, this function returns fallback_mt only if + * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
*/ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) + int migratetype, bool only_claim, bool *claim_block) { int i; int fallback_mt; @@ -2169,19 +2171,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (area->nr_free == 0) return -1; - *can_steal = false; + *claim_block = false; for (i = 0; i < MIGRATE_FALLBACKS - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; if (free_area_empty(area, fallback_mt)) continue; - if (can_steal_fallback(order, migratetype)) - *can_steal = true; + if (should_try_claim_block(order, migratetype)) + *claim_block = true; - if (!only_stealable) - return fallback_mt; - - if (*can_steal) + if (*claim_block || !only_claim) return fallback_mt; } @@ -2189,14 +2188,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * This function implements actual steal behaviour. If order is large enough, we - * can claim the whole pageblock for the requested migratetype. If not, we check - * the pageblock for constituent pages; if at least half of the pages are free - * or compatible, we can still claim the whole block, so pages freed in the - * future will be put on the correct free list. + * This function implements actual block claiming behaviour. If order is large + * enough, we can claim the whole pageblock for the requested migratetype. If + * not, we check the pageblock for constituent pages; if at least half of the + * pages are free or compatible, we can still claim the whole block, so pages + * freed in the future will be put on the correct free list. 
*/ static struct page * -try_to_steal_block(struct zone *zone, struct page *page, +try_to_claim_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, int block_type, unsigned int alloc_flags) { @@ -2265,11 +2264,12 @@ try_to_steal_block(struct zone *zone, struct page *page, /* * Try finding a free buddy page on the fallback list. * - * This will attempt to steal a whole pageblock for the requested type + * This will attempt to claim a whole pageblock for the requested type * to ensure grouping of such requests in the future. * - * If a whole block cannot be stolen, regress to __rmqueue_smallest() - * logic to at least break up as little contiguity as possible. + * If a whole block cannot be claimed, steal an individual page, regressing to + * __rmqueue_smallest() logic to at least break up as little contiguity as + * possible. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop @@ -2286,7 +2286,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool can_steal; + bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2305,15 +2305,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, false, &claim_block); if (fallback_mt == -1) continue; - if (!can_steal) + if (!claim_block) break; page = get_page_from_free_area(area, fallback_mt); - page = try_to_steal_block(zone, page, current_order, order, + page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); if (page) @@ -2323,11 +2323,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, if (alloc_flags & 
ALLOC_NOFRAGMENT) return NULL; - /* No luck stealing blocks. Find the smallest fallback page */ + /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, false, &claim_block); if (fallback_mt == -1) continue; From 4f5843a6bb9df781c0b59f3789af2d135fda14ed Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 28 Feb 2025 09:52:18 +0000 Subject: [PATCH 6/8] UPSTREAM: mm/page_alloc: clarify should_claim_block() commentary There's lots of text here but it's a little hard to follow, this is an attempt to break it up and align its structure more closely with the code. Reword the top-level function comment to just explain what question the function answers from the point of view of the caller. Break up the internal logic into different sections that can have their own commentary describing why that part of the rationale is present. Note the page_group_by_mobility_disabled logic is not explained in the commentary, that is outside the scope of this patch... 
Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-2-cb2ef1a4e610@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Yosry Ahmed Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from commit a14efee04796dd3f614eaf5348ca1ac099c21349) Change-Id: I6c7f908a4e9f025726dadab210c2d59004fe1946 Signed-off-by: yipeng xiang --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e2714c4f515..997261050119 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2114,16 +2114,9 @@ static inline bool boost_watermark(struct zone *zone) } /* - * When we are falling back to another migratetype during allocation, try to - * claim entire blocks to satisfy further allocations, instead of polluting - * multiple pageblocks. - * - * If we are stealing a relatively large buddy page, it is likely there will be - * more free pages in the pageblock, so try to claim the whole block. For - * reclaimable and unmovable allocations, we try to claim the whole block - * regardless of page size, as fragmentation caused by those allocations - * polluting movable pageblocks is worse than movable allocations stealing from - * unmovable and reclaimable pageblocks. + * When we are falling back to another migratetype during allocation, should we + * try to claim an entire block to satisfy further allocations, instead of + * polluting multiple pageblocks? */ static bool should_try_claim_block(unsigned int order, int start_mt) { @@ -2138,19 +2131,32 @@ static bool should_try_claim_block(unsigned int order, int start_mt) return true; /* - * Movable pages won't cause permanent fragmentation, so when you alloc - * small pages, you just need to temporarily steal unmovable or - * reclaimable pages that are closest to the request size. 
After a - * while, memory compaction may occur to form large contiguous pages, - * and the next movable allocation may not need to steal. Unmovable and - * reclaimable allocations need to actually claim the whole block. + * Above a certain threshold, always try to claim, as it's likely there + * will be more free pages in the pageblock. */ - if (order >= pageblock_order / 2 || - start_mt == MIGRATE_RECLAIMABLE || - start_mt == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) + if (order >= pageblock_order / 2) return true; + /* + * Unmovable/reclaimable allocations would cause permanent + * fragmentation if they fell back to allocating from a movable block + * (polluting it), so we try to claim the whole block regardless of the + * allocation size. Later movable allocations can always steal from this + * block, which is less problematic. + */ + if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) + return true; + + if (page_group_by_mobility_disabled) + return true; + + /* + * Movable pages won't cause permanent fragmentation, so when we alloc + * small pages, we just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. + */ return false; } From 6cf093756a11483f6b1fb83198e67ee83edbdbea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2025 14:01:53 -0400 Subject: [PATCH 7/8] BACKPORT: mm: page_alloc: speed up fallbacks in rmqueue_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test robot identified c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy") as the root cause of a 56.4% regression in vm-scalability::lru-file-mmap-read.
Carlos reports an earlier patch, c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion"), as the root cause for a regression in worst-case zone->lock+irqoff hold times. Both of these patches modify the page allocator's fallback path to be less greedy in an effort to stave off fragmentation. The flip side of this is that fallbacks are also less productive each time around, which means the fallback search can run much more frequently. Carlos' traces point to rmqueue_bulk() specifically, which tries to refill the percpu cache by allocating a large batch of pages in a loop. They highlight how once the native freelists are exhausted, the fallback code first scans orders top-down for whole blocks to claim, then falls back to a bottom-up search for the smallest buddy to steal. For the next batch page, it goes through the same thing again. This can be made more efficient. Since rmqueue_bulk() holds the zone->lock over the entire batch, the freelists are not subject to outside changes; when the search for a block to claim has already failed, there is no point in trying again for the next page. Modify __rmqueue() to remember the last successful fallback mode, and restart directly from there on the next rmqueue_bulk() iteration.
Oliver confirms that this improves beyond the regression that the test robot reported against c2f6ea38fc1b: commit: f3b92176f4 ("tools/selftests: add guard region test for /proc/$pid/pagemap") c2f6ea38fc ("mm: page_alloc: don't steal single pages from biggest buddy") acc4d5ff0b ("Merge tag 'net-6.15-rc0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net") 2c847f27c3 ("mm: page_alloc: speed up fallbacks in rmqueue_bulk()") <--- your patch f3b92176f4f7100f c2f6ea38fc1b640aa7a2e155cc1 acc4d5ff0b61eb1715c498b6536 2c847f27c37da65a93d23c237c5 ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 25525364 ± 3% -56.4% 11135467 -57.8% 10779336 +31.6% 33581409 vm-scalability.throughput Carlos confirms that worst-case times are almost fully recovered compared to before the earlier culprit patch: 2dd482ba627d (before freelist hygiene): 1ms c0cd6f557b90 (after freelist hygiene): 90ms next-20250319 (steal smallest buddy): 280ms this patch : 8ms [jackmanb@google.com: comment updates] Link: https://lkml.kernel.org/r/D92AC0P9594X.3BML64MUKTF8Z@google.com [hannes@cmpxchg.org: reset rmqueue_mode in rmqueue_buddy() error loop, per Yunsheng Lin] Link: https://lkml.kernel.org/r/20250409140023.GA2313@cmpxchg.org Link: https://lkml.kernel.org/r/20250407180154.63348-1-hannes@cmpxchg.org Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") Fixes: c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy") Signed-off-by: Johannes Weiner Signed-off-by: Brendan Jackman Reported-by: kernel test robot Reported-by: Carlos Song Tested-by: Carlos Song Tested-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202503271547.fc08b188-lkp@intel.com Reviewed-by: Brendan Jackman Tested-by: Shivank Garg Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: [6.10+] Signed-off-by: Andrew Morton Bug: 420836317 (cherry picked from 
commit 90abee6d7895d5eef18c91d870d8168be4e76e9d) [Resolve conflicts caused by cma_redirect_restricted ] Change-Id: I4bf9e270886716b0a3f11f9edce9a73e855b1fe9 Signed-off-by: yipeng xiang --- mm/page_alloc.c | 116 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 35 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 997261050119..fc5fe8f53cf9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2268,23 +2268,15 @@ try_to_claim_block(struct zone *zone, struct page *page, /* - * Try finding a free buddy page on the fallback list. - * - * This will attempt to claim a whole pageblock for the requested type - * to ensure grouping of such requests in the future. - * - * If a whole block cannot be claimed, steal an individual page, regressing to - * __rmqueue_smallest() logic to at least break up as little contiguity as - * possible. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. - * - * Return the stolen page, or NULL if none can be found. */ static __always_inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2322,14 +2314,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); - if (page) - goto got_one; + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } - if (alloc_flags & ALLOC_NOFRAGMENT) - return NULL; + return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. 
Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; + bool claim_block; - /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2339,25 +2346,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = get_page_from_free_area(area, fallback_mt); page_del_and_expand(zone, page, order, current_order, fallback_mt); - goto got_one; + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; } return NULL; - -got_one: - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); - - return page; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page = NULL; @@ -2380,16 +2390,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } } - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (!cma_redirect_restricted() && alloc_flags & ALLOC_CMA) + /* + * First try the freelists of the requested migratetype, then try + * fallback modes with increasing levels of fragmentation risk.
+ * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. + */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (!cma_redirect_restricted() && alloc_flags & ALLOC_CMA) { page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype, - alloc_flags); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. */ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2401,6 +2443,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; @@ -2416,7 +2459,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (cma_redirect_restricted() && is_migrate_cma(migratetype)) page = __rmqueue_cma_fallback(zone, order); else - page = __rmqueue(zone, order, migratetype, alloc_flags); + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -3040,9 +3083,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, alloc_flags & ALLOC_CMA) page = __rmqueue_cma_fallback(zone, order); - if (!page) + if (!page) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + page = __rmqueue(zone, order, 
migratetype, - alloc_flags); + alloc_flags, &rmqm); + } /* * If the allocation fails, allow OOM handling and * order-0 (atomic) allocs access to HIGHATOMIC From 44add0722b59301c6db931a612d79fd0668771a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2025 14:01:54 -0400 Subject: [PATCH 8/8] FROMGIT: mm: page_alloc: tighten up find_suitable_fallback() find_suitable_fallback() is not as efficient as it could be, and somewhat difficult to follow. 1. should_try_claim_block() is a loop invariant. There is no point in checking fallback areas if the caller is interested in claimable blocks but the order and the migratetype don't allow for that. 2. __rmqueue_steal() doesn't care about claimability, so it shouldn't have to run those tests. Different callers want different things from this helper: 1. __compact_finished() scans orders up until it finds a claimable block 2. __rmqueue_claim() scans orders down as long as blocks are claimable 3. __rmqueue_steal() doesn't care about claimability at all Move should_try_claim_block() out of the loop. Only test it for the two callers who care in the first place. Distinguish "no blocks" from "order + mt are not claimable" in the return value; __rmqueue_claim() can stop once order becomes unclaimable, __compact_finished() can keep advancing until order becomes claimable. 
Before: Performance counter stats for './run case-lru-file-mmap-read' (5 runs): 85,294.85 msec task-clock # 5.644 CPUs utilized ( +- 0.32% ) 15,968 context-switches # 187.209 /sec ( +- 3.81% ) 153 cpu-migrations # 1.794 /sec ( +- 3.29% ) 801,808 page-faults # 9.400 K/sec ( +- 0.10% ) 733,358,331,786 instructions # 1.87 insn per cycle ( +- 0.20% ) (64.94%) 392,622,904,199 cycles # 4.603 GHz ( +- 0.31% ) (64.84%) 148,563,488,531 branches # 1.742 G/sec ( +- 0.18% ) (63.86%) 152,143,228 branch-misses # 0.10% of all branches ( +- 1.19% ) (62.82%) 15.1128 +- 0.0637 seconds time elapsed ( +- 0.42% ) After: Performance counter stats for './run case-lru-file-mmap-read' (5 runs): 84,380.21 msec task-clock # 5.664 CPUs utilized ( +- 0.21% ) 16,656 context-switches # 197.392 /sec ( +- 3.27% ) 151 cpu-migrations # 1.790 /sec ( +- 3.28% ) 801,703 page-faults # 9.501 K/sec ( +- 0.09% ) 731,914,183,060 instructions # 1.88 insn per cycle ( +- 0.38% ) (64.90%) 388,673,535,116 cycles # 4.606 GHz ( +- 0.24% ) (65.06%) 148,251,482,143 branches # 1.757 G/sec ( +- 0.37% ) (63.92%) 149,766,550 branch-misses # 0.10% of all branches ( +- 1.22% ) (62.88%) 14.8968 +- 0.0486 seconds time elapsed ( +- 0.33% ) Link: https://lkml.kernel.org/r/20250407180154.63348-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Brendan Jackman Tested-by: Shivank Garg Reviewed-by: Vlastimil Babka Cc: Carlos Song Cc: Mel Gorman Signed-off-by: Andrew Morton Bug: 420836317 Change-Id: I2886de9da0fd99047cf5c675cd2ae7c386267770 (cherry picked from commit ee414bd97b3fa0a4f74e40004e3b4191326bd46c https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/?h=mm-everything) Signed-off-by: yipeng xiang --- mm/compaction.c | 4 +--- mm/internal.h | 2 +- mm/page_alloc.c | 31 +++++++++++++------------------ 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 41f511fbb685..7d4d185de16d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2359,7 +2359,6 @@ 
static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2375,8 +2374,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, - true, &claim_block) != -1) + if (find_suitable_fallback(area, order, migratetype, true) >= 0) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure diff --git a/mm/internal.h b/mm/internal.h index 9ef38b415c5b..07103ace9c53 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -834,7 +834,7 @@ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claim_only, bool *claim_block); + int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc5fe8f53cf9..2aa4d3e6d180 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2162,31 +2162,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * Sets *claim_block to instruct the caller whether it should convert a whole - * pageblock to the returned migratetype. - * If only_claim is true, this function returns fallback_mt only if + * If claimable is true, this function returns fallback_mt only if * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
*/ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_claim, bool *claim_block) + int migratetype, bool claimable) { int i; - int fallback_mt; + + if (claimable && !should_try_claim_block(order, migratetype)) + return -2; if (area->nr_free == 0) return -1; - *claim_block = false; for (i = 0; i < MIGRATE_FALLBACKS - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; + int fallback_mt = fallbacks[migratetype][i]; - if (should_try_claim_block(order, migratetype)) - *claim_block = true; - - if (*claim_block || !only_claim) + if (!free_area_empty(area, fallback_mt)) return fallback_mt; } @@ -2284,7 +2278,6 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2303,11 +2296,14 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, true); + + /* No block in that order */ if (fallback_mt == -1) continue; - if (!claim_block) + /* Advanced into orders too low to claim, abort */ + if (fallback_mt == -2) break; page = get_page_from_free_area(area, fallback_mt); @@ -2335,12 +2331,11 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype) int current_order; struct page *page; int fallback_mt; - bool claim_block; for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, false); if (fallback_mt == -1) continue;