From 7e3e8d6fbfcfb0c7cd4691c933b3e45073cd0cd2 Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 6 Aug 2024 10:51:35 +0000 Subject: [PATCH] iommu/io-pgtable-arm: Optimise non-coherent unmap BugLink: https://bugs.launchpad.net/bugs/2080908 The current __arm_lpae_unmap() function calls dma_sync() on individual PTEs after clearing them. Overall unmap performance can be improved by around 25% for large buffer sizes by combining the syncs for adjacent leaf entries. Optimize the unmap time by clearing all the leaf entries and issuing a single dma_sync() for them. Below is a detailed analysis of average unmap latency (in us) with and without this optimization obtained by running dma_map_benchmark for different buffer sizes. UnMap Latency(us) Size Without With % gain with optimization optimization optimization 4KB 3 3 0 8KB 4 3.8 5 16KB 6.1 5.4 11.48 32KB 10.2 8.5 16.67 64KB 18.5 14.9 19.46 128KB 35 27.5 21.43 256KB 67.5 52.2 22.67 512KB 127.9 97.2 24.00 1MB 248.6 187.4 24.62 2MB 65.5 65.5 0 4MB 119.2 119 0.17 Reviewed-by: Robin Murphy Signed-off-by: Ashish Mhetre Acked-by: Will Deacon Link: https://lore.kernel.org/r/20240806105135.218089-1-amhetre@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit 84b2baf427968c1b2e3ae3b7afcb0118cdee0915) Signed-off-by: Laxman Dewangan Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Noah Wager --- drivers/iommu/io-pgtable-arm.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 5ef6cae0c2b5..f48fe308c8c3 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -271,13 +271,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int 
num_entries) { + for (int i = 0; i < num_entries; i++) + ptep[i] = 0; - *ptep = 0; - - if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, 1, cfg); + if (!cfg->coherent_walk && num_entries) + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -648,26 +648,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, max_entries = arm_lpae_max_entries(unmap_idx_start, data); num_entries = min_t(int, pgcount, max_entries); - while (i < num_entries) { - pte = READ_ONCE(*ptep); + /* Find and handle non-leaf entries */ + for (i = 0; i < num_entries; i++) { + pte = READ_ONCE(ptep[i]); if (WARN_ON(!pte)) break; - __arm_lpae_clear_pte(ptep, &iop->cfg); - if (!iopte_leaf(pte, lvl, iop->fmt)) { + __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1); + /* Also flush any partial walks */ io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (!iommu_iotlb_gather_queued(gather)) { - io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } - - ptep++; - i++; } + /* Clear the remaining entries */ + __arm_lpae_clear_pte(ptep, &iop->cfg, i); + + if (gather && !iommu_iotlb_gather_queued(gather)) + for (int j = 0; j < i; j++) + io_pgtable_tlb_add_page(iop, gather, iova + j * size, size); + return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { /*