iommu/io-pgtable-arm: Optimise non-coherent unmap

BugLink: https://bugs.launchpad.net/bugs/2080908

The current __arm_lpae_unmap() function calls dma_sync() on individual
PTEs after clearing them. Overall unmap performance can be improved by
around 25% for large buffer sizes by combining the syncs for adjacent
leaf entries.
Optimize the unmap time by clearing all the leaf entries and issuing a
single dma_sync() for them.
Below is detailed analysis of average unmap latency(in us) with and
without this optimization obtained by running dma_map_benchmark for
different buffer sizes.

		UnMap Latency(us)
Size	Without		With		% gain with
	optimiztion	optimization	optimization

4KB	3		3		0
8KB	4		3.8		5
16KB	6.1		5.4		11.48
32KB	10.2		8.5		16.67
64KB	18.5		14.9		19.46
128KB	35		27.5		21.43
256KB	67.5		52.2		22.67
512KB	127.9		97.2		24.00
1MB	248.6		187.4		24.62
2MB	65.5		65.5		0
4MB	119.2		119		0.17

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Ashish Mhetre <amhetre@nvidia.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20240806105135.218089-1-amhetre@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
(cherry picked from commit 84b2baf427968c1b2e3ae3b7afcb0118cdee0915)
Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Acked-by: Noah Wager <noah.wager@canonical.com>
Acked-by: Jacob Martin <jacob.martin@canonical.com>
Signed-off-by: Noah Wager <noah.wager@canonical.com>
This commit is contained in:
Ashish Mhetre
2024-08-06 10:51:35 +00:00
committed by Noah Wager
parent 63f4b30a33
commit 7e3e8d6fbf
+17 -14
View File
@@ -271,13 +271,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
}
static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
{
for (int i = 0; i < num_entries; i++)
ptep[i] = 0;
*ptep = 0;
if (!cfg->coherent_walk)
__arm_lpae_sync_pte(ptep, 1, cfg);
if (!cfg->coherent_walk && num_entries)
__arm_lpae_sync_pte(ptep, num_entries, cfg);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -648,26 +648,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
max_entries = arm_lpae_max_entries(unmap_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
while (i < num_entries) {
pte = READ_ONCE(*ptep);
/* Find and handle non-leaf entries */
for (i = 0; i < num_entries; i++) {
pte = READ_ONCE(ptep[i]);
if (WARN_ON(!pte))
break;
__arm_lpae_clear_pte(ptep, &iop->cfg);
if (!iopte_leaf(pte, lvl, iop->fmt)) {
__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
/* Also flush any partial walks */
io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
ARM_LPAE_GRANULE(data));
__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
} else if (!iommu_iotlb_gather_queued(gather)) {
io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
}
ptep++;
i++;
}
/* Clear the remaining entries */
__arm_lpae_clear_pte(ptep, &iop->cfg, i);
if (gather && !iommu_iotlb_gather_queued(gather))
for (int j = 0; j < i; j++)
io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
return i * size;
} else if (iopte_leaf(pte, lvl, iop->fmt)) {
/*