Merge branch 'android16-6.12' into branch 'android16-6.12-lts'

Catch the -lts branch up with all of the updates and KMI breaks that
happened in the android16-6.12 branch.  Changes included in here are:

* 147b27e8d6 ANDROID: nvmet: Use the bdev_is_zone_start() function
* 392bbaffdc ANDROID: scsi: ufs: Support I/O tracing for zoned block devices
* e395d18c8b ANDROID: scsi: scsi_debug: Support npo2 zone sizes
* 497ca126dd ANDROID: scsi/sd_zbc: Support npo2 zone sizes
* 355dfccf9d ANDROID: dm-table: allow zoned devices with non power-of-2 zone sizes
* d6f0f66569 ANDROID: block: Do not set the I/O priority for zoned writes
* 30ce6652ee ANDROID: block: Support npo2 zone sizes
* 4a77dbe5c5 UPSTREAM: loop: fix queue freeze vs limits lock order
* d2eefa734f UPSTREAM: loop: refactor queue limits updates
* b0477a0759 UPSTREAM: loop: Fix ABBA locking race
* 49d8530dfe UPSTREAM: loop: Simplify discard granularity calc
* f1aac3cfaf UPSTREAM: loop: Use bdev limit helpers for configuring discard
* 02cf51391e UPSTREAM: usb-storage: fix queue freeze vs limits lock order
* 96dfef3be8 UPSTREAM: nbd: fix queue freeze vs limits lock order
* 1bf8be0b4e UPSTREAM: nvme: fix queue freeze vs limits lock order
* 32ab5e2dd9 UPSTREAM: block: fix queue freeze vs limits lock order in sysfs store methods
* e4eb47a3ec BACKPORT: block: add a store_limit operations for sysfs entries
* 574e0848d2 UPSTREAM: block: add a queue_limits_commit_update_frozen helper
* 65ad590076 FROMGIT: genirq: Retain depth for managed IRQs across CPU hotplug
* 1bc40b53aa FROMGIT: ufs: core: support updating device command timeout
* 5e97c36004 ANDROID: Build Rust Binder as a GKI module
* daae469749 ANDROID: rust_binder: handle read/write_consumed > read/write_size
* b23e338263 ANDROID: rust_binder: add Rust Binder to Makefile
* 7163533526 ANDROID: rust_binder: fixups for 6.12.19 LTS
* bf40001347 ANDROID: rust_binder: add back tracepoints
* dac7c66bc9 ANDROID: rust_binder: move Rust Binder in preparation for GKI module
* 8313296331 FROMGIT: rust: alloc: add Vec::insert_within_capacity
* c28afde01d FROMGIT: rust: alloc: add Vec::remove
* e1da60354a FROMGIT: rust: alloc: add Vec::retain
* 1e01dcf3be FROMGIT: rust: alloc: add Vec::drain_all
* 1a17ca097d FROMGIT: rust: alloc: add Vec::push_within_capacity
* 75c0948156 FROMGIT: rust: alloc: add Vec::pop
* ed2019e2c4 FROMGIT: rust: alloc: add Vec::clear
* 04d685ecf9 FROMGIT: rust: alloc: replace `Vec::set_len` with `inc_len`
* 597ebe7c32 FROMGIT: rust: alloc: refactor `Vec::truncate` using `dec_len`
* 8a1546ee71 FROMGIT: rust: alloc: add `Vec::dec_len`
* 48080570b0 FROMGIT: rust: alloc: add Vec::len() <= Vec::capacity invariant
* 7907fdcba6 FROMGIT: rust: alloc: allow coercion from `Box<T>` to `Box<dyn U>` if T implements U
* 9de29f7183 FROMGIT: rust: alloc: use `spare_capacity_mut` to reduce unsafe
* c40401d665 FROMGIT: rust: alloc: add Vec::resize method
* 9d37907c65 FROMGIT: rust: alloc: add Vec::truncate method
* f037ab7a73 FROMGIT: rust: alloc: add missing invariant in Vec::set_len()
* 025e0fc417 UPSTREAM: rust: kunit: allow to know if we are in a test
* 86603276f4 UPSTREAM: rust: macros: add macro to easily run KUnit tests
* f8c704efd6 BACKPORT: rust: kunit: add KUnit case and suite macros
* 1b461575a8 UPSTREAM: rust: add kunitconfig
* 615a5b6d7e UPSTREAM: rust: uaccess: generalize userSliceReader to support any Vec
* 0690b3438b ANDROID: 2025/05/15 KMI update
* daf75d7717 FROMLIST: mmc: sdhci-msm: Enable force hw reset during cqe recovery
* 794391e0e8 FROMLIST: mmc: core: Introduce new flag to force hardware reset
* 84e14946eb ANDROID: GKI: add ANDROID_OEM_DATA in struct bio
* e7b9281897 ANDROID: rust: allow zero init for KABI members
* 9027c8ec43 ANDROID: 16K: Add VMA padding size to smaps output
* 307be4b887 ANDROID: 16K: Don't copy data vma for maps/smaps output
* d378f3ab39 ANDROID: 16K: Fixup padding vm_flags bits on VMA splits
* c0d7f9802a ANDROID: 16K: Introduce pgsize_migration_inline.h
* 6fd1ed47f5 ANDROID: 16K: Fix vm_flags conflicts from mseal
* 2a651ea884 ANDROID: 16K: Don't set padding vm_flags on 32-bit archs
* 81734e02c6 ANDROID: 16K: Avoid mmap lock assertions for padding VMAs
* 4199eaf23e ANDROID: 16K: Only check basename of linker context
* 6050c4b129 ANDROID: 16K: Avoid and document padding madvise lock warning
* 6e64e9ce1f ANDROID: 16K: Fix show maps CFI failure
* 95d0b11a65 ANDROID: 16K: Handle pad VMA splits and merges
* 29dc8b580b ANDROID: 16K: madvise_vma_pad_pages: Remove filemap_fault check
* bcbb9d3c85 ANDROID: 16K: Only madvise padding from dynamic linker context
* 2feb999649 ANDROID: 16K: Separate padding from ELF LOAD segment mappings
* 092ff7e5b4 ANDROID: 16K: Exclude ELF padding for fault around range
* a3b4e8f698 ANDROID: 16K: Use MADV_DONTNEED to save VMA padding pages.
* 0d793cde88 ANDROID: 16K: Introduce ELF padding representation for VMAs
* 918c98f267 ANDROID: 16K: Introduce /sys/kernel/mm/pgsize_migration/enabled
* e9420a4582 ANDROID: 16K: rust: ashmem: __page_align VMA size check
* d44ff7a3ed ANDROID: 16K: Introduce rust __page_*() helpers
* e39fcef01c ANDROID: 16K: Duplicate command line for parsing page_shift
* 44a6882cc7 ANDROID: 16K: Init page_shift param in a pure_initcall()
* 68ba0f4dfb ANDROID: 16K: __PAGE_ALIGN() virtio gpu dumb buffers
* 1022438243 ANDROID: 16K: Avoid conflicting __PAGE_SIZE in bpf/core
* 778a447513 ANDROID: 16K: Emulate cachestat counters
* 0d44e1eb0e ANDROID: 16K: Disable kernel APIs indexed by PFNs
* d684b3125e ANDROID: 16K: Emulate pread() for pagemap
* 669f0c4355 ANDROID: 16K: Emulate /proc/pid/pagemap
* 3c9a39c770 ANDROID: 16K: Fix mincore emulation
* 8aab407984 ANDROID: 16K: Emulate mincore() syscall
* 596774b15c ANDROID: 16K: x86_64: Disable userfaultfd
* c94c31e526 ANDROID: 16K: Update sysctl_perf_event_mlock if PERF_EVENTS enabled
* 13ba0aec9c ANDROID: 16K: Fixup perf_mmap check for metadata page
* 03ce5534fc ANDROID: 16K: Fix swapfile header
* 53ab86eb55 ANDROID: 16K: Fix SIGBUS semantics and document __filemap_fixup()
* 50a96587af ANDROID: 16K: [s]maps: Fold fixup entries into the parent entry
* 57bbcef534 ANDROID: 16K: Ensure mseal start and len are 16kB multiples
* 5c1d7ef671 ANDROID: 16K: Handle pgoff > file_size for shmem and file backed VMAs
* cd48f9a1f7 ANDROID: 16K: Ensure stack expansion size is __PAGE_SIZE multiple
* a8df614576 ANDROID: 16K: Only support page size emulation for x86_64
* ac98b230db ANDROID: 16K: Use bit 59 for __VM_NO_COMPAT
* eb54f19663 ANDROID: 16K: Fix __MAP_NO_COMPAT overflow
* 36157a52cd ANDROID: 16K: __PAGE_ALIGN dma-bufs size from heap allocations
* 65df6a39b7 ANDROID: 16K: Align vsyscall mapping size to a 16kB multiple
* 4395898bf5 ANDROID: 16K: Align vdso mapping size to a 16kB multiple
* 37ebd01b5a ANDROID: 16K: Make the x86 vdso layout 16kB compatible
* c64a15a595 ANDROID: 16K: Introduce __MAX_PAGE_SIZE macros
* 60b3135822 ANDROID: 16K: Remove androidboot from page_shift kernel param
* 5e32ba9023 ANDROID: 16K: Remove unescessary err log in randomize_page()
* 1ae0864980 ANDROID: 16K Prevent non-__PAGE_ALIGNED() VMA splits by anon names
* 68e0528b38 ANDROID: 16K: Remove anon name for fixup VMA
* f7f25a5b1a ANDROID: 16K: Add page_compat[_enabled] to symbol list
* 93bfe702cd ANDROID: 16K: Export page compat symbols
* 181bc19bef ANDROID: 16K: x86_64: Allow stack randomization of twice page-size
* f51703f4c1 ANDROID: 16K: x86_64: __PAGE_ALIGN mmap randomization
* 4daa4c1fec ANDROID: 16K: brk: __PAGE_ALIGN brk
* 7852452429 ANDROID: 16K: mlock: __PAGE_ALIGN addr and len
* 4956d7c6c4 ANDROID: 16K: msync: __PAGE_ALIGN addr and len
* 5d8eb7f9e0 ANDROID: 16K: madvise: __PAGE_ALIGN addr and len
* a52b76b874 ANDROID: 16K: mremap: __PAGE_ALIGN addr and len
* 2d3fed3a43 ANDROID: 16K: mprotect: __PAGE_ALIGN addr and len
* 397425965f ANDROID: 16K: munmap: __PAGE_ALIGN addr and len
* a9e38ff89a ANDROID: 16K: __PAGE_ALIGN stack_[top|base]
* ba166bce2c ANDROID: 16K: __PAGE_ALIGN randomize_stack_top() address
* 9ba9a0891b ANDROID: 16K: __PAGE_ALIGN randomize_page() address
* 81e0928547 ANDROID: 16K: __PAGE_ALIGN mmap hint address
* a1e630ea0d ANDROID: 16K: ashmem: Fix size check
* df9123472f ANDROID: 16K: Fix selinux mmap size check
* 7dea17008f ANDROID: 16K: procfs: maps: Don't show fixup VMAs
* e076e9ff2c ANDROID: 16K: Handle filemap faults
* a9ccc1128e ANDROID: 16K: Introduce __VM_NO_COMPAT vma flag
* e7f83d4d4b ANDROID: 16K: Ensure unmapped_area returns a __PAGE_ALIGNED address
* 796be8fd27 ANDROID: 16K: Reduce mmap rand bits
* 80e2a42d97 ANDROID: 16K: x86_64: Set ELF_EXEC_PAGESIZE to __PAGE_SIZE
* 58e2fa4ec4 ANDROID: 16K: Remove build time dependencies on ELF_EXEC_PAGESIZE
* d09cd43b3f ANDROID: 16K: Log unaligned operations
* 1fb2de0c3d ANDROID: 16K: Add page-compat helper macros
* a052d19e1c ANDROID: GKI: Pad vendor properties to power_supply_property enum
* 61de19b772 ANDROID: drivers/iommu: Pad iommu structs
* 6cb1db877d ANDROID: KVM: arm64: Pad more pKVM structs
* b3c31c9b21 ANDROID: KVM: arm64: Drop struct pkvm_mapping from KMI
* ef10b442e4 ANDROID: KVM: arm64: Remove struct kvm_cpu_context from the KMI
* 15bf9aa274 ANDROID: GKI: Add ABI padding for kcompressd feature
* e80ed6bcfb ANDROID: GKI: Add memory reclaim ABI padding
* 9e96103d83 ANDROID: GKI: Add dmabuf ABI padding
* 4bd97e7a02 ANDROID: GKI: Add cgroup ABI padding
* b209d55c0e ANDROID: GKI: Add cpuset ABI padding
* 060da33ae4 ANDROID: GKI: Add memcg ABI padding
* d48d0d0892 FROMLIST: scsi: core: Implement reserved command handling
* 26febb7cde UPSTREAM: block: track queue dying state automatically for modeling queue freeze lockdep
* df5f9ab297 UPSTREAM: block: don't verify queue freeze manually in elevator_init_mq()
* 752dff69ae UPSTREAM: block: track disk DEAD state automatically for modeling queue freeze lockdep
* 225f2e16ad UPSTREAM: block: don't reorder requests in blk_mq_add_to_batch
* bdcd6a28fd UPSTREAM: block: don't reorder requests in blk_add_rq_to_plug
* 24f685a927 UPSTREAM: block: add a rq_list type
* bbce2aa253 UPSTREAM: block: remove rq_list_move
* 128144da22 ANDROID: KVM: arm64: Add smc64 trap handling for protected guests
* 2c1385ae0e ANDROID: Modify android_rvh_find_lowest_rq hook
* bad3ca6c52 ANDROID: GKI: add vendor padding variable in struct nf_conn
* ef3d16e0e0 ANDROID: vendor_hooks: add a field in pglist_data
* 0dd21f133b ANDROID: scsi: ufs: add UFSHCD_ANDROID_QUIRK_SET_IID_TO_ONE
* 75adb09e2f ANDROID: GKI: the "reusachtig" padding sync with android16-6.12
* 20159aa0ac UPSTREAM: PCI: Check BAR index for validity
* 46f484fa4d UPSTREAM: perf: Fix hang while freeing sigtrap event
* f295287ed4 UPSTREAM: perf/core: Simplify the perf_event_alloc() error path
* 748bd1ca17 UPSTREAM: perf/core: Add aux_pause, aux_resume, aux_start_paused
* 887fb3f16c ANDROID: KVM: arm64: Add __pkvm_host_donate_sglist_hyp
* a7667808d9 UPSTREAM: tools/selftests: add guard region test for /proc/$pid/pagemap
* dd6e353d71 UPSTREAM: fs/proc/task_mmu: add guard region bit to pagemap
* df3e8432fa UPSTREAM: tools/selftests: add file/shmem-backed mapping guard region tests
* bc91eb889e UPSTREAM: tools/selftests: expand all guard region tests to file-backed
* 458e4dbd0b UPSTREAM: selftests/mm: rename guard-pages to guard-regions
* 8261d30079 UPSTREAM: mm: allow guard regions in file-backed and read-only mappings
* ca6b245e10 UPSTREAM: selftests/mm: use PIDFD_SELF in guard pages test
* 99b3bb2022 BACKPORT: selftests/pidfd: add tests for PIDFD_SELF_*
* 7a879200c9 UPSTREAM: selftests/pidfd: add new PIDFD_SELF* defines
* 1734a4ad6b BACKPORT: pidfd: add PIDFD_SELF* sentinels to refer to own thread/process
* b00dca6fb7 UPSTREAM: selftests/mm: add fork CoW guard page test
* 5367c0eacc BACKPORT: selftests/mm: add self tests for guard page feature
* 86f861b42e UPSTREAM: tools: testing: update tools UAPI header for mman-common.h
* b9ee6db5a8 BACKPORT: mm: madvise: implement lightweight guard page mechanism
* c14f85307d UPSTREAM: mm: add PTE_MARKER_GUARD PTE marker
* c5be90ae70 UPSTREAM: mm: pagewalk: add the ability to install PTEs
* 3306eb50a4 FROMGIT: docs: core-api: document the IOVA-based API
* 26405baef4 FROMGIT: dma-mapping: add a dma_need_unmap helper
* 66bc206d64 FROMGIT: dma-mapping: Implement link/unlink ranges API
* 59a15e3bf1 FROMGIT: iommu/dma: Factor out a iommu_dma_map_swiotlb helper
* 0f2253b2b1 FROMGIT: dma-mapping: Provide an interface to allow allocate IOVA
* c64f83e1d6 FROMGIT: iommu: add kernel-doc for iommu_unmap_fast
* 5c59ff3809 FROMGIT: iommu: generalize the batched sync after map interface
* 15ad0760b8 FROMGIT: dma-mapping: move the PCI P2PDMA mapping helpers to pci-p2pdma.h
* 661e6bda0e FROMGIT: PCI/P2PDMA: Refactor the p2pdma mapping helpers
* e44dfa62df Reapply "ANDROID: enable memory allocation profiling configs"
* 60372b88d2 ANDROID: binder: add OEM data to struct binder_alloc
* 31f62a008e ANDROID: Limit vfs-only namespace to GKI builds
* e2c81a7fa3 ANDROID: Fix incorrect namespacing for ANDROID_GKI_VFS_EXPORT_ONLY
* 7af261fc12 ANDROID: KVM: arm64: Use smccc 1.2 for direct FF-A calls
* 996a35040a FROMLIST: dm-zone: Use bdev_*() helper functions where applicable
* 1d1b2e8d63 FROMGIT: perf/aux: Allocate non-contiguous AUX pages by default
* 6e0b046d59 UPSTREAM: wifi: cfg80211: fix out-of-bounds access during multi-link element defragmentation
* 617a8cdb8d ANDROID: GKI: add OEM data to struct scan_control for XM OGKI
* acc91ef94b FROMGIT: dma-buf: insert memory barrier before updating num_fences
* 85856ec8b2 ANDROID: gunyah: Fix potential use-after-free in gunyah_rm_notifier_register
* e48193bfcf ANDROID: KVM: arm64: Reserve all args for req_mmio
* 42cfdfb46c ANDROID: GKI: Add reservation and use macros for non-LTS backports
* e1cdedc5db FROMGIT: mm/memcg: use kmem_cache when alloc memcg pernode info
* 65c043e1ca FROMGIT: mm/memcg: use kmem_cache when alloc memcg
* 434e2d5481 FROMGIT: mm/memcg: move mem_cgroup_init() ahead of cgroup_init()
* 4e16895056 ANDROID: GKI: Update oplus symbol list
* 28cbf47bba ANDROID: GKI: Export css_task_iter_start()
* 84849bc819 Revert "ANDROID: arm64: Forcefully disable SME at runtime"
* 0aaf2786fa FROMGIT: arm64/fpsimd: ptrace: Gracefully handle errors
* a51c741bb6 FROMGIT: arm64/fpsimd: ptrace: Mandate SVE payload for streaming-mode state
* 1d05f8264a FROMGIT: arm64/fpsimd: ptrace: Do not present register data for inactive mode
* 958a94681f FROMGIT: arm64/fpsimd: ptrace: Save task state before generating SVE header
* 3baa9071c3 FROMGIT: arm64/fpsimd: ptrace/prctl: Ensure VL changes leave task in a valid state
* ccf055346e FROMGIT: arm64/fpsimd: ptrace/prctl: Ensure VL changes do not resurrect stale data
* e18a498a2f FROMGIT: BACKPORT: arm64/fpsimd: Make clone() compatible with ZA lazy saving
* a6267d4bf5 FROMGIT: arm64/fpsimd: Clear PSTATE.SM during clone()
* 370e80e212 FROMGIT: arm64/fpsimd: Consistently preserve FPSIMD state during clone()
* f5db1f9a3b FROMGIT: arm64/fpsimd: Remove redundant task->mm check
* 57f5b387c4 FROMGIT: arm64/fpsimd: signal: Use SMSTOP behaviour in setup_return()
* f940d322b6 FROMGIT: arm64/fpsimd: Add task_smstop_sm()
* 73106ecef5 FROMGIT: arm64/fpsimd: Factor out {sve,sme}_state_size() helpers
* f0f4be3921 FROMGIT: arm64/fpsimd: Clarify sve_sync_*() functions
* 49bba8e1e8 FROMGIT: arm64/fpsimd: ptrace: Consistently handle partial writes to NT_ARM_(S)SVE
* b2853208b1 FROMGIT: arm64/fpsimd: signal: Consistently read FPSIMD context
* bed5006f4a FROMGIT: arm64/fpsimd: signal: Mandate SVE payload for streaming-mode state
* 63897a249f FROMGIT: arm64/fpsimd: signal: Clear PSTATE.SM when restoring FPSIMD frame only
* 37749ff2f7 FROMGIT: arm64/fpsimd: Do not discard modified SVE state
* f01e49470a FROMGIT: arm64/fpsimd: Avoid warning when sve_to_fpsimd() is unused
* 787c2bf09b FROMGIT: arm64/fpsimd: signal: Clear TPIDR2 when delivering signals
* dd9f8f02e9 FROMGIT: arm64/fpsimd: signal: Simplify preserve_tpidr2_context()
* 9592e13c60 FROMGIT: arm64/fpsimd: signal: Always save+flush state early
* 14383c6162 FROMGIT: arm64/fpsimd: signal32: Always save+flush state early
* 0c377582f6 FROMGIT: arm64/fpsimd: Add fpsimd_save_and_flush_current_state()
* acd59f18f3 FROMGIT: arm64/fpsimd: Fix merging of FPSIMD state during signal return
* f78acfcc31 FROMGIT: arm64/fpsimd: Reset FPMR upon exec()
* 32dbf4add0 FROMGIT: arm64/fpsimd: Avoid clobbering kernel FPSIMD state with SMSTOP
* 2d33087d98 FROMGIT: arm64/fpsimd: Don't corrupt FPMR when streaming mode changes
* c757f1bcc8 FROMGIT: arm64/fpsimd: Discard stale CPU state when handling SME traps
* 64c0feb892 FROMGIT: arm64/fpsimd: Remove opportunistic freeing of SME state
* f55fc6340b FROMGIT: arm64/fpsimd: Remove redundant SVE trap manipulation
* 2ccf10f4a6 FROMGIT: arm64/fpsimd: Remove unused fpsimd_force_sync_to_sve()
* 1e380d1c0e FROMGIT: arm64/fpsimd: Avoid RES0 bits in the SME trap handler
* 6cf85d6ca1 BACKPORT: KVM: arm64: Eagerly switch ZCR_EL{1,2}
* 6c0394f0ef BACKPORT: KVM: arm64: Mark some header functions as inline
* 66762de87f BACKPORT: KVM: arm64: Refactor exit handlers
* d09c293b5b BACKPORT: KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN
* 5f2af6c19e BACKPORT: KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN
* f012246148 BACKPORT: KVM: arm64: Remove host FPSIMD saving for non-protected KVM
* 3aa13c0fd1 BACKPORT: KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state
* 5f1b9561a1 ANDROID: KVM: arm64: Eagerly restore host ZCR_EL2 after vcpu run in pKVM
* 86622b5452 ANDROID: ABI: update symbol list for honor
* 5addce7b33 ANDROID: GKI:Add VendorHook for ProbeTimeout
* e8df77b867 ANDROID: GKI: Update symbol list for qcom
* 6c6bf93463 ANDROID: GKI: update symbol list for xiaomi
* e8da2c8c48 ANDROID: Export cgroup function to allow module to remove control files
* f2c750c9f8 ANDROID: Update symbols list for imx
* c206f26b28 ANDROID: Update symbols to oplus symbol list.
* 55ac0abda4 ANDROID: Export the necessary symbols for the implementation of the BPF scheduler.
* de6714dc48 ANDROID: Drop tests_zip_arm64 from TV target.
* 7f12a7bda3 ANDROID: GKI: Update RTK STB KMI symbol list
* ba364a2340 ANDROID: vendor_hooks: add vendor hook in cma_alloc()
* 21de8f00f4 ANDROID: vendor hooks: use DECLARE_RESTRICTED_HOOK for android_rvh
* d574cb3cc1 ANDROID: GKI: update symbol list for xiaomi
* 41763ef33d ANDROID: GKI: Update symbols list file for oplus
* b62718ba86 ANDROID: vendor_hooks: add hook in __alloc_workqueue()
* c7b71fcb6f FROMLIST: xfrm: Migrate offload configuration
* 564d5ceda6 ANDROID: KVM: arm64: Fix relinquish filtering
* d9d550aef0 Revert "ANDROID: Revert^2 "KVM: arm64: iommu: Allow to boot without IOMMU driver""
* 8d139a5479 ANDROID: GKI: Update symbols list file for honor
* f3b22c7868 ANDROID: fs/proc: Perform priority inheritance around access_remote_vm()
* 06a574beb9 ANDROID: fix incorrect #ifdef for CONFIG_ANDROID_VENDOR_OEM_DATA
* d52356998b FROMLIST: scsi: ufs: core: Increase the UIC command timeout further
* 17f5bd09ee ANDROID: sched/psi: disable the privilege check if CONFIG_DEFAULT_SECURITY_SELINUX is enabled
* ad2761e088 ANDROID: ABI: Update pixel symbol list
* 86f6711a2d ANDROID: scsi: ufs: add complete init vendor hook
* 273b99c30a ANDROID: scsi: ufs: add vendor hook to override key reprogramming
* 05c9b03f4c FROMGIT: dm-verity: use softirq context only when !need_resched()
* a8027abd1e ANDROID: KVM: arm64: Redirect modprobe to /dev/kmsg
* 078ef75fa4 ANDROID: gki_defconfig: Enable CONFIG_ARM_SDE_INTERFACE
* f982a6b573 ANDROID: arm64: SDEI: Export SDEI related symbols
* b145782bbd FROMGIT: firmware: SDEI: Allow sdei initialization without ACPI_APEI_GHES
* cbd7c4caa9 ANDROID: KVM: arm64: Do not pkvm_init_devices() when no registered devices
* 1fad370b9e ANDROID: KVM: arm64: iommu: Do not remap on iommu_atomic_pool reclaim
* 890428fb57 ANDROID: Update symbols list for imx
* 776eedb13c ANDROID: ABI: Update symbol list for mtk
* ac8b302ab0 ANDROID: mm: Add vendor hook before rmqueue_bulk
* 34fe71fe24 ANDROID: GKI: Update symbol list file for xiaomi
* 88cb3505eb ANDROID: mm: export __pte_offset_map/unuse_swap_pte/read_swap_cache_async
* 46aa903098 ANDROID: Disable check_defconfig for kernel_aarch64_tv.
* 88680fe19e ANDROID: fuse-bpf: Fix recursion in fuse_copy_file_range
* 5838b5ac0a ANDROID: turn off KMI strict mode for TV builds
* e680506fe0 ANDROID: KVM: iommu: Allow IOMMU mapping in carveouts
* 4089d8be3f ANDROID: GKI: Update symbol list file for xiaomi
* 20adcab29c UPSTREAM: codel: remove sch->q.qlen check before qdisc_tree_reduce_backlog()
* 4e4b0bdf85 ANDROID: GKI: Update QCOM symbol list
* b791ce76d1 ANDROID: GKI: Update the ABI symbol list for qcom
* 6690013277 FROMLIST: mm: add nr_free_highatomic in show_free_areas
* cedbc9e5ec ANDROID: GKI: Update qcom symbol list
* 2145149a38 ANDROID: implement wrapper for reverse migration
* dfc83778aa ANDROID: GKI: Update symbols list file for honor
* 1213a4027a ANDROID: ABI: Update pixel symbol list
* a546b31e53 BACKPORT: FROMGIT: coresight: core: Disable helpers for devices that fail to enable
* bdda915529 FROMGIT: coresight: catu: Introduce refcount and spinlock for enabling/disabling
* 2366a0bf75 UPSTREAM: firmware: arm_ffa: Upgrade FF-A version to v1.2 in the driver
* e5ea70aa2d ANDROID: gki_defconfig: do not use FineIBT on x86
* b73e9bfc92 FROMGIT: sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks

Change-Id: Ie76eebb5d135e428f1c0986639fca0d1ead2aa51
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
Greg Kroah-Hartman
2025-05-19 06:36:03 +00:00
418 changed files with 43420 additions and 27198 deletions

View File

@@ -290,20 +290,18 @@ common_kernel(
additional_kmi_symbol_lists = [":aarch64_additional_kmi_symbol_lists"],
arch = "arm64",
build_gki_artifacts = True,
check_defconfig = "disabled",
ddk_headers_archive = ":kernel_aarch64_ddk_headers_archive",
ddk_module_headers = [":all_headers_aarch64"],
defconfig = "arch/arm64/configs/gki_defconfig",
extra_dist = [
":test_mappings_zip",
":tests_zip_arm64",
],
extra_dist = [],
gki_boot_img_sizes = _GKI_AARCH64_BOOT_IMAGE_SIZES,
gki_system_dlkm_modules = ":gki_system_dlkm_modules_arm64",
kcflags = COMMON_KCFLAGS,
# We don't guarantee ABI stability for TV target
kmi_enforced = False,
kmi_symbol_list = "gki/aarch64/symbols/base",
kmi_symbol_list_strict_mode = True,
kmi_symbol_list_strict_mode = False,
make_goals = _GKI_AARCH64_MAKE_GOALS,
makefile = ":Makefile",
module_implicit_outs = get_gki_modules_list("arm64") + get_kunit_modules_list("arm64"),

View File

@@ -21,7 +21,8 @@ There are four components to pagemap:
* Bit 56 page exclusively mapped (since 4.2)
* Bit 57 pte is uffd-wp write-protected (since 5.13) (see
Documentation/admin-guide/mm/userfaultfd.rst)
* Bits 58-60 zero
* Bit 58 pte is a guard region (since 6.15) (see madvise (2) man page)
* Bits 59-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present

View File

@@ -69,8 +69,8 @@ model features for SME is included in Appendix A.
vectors from 0 to VL/8-1 stored in the same endianness invariant format as is
used for SVE vectors.
* On thread creation TPIDR2_EL0 is preserved unless CLONE_SETTLS is specified,
in which case it is set to 0.
* On thread creation PSTATE.ZA and TPIDR2_EL0 are preserved unless CLONE_VM
is specified, in which case PSTATE.ZA is set to 0 and TPIDR2_EL0 is set to 0.
2. Vector lengths
------------------
@@ -115,7 +115,7 @@ be zeroed.
5. Signal handling
-------------------
* Signal handlers are invoked with streaming mode and ZA disabled.
* Signal handlers are invoked with PSTATE.SM=0, PSTATE.ZA=0, and TPIDR2_EL0=0.
* A new signal frame record TPIDR2_MAGIC is added formatted as a struct
tpidr2_context to allow access to TPIDR2_EL0 from signal handlers.
@@ -241,7 +241,7 @@ prctl(PR_SME_SET_VL, unsigned long arg)
length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.
* Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared.
* Changing the vector length causes PSTATE.ZA to be cleared.
Calling PR_SME_SET_VL with vl equal to the thread's current vector
length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.

View File

@@ -530,6 +530,77 @@ routines, e.g.:::
....
}
Part Ie - IOVA-based DMA mappings
---------------------------------
These APIs allow a very efficient mapping when using an IOMMU. They are an
optional path that requires extra code and are only recommended for drivers
where DMA mapping performance, or the space usage for storing the DMA addresses
matter. All the considerations from the previous section apply here as well.
::
bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
phys_addr_t phys, size_t size);
Is used to try to allocate IOVA space for mapping operation. If it returns
false this API can't be used for the given device and the normal streaming
DMA mapping API should be used. The ``struct dma_iova_state`` is allocated
by the driver and must be kept around until unmap time.
::
static inline bool dma_use_iova(struct dma_iova_state *state)
Can be used by the driver to check if the IOVA-based API is used after a
call to dma_iova_try_alloc. This can be useful in the unmap path.
::
int dma_iova_link(struct device *dev, struct dma_iova_state *state,
phys_addr_t phys, size_t offset, size_t size,
enum dma_data_direction dir, unsigned long attrs);
Is used to link ranges to the IOVA previously allocated. The start of all
but the first call to dma_iova_link for a given state must be aligned
to the DMA merge boundary returned by ``dma_get_merge_boundary())``, and
the size of all but the last range must be aligned to the DMA merge boundary
as well.
::
int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
size_t offset, size_t size);
Must be called to sync the IOMMU page tables for IOVA-range mapped by one or
more calls to ``dma_iova_link()``.
For drivers that use a one-shot mapping, all ranges can be unmapped and the
IOVA freed by calling:
::
void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
size_t mapped_len, enum dma_data_direction dir,
unsigned long attrs);
Alternatively drivers can dynamically manage the IOVA space by unmapping
and mapping individual regions. In that case
::
void dma_iova_unlink(struct device *dev, struct dma_iova_state *state,
size_t offset, size_t size, enum dma_data_direction dir,
unsigned long attrs);
is used to unmap a range previously mapped, and
::
void dma_iova_free(struct device *dev, struct dma_iova_state *state);
is used to free the IOVA space. All regions must have been unmapped using
``dma_iova_unlink()`` before calling ``dma_iova_free()``.
Part II - Non-coherent DMA allocations
--------------------------------------

View File

@@ -12477,6 +12477,7 @@ F: Documentation/dev-tools/kunit/
F: include/kunit/
F: lib/kunit/
F: rust/kernel/kunit.rs
F: rust/macros/kunit.rs
F: scripts/rustdoc_test_*
F: tools/testing/kunit/

View File

@@ -78,6 +78,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
/* compatibility flags */
#define MAP_FILE 0

View File

@@ -329,6 +329,7 @@ CONFIG_REGMAP_KUNIT=m
CONFIG_ARM_SCMI_PROTOCOL=y
CONFIG_ARM_SCMI_TRANSPORT_VIRTIO=y
CONFIG_ARM_SCPI_PROTOCOL=y
CONFIG_ARM_SDE_INTERFACE=y
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
CONFIG_GNSS=m
CONFIG_ZRAM=m
@@ -651,6 +652,7 @@ CONFIG_POWERCAP=y
CONFIG_IDLE_INJECT=y
CONFIG_ANDROID_BINDER_IPC=y
CONFIG_ANDROID_BINDERFS=y
CONFIG_ANDROID_BINDER_IPC_RUST=m
CONFIG_ANDROID_VENDOR_HOOKS=y
CONFIG_ANDROID_DEBUG_KINFO=y
CONFIG_LIBNVDIMM=y
@@ -789,6 +791,8 @@ CONFIG_UBSAN_TRAP=y
CONFIG_PAGE_OWNER=y
CONFIG_PAGE_PINNER=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEM_ALLOC_PROFILING=y
# CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT is not set
CONFIG_KASAN=y
CONFIG_KASAN_HW_TAGS=y
CONFIG_KFENCE=y

View File

@@ -370,12 +370,14 @@
/*
* ISS values for SME traps
*/
#define ESR_ELx_SME_ISS_SMTC_MASK GENMASK(2, 0)
#define ESR_ELx_SME_ISS_SMTC(esr) ((esr) & ESR_ELx_SME_ISS_SMTC_MASK)
#define ESR_ELx_SME_ISS_SME_DISABLED 0
#define ESR_ELx_SME_ISS_ILL 1
#define ESR_ELx_SME_ISS_SM_DISABLED 2
#define ESR_ELx_SME_ISS_ZA_DISABLED 3
#define ESR_ELx_SME_ISS_ZT_DISABLED 4
#define ESR_ELx_SME_ISS_SMTC_SME_DISABLED 0
#define ESR_ELx_SME_ISS_SMTC_ILL 1
#define ESR_ELx_SME_ISS_SMTC_SM_DISABLED 2
#define ESR_ELx_SME_ISS_SMTC_ZA_DISABLED 3
#define ESR_ELx_SME_ISS_SMTC_ZT_DISABLED 4
/* ISS field definitions for MOPS exceptions */
#define ESR_ELx_MOPS_ISS_MEM_INST (UL(1) << 24)

View File

@@ -6,6 +6,7 @@
#define __ASM_FP_H
#include <asm/errno.h>
#include <asm/percpu.h>
#include <asm/ptrace.h>
#include <asm/processor.h>
#include <asm/sigcontext.h>
@@ -76,7 +77,6 @@ extern void fpsimd_load_state(struct user_fpsimd_state *state);
extern void fpsimd_thread_switch(struct task_struct *next);
extern void fpsimd_flush_thread(void);
extern void fpsimd_signal_preserve_current_state(void);
extern void fpsimd_preserve_current_state(void);
extern void fpsimd_restore_current_state(void);
extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
@@ -94,9 +94,12 @@ struct cpu_fp_state {
enum fp_type to_save;
};
DECLARE_PER_CPU(struct cpu_fp_state, fpsimd_last_state);
extern void fpsimd_bind_state_to_cpu(struct cpu_fp_state *fp_state);
extern void fpsimd_flush_task_state(struct task_struct *target);
extern void fpsimd_save_and_flush_current_state(void);
extern void fpsimd_save_and_flush_cpu_state(void);
static inline bool thread_sm_enabled(struct thread_struct *thread)
@@ -109,6 +112,8 @@ static inline bool thread_za_enabled(struct thread_struct *thread)
return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK);
}
extern void task_smstop_sm(struct task_struct *task);
/* Maximum VL that SVE/SME VL-agnostic software can transparently support */
#define VL_ARCH_MAX 0x100
@@ -196,10 +201,8 @@ struct vl_info {
extern void sve_alloc(struct task_struct *task, bool flush);
extern void fpsimd_release_task(struct task_struct *task);
extern void fpsimd_sync_to_sve(struct task_struct *task);
extern void fpsimd_force_sync_to_sve(struct task_struct *task);
extern void sve_sync_to_fpsimd(struct task_struct *task);
extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
extern void fpsimd_sync_from_effective_state(struct task_struct *task);
extern void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task);
extern int vec_set_vector_length(struct task_struct *task, enum vec_type type,
unsigned long vl, unsigned long flags);
@@ -293,14 +296,29 @@ static inline bool sve_vq_available(unsigned int vq)
return vq_available(ARM64_VEC_SVE, vq);
}
size_t sve_state_size(struct task_struct const *task);
static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl)
{
unsigned int vl = max(sve_vl, sme_vl);
return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl));
}
/*
* Return how many bytes of memory are required to store the full SVE
* state for task, given task's currently configured vector length.
*/
static inline size_t sve_state_size(struct task_struct const *task)
{
unsigned int sve_vl = task_get_sve_vl(task);
unsigned int sme_vl = task_get_sme_vl(task);
return __sve_state_size(sve_vl, sme_vl);
}
#else /* ! CONFIG_ARM64_SVE */
static inline void sve_alloc(struct task_struct *task, bool flush) { }
static inline void fpsimd_release_task(struct task_struct *task) { }
static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }
static inline void fpsimd_sync_from_effective_state(struct task_struct *task) { }
static inline void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { }
static inline int sve_max_virtualisable_vl(void)
{
@@ -334,6 +352,11 @@ static inline void vec_update_vq_map(enum vec_type t) { }
static inline int vec_verify_vq_map(enum vec_type t) { return 0; }
static inline void sve_setup(void) { }
static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl)
{
return 0;
}
static inline size_t sve_state_size(struct task_struct const *task)
{
return 0;
@@ -386,6 +409,16 @@ extern int sme_set_current_vl(unsigned long arg);
extern int sme_get_current_vl(void);
extern void sme_suspend_exit(void);
static inline size_t __sme_state_size(unsigned int sme_vl)
{
size_t size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(sme_vl));
if (system_supports_sme2())
size += ZT_SIG_REG_SIZE;
return size;
}
/*
* Return how many bytes of memory are required to store the full SME
* specific state for task, given task's currently configured vector
@@ -393,15 +426,7 @@ extern void sme_suspend_exit(void);
*/
static inline size_t sme_state_size(struct task_struct const *task)
{
unsigned int vl = task_get_sme_vl(task);
size_t size;
size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl));
if (system_supports_sme2())
size += ZT_SIG_REG_SIZE;
return size;
return __sme_state_size(task_get_sme_vl(task));
}
#else
@@ -422,6 +447,11 @@ static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; }
static inline int sme_get_current_vl(void) { return -EINVAL; }
static inline void sme_suspend_exit(void) { }
static inline size_t __sme_state_size(unsigned int sme_vl)
{
return 0;
}
static inline size_t sme_state_size(struct task_struct const *task)
{
return 0;

View File

@@ -88,7 +88,9 @@ struct kvm_hyp_memcache {
phys_addr_t head;
unsigned long nr_pages;
unsigned long flags;
struct pkvm_mapping *mapping; /* only used from EL1 */
void *mapping; /* struct pkvm_mapping *, only used from EL1 */
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
};
static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
@@ -299,6 +301,7 @@ struct kvm_protected_vm {
gpa_t pvmfw_load_addr;
bool enabled;
u32 ffa_support;
bool smc_forwarded;
};
struct kvm_mpidr_data {
@@ -660,23 +663,13 @@ struct kvm_host_data {
struct kvm_cpu_context host_ctxt;
/*
* All pointers in this union are hyp VA.
* Hyp VA.
* sve_state is only used in pKVM and if system_supports_sve().
*/
union {
struct user_fpsimd_state *fpsimd_state;
struct cpu_sve_state *sve_state;
};
struct cpu_sve_state *sve_state;
union {
/* HYP VA pointer to the host storage for FPMR */
u64 *fpmr_ptr;
/*
* Used by pKVM only, as it needs to provide storage
* for the host
*/
u64 fpmr;
};
/* Used by pKVM only. */
u64 fpmr;
/* Ownership of the FP regs */
enum {
@@ -1070,10 +1063,6 @@ struct kvm_vcpu_arch {
/* pKVM host vcpu state is dirty, needs resync (nVHE-only) */
#define PKVM_HOST_STATE_DIRTY __vcpu_single_flag(iflags, BIT(7))
/* SVE enabled for host EL0 */
#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0))
/* SME enabled for EL0 */
#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1))
/* Physical CPU not in supported_cpus */
#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2))
/* WFIT instruction trapped */

View File

@@ -26,7 +26,7 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm);
bool pkvm_is_hyp_created(struct kvm *kvm);
int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu);
void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa);
int pvkm_enable_smc_forwarding(struct file *kvm_file);
/*
* This functions as an allow-list of protected VM capabilities.
* Features not explicitly allowed by this function are denied.

View File

@@ -20,6 +20,11 @@ enum pkvm_psci_notification {
PKVM_PSCI_CPU_ENTRY,
};
struct pkvm_sglist_page {
u64 pfn : 40;
u8 order;
} __packed;
/**
* struct pkvm_module_ops - pKVM modules callbacks
* @create_private_mapping: Map a memory region into the hypervisor private
@@ -107,6 +112,10 @@ enum pkvm_psci_notification {
* order depends on the registration order. If no
* handler return True, the SMC is forwarded to
* EL3.
* @register_guest_smc_handler: @cb is called when guest identified by the
* pkvm_handle issues an SMC that pKVM couldn't
* handle. If @cb returns false, then unsupported
* operation error is returned back to the guest.
* @register_default_trap_handler:
* @cb is called whenever EL2 traps EL1 and pKVM
* has not handled it. If @cb returns false, the
@@ -132,6 +141,9 @@ enum pkvm_psci_notification {
* full control is given to the hypervisor.
* @host_donate_hyp_prot: As host_donate_hyp_prot, but this variant sets
* the prot of the hyp.
* @host_donate_sglist_hyp: Similar to host_donate_hyp but take an array of PFNs
* (kvm_sglist_page) as an argument. This intends to
* batch IOMMU updates.
* @hyp_donate_host: The page @pfn whom control has previously been
* given to the hypervisor (@host_donate_hyp) is
* given back to the host.
@@ -218,6 +230,9 @@ struct pkvm_module_ops {
int (*host_stage2_enable_lazy_pte)(u64 addr, u64 nr_pages);
int (*host_stage2_disable_lazy_pte)(u64 addr, u64 nr_pages);
int (*register_host_smc_handler)(bool (*cb)(struct user_pt_regs *));
int (*register_guest_smc_handler)(bool (*cb)(struct arm_smccc_1_2_regs *regs,
struct arm_smccc_1_2_regs *res,
pkvm_handle_t handle));
int (*register_default_trap_handler)(bool (*cb)(struct user_pt_regs *));
int (*register_illegal_abt_notifier)(void (*cb)(struct user_pt_regs *));
int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct user_pt_regs *));
@@ -225,6 +240,7 @@ struct pkvm_module_ops {
int (*register_unmask_serror)(bool (*unmask)(void), void (*mask)(void));
int (*host_donate_hyp)(u64 pfn, u64 nr_pages, bool accept_mmio);
int (*host_donate_hyp_prot)(u64 pfn, u64 nr_pages, bool accept_mmio, enum kvm_pgtable_prot prot);
int (*host_donate_sglist_hyp)(struct pkvm_sglist_page *sglist, size_t nr_pages);
int (*hyp_donate_host)(u64 pfn, u64 nr_pages);
int (*host_share_hyp)(u64 pfn);
int (*host_unshare_hyp)(u64 pfn);

View File

@@ -396,20 +396,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
* As per the ABI exit SME streaming mode and clear the SVE state not
* shared with FPSIMD on syscall entry.
*/
static inline void fp_user_discard(void)
static inline void fpsimd_syscall_enter(void)
{
/*
* If SME is active then exit streaming mode. If ZA is active
* then flush the SVE registers but leave userspace access to
* both SVE and SME enabled, otherwise disable SME for the
* task and fall through to disabling SVE too. This means
* that after a syscall we never have any streaming mode
* register state to track, if this changes the KVM code will
* need updating.
*/
/* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */
if (system_supports_sme())
sme_smstop_sm();
/*
* The CPU is not in streaming mode. If non-streaming SVE is not
* supported, there is no SVE state that needs to be discarded.
*/
if (!system_supports_sve())
return;
@@ -419,6 +415,33 @@ static inline void fp_user_discard(void)
sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1;
sve_flush_live(true, sve_vq_minus_one);
}
/*
* Any live non-FPSIMD SVE state has been zeroed. Allow
* fpsimd_save_user_state() to lazily discard SVE state until either
* the live state is unbound or fpsimd_syscall_exit() is called.
*/
__this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD);
}
static __always_inline void fpsimd_syscall_exit(void)
{
if (!system_supports_sve())
return;
/*
* The current task's user FPSIMD/SVE/SME state is now bound to this
* CPU. The fpsimd_last_state.to_save value is either:
*
* - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU
* since fpsimd_syscall_enter().
*
* - FP_STATE_CURRENT, if the state has been reloaded on this CPU at
* any point.
*
* Reset this to FP_STATE_CURRENT to stop lazy discarding.
*/
__this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT);
}
UNHANDLED(el1t, 64, sync)
@@ -710,10 +733,11 @@ static void noinstr el0_svc(struct pt_regs *regs)
{
enter_from_user_mode(regs);
cortex_a76_erratum_1463225_svc_handler();
fp_user_discard();
fpsimd_syscall_enter();
local_daif_restore(DAIF_PROCCTX);
do_el0_svc(regs);
exit_to_user_mode(regs);
fpsimd_syscall_exit();
}
static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr)

View File

@@ -119,7 +119,7 @@
* whatever is in the FPSIMD registers is not saved to memory, but discarded.
*/
static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state);
DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state);
__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = {
#ifdef CONFIG_ARM64_SVE
@@ -359,20 +359,15 @@ static void task_fpsimd_load(void)
WARN_ON(preemptible());
WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE));
if (system_supports_fpmr())
write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR);
if (system_supports_sve() || system_supports_sme()) {
switch (current->thread.fp_type) {
case FP_STATE_FPSIMD:
/* Stop tracking SVE for this task until next use. */
if (test_and_clear_thread_flag(TIF_SVE))
sve_user_disable();
clear_thread_flag(TIF_SVE);
break;
case FP_STATE_SVE:
if (!thread_sm_enabled(&current->thread) &&
!WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)))
sve_user_enable();
if (!thread_sm_enabled(&current->thread))
WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE));
if (test_thread_flag(TIF_SVE))
sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1);
@@ -413,6 +408,9 @@ static void task_fpsimd_load(void)
restore_ffr = system_supports_fa64();
}
if (system_supports_fpmr())
write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR);
if (restore_sve_regs) {
WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE);
sve_load_state(sve_pffr(&current->thread),
@@ -453,12 +451,15 @@ static void fpsimd_save_user_state(void)
*(last->fpmr) = read_sysreg_s(SYS_FPMR);
/*
* If a task is in a syscall the ABI allows us to only
* preserve the state shared with FPSIMD so don't bother
* saving the full SVE state in that case.
* Save SVE state if it is live.
*
* The syscall ABI discards live SVE state at syscall entry. When
* entering a syscall, fpsimd_syscall_enter() sets to_save to
* FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until
* either new SVE state is loaded+bound or fpsimd_syscall_exit() is
* called prior to a return to userspace.
*/
if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) &&
!in_syscall(current_pt_regs())) ||
if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) ||
last->to_save == FP_STATE_SVE) {
save_sve_regs = true;
save_ffr = true;
@@ -651,7 +652,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst,
* task->thread.uw.fpsimd_state must be up to date before calling this
* function.
*/
static void fpsimd_to_sve(struct task_struct *task)
static inline void fpsimd_to_sve(struct task_struct *task)
{
unsigned int vq;
void *sst = task->thread.sve_state;
@@ -675,7 +676,7 @@ static void fpsimd_to_sve(struct task_struct *task)
* bytes of allocated kernel memory.
* task->thread.sve_state must be up to date before calling this function.
*/
static void sve_to_fpsimd(struct task_struct *task)
static inline void sve_to_fpsimd(struct task_struct *task)
{
unsigned int vq, vl;
void const *sst = task->thread.sve_state;
@@ -694,6 +695,28 @@ static void sve_to_fpsimd(struct task_struct *task)
}
}
static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd)
{
memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs));
}
/*
* Simulate the effects of an SMSTOP SM instruction.
*/
void task_smstop_sm(struct task_struct *task)
{
if (!thread_sm_enabled(&task->thread))
return;
__fpsimd_zero_vregs(&task->thread.uw.fpsimd_state);
task->thread.uw.fpsimd_state.fpsr = 0x0800009f;
if (system_supports_fpmr())
task->thread.uw.fpmr = 0;
task->thread.svcr &= ~SVCR_SM_MASK;
task->thread.fp_type = FP_STATE_FPSIMD;
}
void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p)
{
write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK,
@@ -701,39 +724,12 @@ void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p)
}
#ifdef CONFIG_ARM64_SVE
/*
* Call __sve_free() directly only if you know task can't be scheduled
* or preempted.
*/
static void __sve_free(struct task_struct *task)
static void sve_free(struct task_struct *task)
{
kfree(task->thread.sve_state);
task->thread.sve_state = NULL;
}
static void sve_free(struct task_struct *task)
{
WARN_ON(test_tsk_thread_flag(task, TIF_SVE));
__sve_free(task);
}
/*
* Return how many bytes of memory are required to store the full SVE
* state for task, given task's currently configured vector length.
*/
size_t sve_state_size(struct task_struct const *task)
{
unsigned int vl = 0;
if (system_supports_sve())
vl = task_get_sve_vl(task);
if (system_supports_sme())
vl = max(vl, task_get_sme_vl(task));
return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl));
}
/*
* Ensure that task->thread.sve_state is allocated and sufficiently large.
*
@@ -758,69 +754,34 @@ void sve_alloc(struct task_struct *task, bool flush)
kzalloc(sve_state_size(task), GFP_KERNEL);
}
/*
* Force the FPSIMD state shared with SVE to be updated in the SVE state
* even if the SVE state is the current active state.
* Ensure that task->thread.uw.fpsimd_state is up to date with respect to the
* task's currently effective FPSIMD/SVE state.
*
* This should only be called by ptrace. task must be non-runnable.
* task->thread.sve_state must point to at least sve_state_size(task)
* bytes of allocated kernel memory.
* The task's FPSIMD/SVE/SME state must not be subject to concurrent
* manipulation.
*/
void fpsimd_force_sync_to_sve(struct task_struct *task)
{
fpsimd_to_sve(task);
}
/*
* Ensure that task->thread.sve_state is up to date with respect to
* the user task, irrespective of when SVE is in use or not.
*
* This should only be called by ptrace. task must be non-runnable.
* task->thread.sve_state must point to at least sve_state_size(task)
* bytes of allocated kernel memory.
*/
void fpsimd_sync_to_sve(struct task_struct *task)
{
if (!test_tsk_thread_flag(task, TIF_SVE) &&
!thread_sm_enabled(&task->thread))
fpsimd_to_sve(task);
}
/*
* Ensure that task->thread.uw.fpsimd_state is up to date with respect to
* the user task, irrespective of whether SVE is in use or not.
*
* This should only be called by ptrace. task must be non-runnable.
* task->thread.sve_state must point to at least sve_state_size(task)
* bytes of allocated kernel memory.
*/
void sve_sync_to_fpsimd(struct task_struct *task)
void fpsimd_sync_from_effective_state(struct task_struct *task)
{
if (task->thread.fp_type == FP_STATE_SVE)
sve_to_fpsimd(task);
}
/*
* Ensure that task->thread.sve_state is up to date with respect to
* the task->thread.uw.fpsimd_state.
* Ensure that the task's currently effective FPSIMD/SVE state is up to date
* with respect to task->thread.uw.fpsimd_state, zeroing any effective
* non-FPSIMD (S)SVE state.
*
* This should only be called by ptrace to merge new FPSIMD register
* values into a task for which SVE is currently active.
* task must be non-runnable.
* task->thread.sve_state must point to at least sve_state_size(task)
* bytes of allocated kernel memory.
* task->thread.uw.fpsimd_state must already have been initialised with
* the new FPSIMD register values to be merged in.
* The task's FPSIMD/SVE/SME state must not be subject to concurrent
* manipulation.
*/
void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task)
{
unsigned int vq;
void *sst = task->thread.sve_state;
struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state;
if (!test_tsk_thread_flag(task, TIF_SVE) &&
!thread_sm_enabled(&task->thread))
if (task->thread.fp_type != FP_STATE_SVE)
return;
vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread));
@@ -829,10 +790,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
__fpsimd_to_sve(sst, fst, vq);
}
static int change_live_vector_length(struct task_struct *task,
enum vec_type type,
unsigned long vl)
{
unsigned int sve_vl = task_get_sve_vl(task);
unsigned int sme_vl = task_get_sme_vl(task);
void *sve_state = NULL, *sme_state = NULL;
if (type == ARM64_VEC_SME)
sme_vl = vl;
else
sve_vl = vl;
/*
* Allocate the new sve_state and sme_state before freeing the old
* copies so that allocation failure can be handled without needing to
* mutate the task's state in any way.
*
* Changes to the SVE vector length must not discard live ZA state or
* clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64
* ZA lazy saving scheme may attempt to change the SVE vector length
* while unsaved/dormant ZA state exists.
*/
sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL);
if (!sve_state)
goto out_mem;
if (type == ARM64_VEC_SME) {
sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL);
if (!sme_state)
goto out_mem;
}
if (task == current)
fpsimd_save_and_flush_current_state();
else
fpsimd_flush_task_state(task);
/*
* Always preserve PSTATE.SM and the effective FPSIMD state, zeroing
* other SVE state.
*/
fpsimd_sync_from_effective_state(task);
task_set_vl(task, type, vl);
kfree(task->thread.sve_state);
task->thread.sve_state = sve_state;
fpsimd_sync_to_effective_state_zeropad(task);
if (type == ARM64_VEC_SME) {
task->thread.svcr &= ~SVCR_ZA_MASK;
kfree(task->thread.sme_state);
task->thread.sme_state = sme_state;
}
return 0;
out_mem:
kfree(sve_state);
kfree(sme_state);
return -ENOMEM;
}
int vec_set_vector_length(struct task_struct *task, enum vec_type type,
unsigned long vl, unsigned long flags)
{
bool free_sme = false;
bool onexec = flags & PR_SVE_SET_VL_ONEXEC;
bool inherit = flags & PR_SVE_VL_INHERIT;
if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
PR_SVE_SET_VL_ONEXEC))
@@ -852,71 +876,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type,
vl = find_supported_vector_length(type, vl);
if (flags & (PR_SVE_VL_INHERIT |
PR_SVE_SET_VL_ONEXEC))
if (!onexec && vl != task_get_vl(task, type)) {
if (change_live_vector_length(task, type, vl))
return -ENOMEM;
}
if (onexec || inherit)
task_set_vl_onexec(task, type, vl);
else
/* Reset VL to system default on next exec: */
task_set_vl_onexec(task, type, 0);
/* Only actually set the VL if not deferred: */
if (flags & PR_SVE_SET_VL_ONEXEC)
goto out;
if (vl == task_get_vl(task, type))
goto out;
/*
* To ensure the FPSIMD bits of the SVE vector registers are preserved,
* write any live register state back to task_struct, and convert to a
* regular FPSIMD thread.
*/
if (task == current) {
get_cpu_fpsimd_context();
fpsimd_save_user_state();
}
fpsimd_flush_task_state(task);
if (test_and_clear_tsk_thread_flag(task, TIF_SVE) ||
thread_sm_enabled(&task->thread)) {
sve_to_fpsimd(task);
task->thread.fp_type = FP_STATE_FPSIMD;
}
if (system_supports_sme()) {
if (type == ARM64_VEC_SME ||
!(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) {
/*
* We are changing the SME VL or weren't using
* SME anyway, discard the state and force a
* reallocation.
*/
task->thread.svcr &= ~(SVCR_SM_MASK |
SVCR_ZA_MASK);
clear_tsk_thread_flag(task, TIF_SME);
free_sme = true;
}
}
if (task == current)
put_cpu_fpsimd_context();
task_set_vl(task, type, vl);
/*
* Free the changed states if they are not in use, SME will be
* reallocated to the correct size on next use and we just
* allocate SVE now in case it is needed for use in streaming
* mode.
*/
sve_free(task);
sve_alloc(task, true);
if (free_sme)
sme_free(task);
out:
update_tsk_thread_flag(task, vec_vl_inherit_flag(type),
flags & PR_SVE_VL_INHERIT);
@@ -1212,7 +1182,7 @@ void __init sve_setup(void)
*/
void fpsimd_release_task(struct task_struct *dead_task)
{
__sve_free(dead_task);
sve_free(dead_task);
sme_free(dead_task);
}
@@ -1436,7 +1406,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs)
* If this not a trap due to SME being disabled then something
* is being used in the wrong mode, report as SIGILL.
*/
if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) {
if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) {
force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
return;
}
@@ -1460,6 +1430,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs)
sme_set_vq(vq_minus_one);
fpsimd_bind_task_to_cpu();
} else {
fpsimd_flush_task_state(current);
}
put_cpu_fpsimd_context();
@@ -1573,8 +1545,8 @@ void fpsimd_thread_switch(struct task_struct *next)
fpsimd_save_user_state();
if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) {
fpsimd_load_kernel_state(next);
fpsimd_flush_cpu_state();
fpsimd_load_kernel_state(next);
} else {
/*
* Fix up TIF_FOREIGN_FPSTATE to correctly describe next's
@@ -1661,6 +1633,9 @@ void fpsimd_flush_thread(void)
current->thread.svcr = 0;
}
if (system_supports_fpmr())
current->thread.uw.fpmr = 0;
current->thread.fp_type = FP_STATE_FPSIMD;
put_cpu_fpsimd_context();
@@ -1682,43 +1657,6 @@ void fpsimd_preserve_current_state(void)
put_cpu_fpsimd_context();
}
/*
* Like fpsimd_preserve_current_state(), but ensure that
* current->thread.uw.fpsimd_state is updated so that it can be copied to
* the signal frame.
*/
void fpsimd_signal_preserve_current_state(void)
{
fpsimd_preserve_current_state();
if (current->thread.fp_type == FP_STATE_SVE)
sve_to_fpsimd(current);
}
/*
* Called by KVM when entering the guest.
*/
void fpsimd_kvm_prepare(void)
{
if (!system_supports_sve())
return;
/*
* KVM does not save host SVE state since we can only enter
* the guest from a syscall so the ABI means that only the
* non-saved SVE state needs to be saved. If we have left
* SVE enabled for performance reasons then update the task
* state to be FPSIMD only.
*/
get_cpu_fpsimd_context();
if (test_and_clear_thread_flag(TIF_SVE)) {
sve_to_fpsimd(current);
current->thread.fp_type = FP_STATE_FPSIMD;
}
put_cpu_fpsimd_context();
}
/*
* Associate current's FPSIMD context with this cpu
* The caller must have ownership of the cpu FPSIMD context before calling
@@ -1811,30 +1749,14 @@ void fpsimd_restore_current_state(void)
put_cpu_fpsimd_context();
}
/*
* Load an updated userland FPSIMD state for 'current' from memory and set the
* flag that indicates that the FPSIMD register contents are the most recent
* FPSIMD state of 'current'. This is used by the signal code to restore the
* register state when returning from a signal handler in FPSIMD only cases,
* any SVE context will be discarded.
*/
void fpsimd_update_current_state(struct user_fpsimd_state const *state)
{
if (WARN_ON(!system_supports_fpsimd()))
return;
get_cpu_fpsimd_context();
current->thread.uw.fpsimd_state = *state;
if (test_thread_flag(TIF_SVE))
if (current->thread.fp_type == FP_STATE_SVE)
fpsimd_to_sve(current);
task_fpsimd_load();
fpsimd_bind_task_to_cpu();
clear_thread_flag(TIF_FOREIGN_FPSTATE);
put_cpu_fpsimd_context();
}
/*
@@ -1864,6 +1786,17 @@ void fpsimd_flush_task_state(struct task_struct *t)
barrier();
}
void fpsimd_save_and_flush_current_state(void)
{
if (!system_supports_fpsimd())
return;
get_cpu_fpsimd_context();
fpsimd_save_user_state();
fpsimd_flush_task_state(current);
put_cpu_fpsimd_context();
}
/*
* Save the FPSIMD state to memory and invalidate cpu view.
* This function must be called with preemption disabled.

View File

@@ -399,17 +399,6 @@ void __init init_feature_override(u64 boot_status, const void *fdt,
parse_cmdline(fdt, chosen);
/*
* ANDROID: Forcefully disable SME at runtime until it is fixed
* upstream (b/393087661). We prefer this to disabling
* CONFIG_ARM64_SME so that the impact of the fixes on KMI is
* minimised.
*/
id_aa64pfr1_override.mask |= ID_AA64PFR1_EL1_SME;
id_aa64pfr1_override.val &= ~ID_AA64PFR1_EL1_SME;
id_aa64smfr0_override.mask = GENMASK(63, 0);
id_aa64smfr0_override.val = 0;
for (i = 0; i < ARRAY_SIZE(regs); i++) {
reg = prel64_pointer(regs[i].reg);
override = prel64_pointer(reg->override);

View File

@@ -299,50 +299,34 @@ void arch_release_task_struct(struct task_struct *tsk)
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
if (current->mm)
fpsimd_preserve_current_state();
/*
* The current/src task's FPSIMD state may or may not be live, and may
* have been altered by ptrace after entry to the kernel. Save the
* effective FPSIMD state so that this will be copied into dst.
*/
fpsimd_save_and_flush_current_state();
fpsimd_sync_from_effective_state(src);
*dst = *src;
/*
* Detach src's sve_state (if any) from dst so that it does not
* get erroneously used or freed prematurely. dst's copies
* will be allocated on demand later on if dst uses SVE.
* For consistency, also clear TIF_SVE here: this could be done
* later in copy_process(), but to avoid tripping up future
* maintainers it is best not to leave TIF flags and buffers in
* an inconsistent state, even temporarily.
* Drop stale reference to src's sve_state and convert dst to
* non-streaming FPSIMD mode.
*/
dst->thread.fp_type = FP_STATE_FPSIMD;
dst->thread.sve_state = NULL;
clear_tsk_thread_flag(dst, TIF_SVE);
task_smstop_sm(dst);
/*
* In the unlikely event that we create a new thread with ZA
* enabled we should retain the ZA and ZT state so duplicate
* it here. This may be shortly freed if we exec() or if
* CLONE_SETTLS but it's simpler to do it here. To avoid
* confusing the rest of the code ensure that we have a
* sve_state allocated whenever sme_state is allocated.
* Drop stale reference to src's sme_state and ensure dst has ZA
* disabled.
*
* When necessary, ZA will be inherited later in copy_thread_za().
*/
if (thread_za_enabled(&src->thread)) {
dst->thread.sve_state = kzalloc(sve_state_size(src),
GFP_KERNEL);
if (!dst->thread.sve_state)
return -ENOMEM;
dst->thread.sme_state = kmemdup(src->thread.sme_state,
sme_state_size(src),
GFP_KERNEL);
if (!dst->thread.sme_state) {
kfree(dst->thread.sve_state);
dst->thread.sve_state = NULL;
return -ENOMEM;
}
} else {
dst->thread.sme_state = NULL;
clear_tsk_thread_flag(dst, TIF_SME);
}
dst->thread.fp_type = FP_STATE_FPSIMD;
dst->thread.sme_state = NULL;
clear_tsk_thread_flag(dst, TIF_SME);
dst->thread.svcr &= ~SVCR_ZA_MASK;
/* clear any pending asynchronous tag fault raised by the parent */
clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT);
@@ -350,6 +334,31 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
return 0;
}
static int copy_thread_za(struct task_struct *dst, struct task_struct *src)
{
if (!thread_za_enabled(&src->thread))
return 0;
dst->thread.sve_state = kzalloc(sve_state_size(src),
GFP_KERNEL);
if (!dst->thread.sve_state)
return -ENOMEM;
dst->thread.sme_state = kmemdup(src->thread.sme_state,
sme_state_size(src),
GFP_KERNEL);
if (!dst->thread.sme_state) {
kfree(dst->thread.sve_state);
dst->thread.sve_state = NULL;
return -ENOMEM;
}
set_tsk_thread_flag(dst, TIF_SME);
dst->thread.svcr |= SVCR_ZA_MASK;
return 0;
}
asmlinkage void ret_from_fork(void) asm("ret_from_fork");
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
@@ -358,6 +367,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
unsigned long stack_start = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
int ret;
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
@@ -381,8 +391,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
* out-of-sync with the saved value.
*/
*task_user_tls(p) = read_sysreg(tpidr_el0);
if (system_supports_tpidr2())
p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
if (system_supports_poe())
p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
@@ -395,13 +403,39 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
}
/*
* If a TLS pointer was passed to clone, use it for the new
* thread. We also reset TPIDR2 if it's in use.
* Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and
* TPIDR2 need to be manipulated as a pair, and either both
* need to be inherited or both need to be reset.
*
* Within a process, child threads must not inherit their
* parent's TPIDR2 value or they may clobber their parent's
* stack at some later point.
*
* When a process is fork()'d, the child must inherit ZA and
* TPIDR2 from its parent in case there was dormant ZA state.
*
* Use CLONE_VM to determine when the child will share the
* address space with the parent, and cannot safely inherit the
* state.
*/
if (clone_flags & CLONE_SETTLS) {
p->thread.uw.tp_value = tls;
p->thread.tpidr2_el0 = 0;
if (system_supports_sme()) {
if (!(clone_flags & CLONE_VM)) {
p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
ret = copy_thread_za(p, current);
if (ret)
return ret;
} else {
p->thread.tpidr2_el0 = 0;
WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK);
}
}
/*
* If a TLS pointer was passed to clone, use it for the new
* thread.
*/
if (clone_flags & CLONE_SETTLS)
p->thread.uw.tp_value = tls;
} else {
/*
* A kthread has no context to ERET to, so ensure any buggy

View File

@@ -595,7 +595,7 @@ static int __fpr_get(struct task_struct *target,
{
struct user_fpsimd_state *uregs;
sve_sync_to_fpsimd(target);
fpsimd_sync_from_effective_state(target);
uregs = &target->thread.uw.fpsimd_state;
@@ -627,7 +627,7 @@ static int __fpr_set(struct task_struct *target,
* Ensure target->thread.uw.fpsimd_state is up to date, so that a
* short copyin can't resurrect stale data.
*/
sve_sync_to_fpsimd(target);
fpsimd_sync_from_effective_state(target);
newstate = target->thread.uw.fpsimd_state;
@@ -654,7 +654,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
sve_sync_from_fpsimd_zeropad(target);
fpsimd_sync_to_effective_state_zeropad(target);
fpsimd_flush_task_state(target);
return ret;
@@ -776,6 +776,11 @@ static void sve_init_header_from_task(struct user_sve_header *header,
task_type = ARM64_VEC_SVE;
active = (task_type == type);
if (active && target->thread.fp_type == FP_STATE_SVE)
header->flags = SVE_PT_REGS_SVE;
else
header->flags = SVE_PT_REGS_FPSIMD;
switch (type) {
case ARM64_VEC_SVE:
if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
@@ -790,19 +795,14 @@ static void sve_init_header_from_task(struct user_sve_header *header,
return;
}
if (active) {
if (target->thread.fp_type == FP_STATE_FPSIMD) {
header->flags |= SVE_PT_REGS_FPSIMD;
} else {
header->flags |= SVE_PT_REGS_SVE;
}
}
header->vl = task_get_vl(target, type);
vq = sve_vq_from_vl(header->vl);
header->max_vl = vec_max_vl(type);
header->size = SVE_PT_SIZE(vq, header->flags);
if (active)
header->size = SVE_PT_SIZE(vq, header->flags);
else
header->size = sizeof(header);
header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
SVE_PT_REGS_SVE);
}
@@ -821,18 +821,25 @@ static int sve_get_common(struct task_struct *target,
unsigned int vq;
unsigned long start, end;
if (target == current)
fpsimd_preserve_current_state();
/* Header */
sve_init_header_from_task(&header, target, type);
vq = sve_vq_from_vl(header.vl);
membuf_write(&to, &header, sizeof(header));
if (target == current)
fpsimd_preserve_current_state();
BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
/*
* When the requested vector type is not active, do not present data
* from the other mode to userspace.
*/
if (header.size == sizeof(header))
return 0;
switch ((header.flags & SVE_PT_REGS_MASK)) {
case SVE_PT_REGS_FPSIMD:
return __fpr_get(target, regset, to);
@@ -860,7 +867,7 @@ static int sve_get_common(struct task_struct *target,
return membuf_zero(&to, end - start);
default:
return 0;
BUILD_BUG();
}
}
@@ -884,6 +891,9 @@ static int sve_set_common(struct task_struct *target,
struct user_sve_header header;
unsigned int vq;
unsigned long start, end;
bool fpsimd;
fpsimd_flush_task_state(target);
/* Header */
if (count < sizeof(header))
@@ -891,7 +901,16 @@ static int sve_set_common(struct task_struct *target,
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
0, sizeof(header));
if (ret)
goto out;
return ret;
/*
* Streaming SVE data is always stored and presented in SVE format.
* Require the user to provide SVE formatted data for consistency, and
* to avoid the risk that we configure the task into an invalid state.
*/
fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD;
if (fpsimd && type == ARM64_VEC_SME)
return -EINVAL;
/*
* Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by
@@ -900,88 +919,68 @@ static int sve_set_common(struct task_struct *target,
ret = vec_set_vector_length(target, type, header.vl,
((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
if (ret)
goto out;
return ret;
/* Allocate SME storage if necessary, preserving any existing ZA/ZT state */
if (type == ARM64_VEC_SME) {
sme_alloc(target, false);
if (!target->thread.sme_state)
return -ENOMEM;
}
/* Allocate SVE storage if necessary, zeroing any existing SVE state */
if (!fpsimd) {
sve_alloc(target, true);
if (!target->thread.sve_state)
return -ENOMEM;
}
/* Actual VL set may be less than the user asked for: */
vq = sve_vq_from_vl(task_get_vl(target, type));
/* Enter/exit streaming mode */
if (system_supports_sme()) {
u64 old_svcr = target->thread.svcr;
switch (type) {
case ARM64_VEC_SVE:
target->thread.svcr &= ~SVCR_SM_MASK;
set_tsk_thread_flag(target, TIF_SVE);
break;
case ARM64_VEC_SME:
target->thread.svcr |= SVCR_SM_MASK;
/*
* Disable traps and ensure there is SME storage but
* preserve any currently set values in ZA/ZT.
*/
sme_alloc(target, false);
set_tsk_thread_flag(target, TIF_SME);
break;
default:
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out;
return -EINVAL;
}
/*
* If we switched then invalidate any existing SVE
* state and ensure there's storage.
*/
if (target->thread.svcr != old_svcr)
sve_alloc(target, true);
}
/* Always zero V regs, FPSR, and FPCR */
memset(&current->thread.uw.fpsimd_state, 0,
sizeof(current->thread.uw.fpsimd_state));
/* Registers: FPSIMD-only case */
BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
SVE_PT_FPSIMD_OFFSET);
if (fpsimd) {
clear_tsk_thread_flag(target, TIF_SVE);
target->thread.fp_type = FP_STATE_FPSIMD;
goto out;
ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
SVE_PT_FPSIMD_OFFSET);
return ret;
}
/*
* Otherwise: no registers or full SVE case. For backwards
* compatibility reasons we treat empty flags as SVE registers.
*/
/* Otherwise: no registers or full SVE case. */
target->thread.fp_type = FP_STATE_SVE;
/*
* If setting a different VL from the requested VL and there is
* register data, the data layout will be wrong: don't even
* try to set the registers in this case.
*/
if (count && vq != sve_vq_from_vl(header.vl)) {
ret = -EIO;
goto out;
}
sve_alloc(target, true);
if (!target->thread.sve_state) {
ret = -ENOMEM;
clear_tsk_thread_flag(target, TIF_SVE);
target->thread.fp_type = FP_STATE_FPSIMD;
goto out;
}
/*
* Ensure target->thread.sve_state is up to date with target's
* FPSIMD regs, so that a short copyin leaves trailing
* registers unmodified. Only enable SVE if we are
* configuring normal SVE, a system with streaming SVE may not
* have normal SVE.
*/
fpsimd_sync_to_sve(target);
if (type == ARM64_VEC_SVE)
set_tsk_thread_flag(target, TIF_SVE);
target->thread.fp_type = FP_STATE_SVE;
if (count && vq != sve_vq_from_vl(header.vl))
return -EIO;
BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
start = SVE_PT_SVE_OFFSET;
@@ -990,7 +989,7 @@ static int sve_set_common(struct task_struct *target,
target->thread.sve_state,
start, end);
if (ret)
goto out;
return ret;
start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);
@@ -1006,8 +1005,6 @@ static int sve_set_common(struct task_struct *target,
&target->thread.uw.fpsimd_state.fpsr,
start, end);
out:
fpsimd_flush_task_state(target);
return ret;
}

View File

@@ -250,6 +250,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
&current->thread.uw.fpsimd_state;
int err;
fpsimd_sync_from_effective_state(current);
/* copy the FP and status/control registers */
err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
__put_user_error(fpsimd->fpsr, &ctx->fpsr, err);
@@ -262,37 +264,46 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
return err ? -EFAULT : 0;
}
static int restore_fpsimd_context(struct user_ctxs *user)
static int read_fpsimd_context(struct user_fpsimd_state *fpsimd,
struct user_ctxs *user)
{
struct user_fpsimd_state fpsimd;
int err = 0;
int err;
/* check the size information */
if (user->fpsimd_size != sizeof(struct fpsimd_context))
return -EINVAL;
/* copy the FP and status/control registers */
err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs),
sizeof(fpsimd.vregs));
__get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err);
__get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err);
err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs),
sizeof(fpsimd->vregs));
__get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err);
__get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err);
return err ? -EFAULT : 0;
}
static int restore_fpsimd_context(struct user_ctxs *user)
{
struct user_fpsimd_state fpsimd;
int err;
err = read_fpsimd_context(&fpsimd, user);
if (err)
return err;
clear_thread_flag(TIF_SVE);
current->thread.svcr &= ~SVCR_SM_MASK;
current->thread.fp_type = FP_STATE_FPSIMD;
/* load the hardware registers from the fpsimd_state structure */
if (!err)
fpsimd_update_current_state(&fpsimd);
return err ? -EFAULT : 0;
fpsimd_update_current_state(&fpsimd);
return 0;
}
static int preserve_fpmr_context(struct fpmr_context __user *ctx)
{
int err = 0;
current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR);
__put_user_error(FPMR_MAGIC, &ctx->head.magic, err);
__put_user_error(sizeof(*ctx), &ctx->head.size, err);
__put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err);
@@ -310,7 +321,7 @@ static int restore_fpmr_context(struct user_ctxs *user)
__get_user_error(fpmr, &user->fpmr->fpmr, err);
if (!err)
write_sysreg_s(fpmr, SYS_FPMR);
current->thread.uw.fpmr = fpmr;
return err;
}
@@ -372,11 +383,6 @@ static int preserve_sve_context(struct sve_context __user *ctx)
err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved));
if (vq) {
/*
* This assumes that the SVE state has already been saved to
* the task struct by calling the function
* fpsimd_signal_preserve_current_state().
*/
err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET,
current->thread.sve_state,
SVE_SIG_REGS_SIZE(vq));
@@ -391,6 +397,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
unsigned int vl, vq;
struct user_fpsimd_state fpsimd;
u16 user_vl, flags;
bool sm;
if (user->sve_size < sizeof(*user->sve))
return -EINVAL;
@@ -400,7 +407,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
if (err)
return err;
if (flags & SVE_SIG_FLAG_SM) {
sm = flags & SVE_SIG_FLAG_SM;
if (sm) {
if (!system_supports_sme())
return -EINVAL;
@@ -420,28 +428,23 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
if (user_vl != vl)
return -EINVAL;
if (user->sve_size == sizeof(*user->sve)) {
clear_thread_flag(TIF_SVE);
current->thread.svcr &= ~SVCR_SM_MASK;
current->thread.fp_type = FP_STATE_FPSIMD;
goto fpsimd_only;
}
/*
* Non-streaming SVE state may be preserved without an SVE payload, in
* which case the SVE context only has a header with VL==0, and all
* state can be restored from the FPSIMD context.
*
* Streaming SVE state is always preserved with an SVE payload. For
* consistency and robustness, reject restoring streaming SVE state
* without an SVE payload.
*/
if (!sm && user->sve_size == sizeof(*user->sve))
return restore_fpsimd_context(user);
vq = sve_vq_from_vl(vl);
if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq))
return -EINVAL;
/*
* Careful: we are about __copy_from_user() directly into
* thread.sve_state with preemption enabled, so protection is
* needed to prevent a racing context switch from writing stale
* registers back over the new data.
*/
fpsimd_flush_task_state(current);
/* From now, fpsimd_thread_switch() won't touch thread.sve_state */
sve_alloc(current, true);
if (!current->thread.sve_state) {
clear_thread_flag(TIF_SVE);
@@ -461,19 +464,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
set_thread_flag(TIF_SVE);
current->thread.fp_type = FP_STATE_SVE;
fpsimd_only:
/* copy the FP and status/control registers */
/* restore_sigframe() already checked that user->fpsimd != NULL. */
err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs,
sizeof(fpsimd.vregs));
__get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err);
__get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err);
err = read_fpsimd_context(&fpsimd, user);
if (err)
return err;
/* load the hardware registers from the fpsimd_state structure */
if (!err)
fpsimd_update_current_state(&fpsimd);
/* Merge the FPSIMD registers into the SVE state */
fpsimd_update_current_state(&fpsimd);
return err ? -EFAULT : 0;
return 0;
}
#else /* ! CONFIG_ARM64_SVE */
@@ -493,13 +491,12 @@ extern int preserve_sve_context(void __user *ctx);
static int preserve_tpidr2_context(struct tpidr2_context __user *ctx)
{
u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
int err = 0;
current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
__put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err);
__put_user_error(sizeof(*ctx), &ctx->head.size, err);
__put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err);
__put_user_error(tpidr2_el0, &ctx->tpidr2, err);
return err;
}
@@ -541,11 +538,6 @@ static int preserve_za_context(struct za_context __user *ctx)
err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved));
if (vq) {
/*
* This assumes that the ZA state has already been saved to
* the task struct by calling the function
* fpsimd_signal_preserve_current_state().
*/
err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET,
current->thread.sme_state,
ZA_SIG_REGS_SIZE(vq));
@@ -580,16 +572,6 @@ static int restore_za_context(struct user_ctxs *user)
if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq))
return -EINVAL;
/*
* Careful: we are about __copy_from_user() directly into
* thread.sme_state with preemption enabled, so protection is
* needed to prevent a racing context switch from writing stale
* registers back over the new data.
*/
fpsimd_flush_task_state(current);
/* From now, fpsimd_thread_switch() won't touch thread.sve_state */
sme_alloc(current, true);
if (!current->thread.sme_state) {
current->thread.svcr &= ~SVCR_ZA_MASK;
@@ -627,11 +609,6 @@ static int preserve_zt_context(struct zt_context __user *ctx)
BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved));
err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved));
/*
* This assumes that the ZT state has already been saved to
* the task struct by calling the function
* fpsimd_signal_preserve_current_state().
*/
err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET,
thread_zt_state(&current->thread),
ZT_SIG_REGS_SIZE(1));
@@ -657,16 +634,6 @@ static int restore_zt_context(struct user_ctxs *user)
if (nregs != 1)
return -EINVAL;
/*
* Careful: we are about __copy_from_user() directly into
* thread.zt_state with preemption enabled, so protection is
* needed to prevent a racing context switch from writing stale
* registers back over the new data.
*/
fpsimd_flush_task_state(current);
/* From now, fpsimd_thread_switch() won't touch ZT in thread state */
err = __copy_from_user(thread_zt_state(&current->thread),
(char __user const *)user->zt +
ZT_SIG_REGS_OFFSET,
@@ -929,6 +896,8 @@ static int restore_sigframe(struct pt_regs *regs,
*/
forget_syscall(regs);
fpsimd_save_and_flush_current_state();
err |= !valid_user_regs(&regs->user_regs, current);
if (err == 0)
err = parse_user_sigframe(&user, sf);
@@ -1280,21 +1249,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
/* Signal handlers are invoked with ZA and streaming mode disabled */
if (system_supports_sme()) {
/*
* If we were in streaming mode the saved register
* state was SVE but we will exit SM and use the
* FPSIMD register state - flush the saved FPSIMD
* register state in case it gets loaded.
*/
if (current->thread.svcr & SVCR_SM_MASK) {
memset(&current->thread.uw.fpsimd_state, 0,
sizeof(current->thread.uw.fpsimd_state));
current->thread.fp_type = FP_STATE_FPSIMD;
}
current->thread.svcr &= ~(SVCR_ZA_MASK |
SVCR_SM_MASK);
sme_smstop();
task_smstop_sm(current);
current->thread.svcr &= ~SVCR_ZA_MASK;
write_sysreg_s(0, SYS_TPIDR2_EL0);
}
if (ka->sa.sa_flags & SA_RESTORER)
@@ -1313,7 +1270,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
struct user_access_state ua_state;
int err = 0;
fpsimd_signal_preserve_current_state();
fpsimd_save_and_flush_current_state();
if (get_sigframe(&user, ksig, regs))
return 1;

View File

@@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
* Note that this also saves V16-31, which aren't visible
* in AArch32.
*/
fpsimd_signal_preserve_current_state();
fpsimd_save_and_flush_current_state();
/* Place structure header on the stack */
__put_user_error(magic, &frame->magic, err);
@@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK;
fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
if (err)
return -EFAULT;
/*
* We don't need to touch the exception register, so
* reload the hardware state.
*/
if (!err)
fpsimd_update_current_state(&fpsimd);
fpsimd_save_and_flush_current_state();
current->thread.uw.fpsimd_state = fpsimd;
return err ? -EFAULT : 0;
return 0;
}
static int compat_restore_sigframe(struct pt_regs *regs,

View File

@@ -2556,14 +2556,6 @@ static void finalize_init_hyp_mode(void)
per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
kern_hyp_va(sve_state);
}
} else {
for_each_possible_cpu(cpu) {
struct user_fpsimd_state *fpsimd_state;
fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs;
per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state =
kern_hyp_va(fpsimd_state);
}
}
}

View File

@@ -54,43 +54,16 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
if (!system_supports_fpsimd())
return;
fpsimd_kvm_prepare();
/*
* We will check TIF_FOREIGN_FPSTATE just before entering the
* guest in kvm_arch_vcpu_ctxflush_fp() and override this to
* FP_STATE_FREE if the flag set.
* Ensure that any host FPSIMD/SVE/SME state is saved and unbound such
* that the host kernel is responsible for restoring this state upon
* return to userspace, and the hyp code doesn't need to save anything.
*
* When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures
* that PSTATE.{SM,ZA} == {0,0}.
*/
*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
*host_data_ptr(fpsimd_state) = kern_hyp_va(&current->thread.uw.fpsimd_state);
*host_data_ptr(fpmr_ptr) = kern_hyp_va(&current->thread.uw.fpmr);
vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
if (system_supports_sme()) {
vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SME_ENABLED);
/*
* If PSTATE.SM is enabled then save any pending FP
* state and disable PSTATE.SM. If we leave PSTATE.SM
* enabled and the guest does not enable SME via
* CPACR_EL1.SMEN then operations that should be valid
* may generate SME traps from EL1 to EL1 which we
* can't intercept and which would confuse the guest.
*
* Do the same for PSTATE.ZA in the case where there
* is state in the registers which has not already
* been saved, this is very unlikely to happen.
*/
if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
*host_data_ptr(fp_owner) = FP_STATE_FREE;
fpsimd_save_and_flush_cpu_state();
}
}
fpsimd_save_and_flush_cpu_state();
*host_data_ptr(fp_owner) = FP_STATE_FREE;
/*
* If normal guests gain SME support, maintain this behavior for pKVM
@@ -162,52 +135,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
local_irq_save(flags);
/*
* If we have VHE then the Hyp code will reset CPACR_EL1 to
* the default value and we need to reenable SME.
*/
if (has_vhe() && system_supports_sme()) {
/* Also restore EL0 state seen on entry */
if (vcpu_get_flag(vcpu, HOST_SME_ENABLED))
sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN);
else
sysreg_clear_set(CPACR_EL1,
CPACR_EL1_SMEN_EL0EN,
CPACR_EL1_SMEN_EL1EN);
isb();
}
if (guest_owns_fp_regs()) {
if (vcpu_has_sve(vcpu)) {
u64 zcr = read_sysreg_el1(SYS_ZCR);
/*
* If the vCPU is in the hyp context then ZCR_EL1 is
* loaded with its vEL2 counterpart.
*/
__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr;
/*
* Restore the VL that was saved when bound to the CPU,
* which is the maximum VL for the guest. Because the
* layout of the data when saving the sve state depends
* on the VL, we need to use a consistent (i.e., the
* maximum) VL.
* Note that this means that at guest exit ZCR_EL1 is
* not necessarily the same as on guest entry.
*
* ZCR_EL2 holds the guest hypervisor's VL when running
* a nested guest, which could be smaller than the
* max for the vCPU. Similar to above, we first need to
* switch to a VL consistent with the layout of the
* vCPU's SVE state. KVM support for NV implies VHE, so
* using the ZCR_EL1 alias is safe.
*/
if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)))
sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
SYS_ZCR_EL1);
}
/*
* Flush (save and invalidate) the fpsimd/sve state so that if
* the host tries to use fpsimd/sve, it's not using stale data
@@ -219,18 +147,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
* when needed.
*/
fpsimd_save_and_flush_cpu_state();
} else if (has_vhe() && system_supports_sve()) {
/*
* The FPSIMD/SVE state in the CPU has not been touched, and we
* have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
* reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
* for EL0. To avoid spurious traps, restore the trap state
* seen by kvm_arch_vcpu_load_fp():
*/
if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED))
sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN);
else
sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0);
}
local_irq_restore(flags);

View File

@@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN
alternative_else_nop_endif
mrs x1, isr_el1
cbz x1, 1f
// Ensure that __guest_enter() always provides a context
// synchronization event so that callers don't need ISBs for anything
// that would usually be synchonized by the ERET.
isb
mov x0, #ARM_EXCEPTION_IRQ
ret

View File

@@ -327,7 +327,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault);
}
static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
{
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2);
@@ -376,7 +376,86 @@ static inline void __hyp_sve_save_host(void)
true);
}
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu);
static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
{
u64 zcr_el1, zcr_el2;
if (!guest_owns_fp_regs())
return;
if (vcpu_has_sve(vcpu)) {
/* A guest hypervisor may restrict the effective max VL. */
if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
else
zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
write_sysreg_el2(zcr_el2, SYS_ZCR);
zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu));
write_sysreg_el1(zcr_el1, SYS_ZCR);
}
}
static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
{
u64 zcr_el1, zcr_el2;
if (!guest_owns_fp_regs())
return;
/*
* When the guest owns the FP regs, we know that guest+hyp traps for
* any FPSIMD/SVE/SME features exposed to the guest have been disabled
* by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd()
* prior to __guest_entry(). As __guest_entry() guarantees a context
* synchronization event, we don't need an ISB here to avoid taking
* traps for anything that was exposed to the guest.
*/
if (vcpu_has_sve(vcpu)) {
zcr_el1 = read_sysreg_el1(SYS_ZCR);
__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1;
/*
* The guest's state is always saved using the guest's max VL.
* Ensure that the host has the guest's max VL active such that
* the host can save the guest's state lazily, but don't
* artificially restrict the host to the guest's max VL.
*/
if (has_vhe()) {
zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
write_sysreg_el2(zcr_el2, SYS_ZCR);
} else {
zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1;
write_sysreg_el2(zcr_el2, SYS_ZCR);
zcr_el1 = vcpu_sve_max_vq(vcpu) - 1;
write_sysreg_el1(zcr_el1, SYS_ZCR);
}
}
}
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
{
/*
* Non-protected kvm relies on the host restoring its sve state.
* Protected kvm restores the host's sve state as not to reveal that
* fpsimd was used by a guest nor leak upper sve bits.
*/
if (system_supports_sve()) {
__hyp_sve_save_host();
/* Re-enable SVE traps if not supported for the guest vcpu. */
if (!vcpu_has_sve(vcpu))
cpacr_clear_set(CPACR_ELx_ZEN, 0);
} else {
__fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
}
if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
*host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR);
}
/*
* We trap the first access to the FP/SIMD to save the host context and
@@ -384,7 +463,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu);
* If FP/SIMD is not implemented, handle the trap and inject an undefined
* instruction exception to the guest. Similarly for trapped SVE accesses.
*/
static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
{
bool sve_guest;
u8 esr_ec;
@@ -426,7 +505,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
isb();
/* Write out the host state if it's in the registers */
if (host_owns_fp_regs())
if (is_protected_kvm_enabled() && host_owns_fp_regs())
kvm_hyp_save_fpsimd_host(vcpu);
/* Restore the guest state */
@@ -575,7 +654,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
return true;
}
static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
@@ -595,7 +674,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
@@ -604,19 +683,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu,
u64 *exit_code)
{
if (!__populate_fault_info(vcpu))
return true;
return false;
}
static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
__alias(kvm_hyp_handle_memory_fault);
static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
__alias(kvm_hyp_handle_memory_fault);
#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault
#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault
static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (kvm_hyp_handle_memory_fault(vcpu, exit_code))
return true;
@@ -646,23 +724,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
* Returns true if the hypervisor handled the exit, and control should go back
* to the guest, or false if it hasn't.
*/
static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
const exit_handler_fn *handlers)
{
const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
exit_handler_fn fn;
fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)];
if (fn)
return fn(vcpu, exit_code);
@@ -692,20 +763,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code
* the guest, false when we should restore the host state and return to the
* main run loop.
*/
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
const exit_handler_fn *handlers)
{
/*
* Save PSTATE early so that we can evaluate the vcpu mode
* early on.
*/
synchronize_vcpu_pstate(vcpu, exit_code);
/*
* Check whether we want to repaint the state one way or
* another.
*/
early_exit_filter(vcpu, exit_code);
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
@@ -735,7 +795,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
goto exit;
/* Check if there's an exit handler and allow it to handle the exit. */
if (kvm_hyp_handle_exit(vcpu, exit_code))
if (kvm_hyp_handle_exit(vcpu, exit_code, handlers))
goto guest;
exit:
/* Return to the host kernel and handle the exit */

View File

@@ -69,7 +69,7 @@ struct kvm_iommu_ops {
phys_addr_t (*iova_to_phys)(struct kvm_hyp_iommu_domain *domain, unsigned long iova);
void (*iotlb_sync)(struct kvm_hyp_iommu_domain *domain,
struct iommu_iotlb_gather *gather);
bool (*dabt_handler)(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr);
bool (*dabt_handler)(struct user_pt_regs *regs, u64 esr, u64 addr);
void (*host_stage2_idmap)(struct kvm_hyp_iommu_domain *domain,
phys_addr_t start, phys_addr_t end, int prot);
void (*host_stage2_idmap_complete)(bool map);

View File

@@ -10,6 +10,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm_module.h>
#include <asm/virt.h>
#include <nvhe/memory.h>
#include <nvhe/pkvm.h>
@@ -45,6 +46,7 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int ___pkvm_host_donate_hyp(u64 pfn, u64 nr_pages, bool accept_mmio);
int ___pkvm_host_donate_hyp_prot(u64 pfn, u64 nr_pages,
bool accept_mmio, enum kvm_pgtable_prot prot);
int __pkvm_host_donate_sglist_hyp(struct pkvm_sglist_page *sglist, size_t nr_pages);
int __pkvm_host_donate_hyp_locked(u64 pfn, u64 nr_pages, enum kvm_pgtable_prot prot);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_guest_share_hyp_page(struct pkvm_hyp_vcpu *vcpu, u64 ipa, u64 *hyp_va);

View File

@@ -1,4 +1,6 @@
#include <asm/kvm_pgtable.h>
#include <linux/kvm_host.h>
#include <linux/arm-smccc.h>
#define HCALL_HANDLED 0
#define HCALL_UNHANDLED -1
@@ -19,6 +21,8 @@ int handle_host_dynamic_hcall(struct user_pt_regs *regs, int id);
void __pkvm_close_module_registration(void);
bool module_handle_host_perm_fault(struct user_pt_regs *regs, u64 esr, u64 addr);
bool module_handle_host_smc(struct user_pt_regs *regs);
bool module_handle_guest_smc(struct arm_smccc_1_2_regs *regs, struct arm_smccc_1_2_regs *res,
pkvm_handle_t handle);
#else
static inline int __pkvm_init_module(void *module_init) { return -EOPNOTSUPP; }
static inline int
@@ -31,4 +35,9 @@ handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt, int id)
static inline void __pkvm_close_module_registration(void) { }
bool module_handle_host_perm_fault(struct user_pt_regs *regs, u64 esr, u64 addr) { return false; }
bool module_handle_host_smc(struct user_pt_regs *regs) { return false; }
bool module_handle_guest_smc(struct arm_smccc_1_2_regs *regs, struct arm_smccc_1_2_regs *res,
pkvm_handle_t handle)
{
return false;
}
#endif

View File

@@ -143,6 +143,8 @@ void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_smc64(struct kvm_vcpu *vcpu, u64 *exit_code);
struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr);
static inline bool pkvm_hyp_vm_has_pvmfw(struct pkvm_hyp_vm *vm)

View File

@@ -10,7 +10,8 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o \
serial.o alloc_mgt.o iommu/iommu.o power/hvc.o power/scmi.o device/device.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o iommu/pviommu-host.o iommu/pviommu.o
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o iommu/pviommu-host.o iommu/pviommu.o \
../../../kernel/smccc-call.o
hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
hyp-obj-$(CONFIG_TRACING) += clock.o events.o trace.o
hyp-obj-$(CONFIG_PROTECTED_NVHE_FTRACE) += ftrace.o

View File

@@ -31,6 +31,9 @@ int pkvm_init_devices(void)
size_t dev_sz;
int ret;
if (!registered_devices_nr)
return 0;
registered_devices = kern_hyp_va(registered_devices);
dev_sz = PAGE_ALIGN(size_mul(sizeof(struct pkvm_device),
registered_devices_nr));
@@ -277,8 +280,9 @@ bool pkvm_device_request_mmio(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code)
u64 token;
s8 level;
/* arg2 and arg3 reserved for future use. */
if (smccc_get_arg2(vcpu) || smccc_get_arg3(vcpu) || !PAGE_ALIGNED(ipa))
/* args 2..6 reserved for future use. */
if (smccc_get_arg2(vcpu) || smccc_get_arg3(vcpu) || smccc_get_arg4(vcpu) ||
smccc_get_arg5(vcpu) || smccc_get_arg6(vcpu) || !PAGE_ALIGNED(ipa))
goto out_inval;
ret = pkvm_get_guest_pa_request(hyp_vcpu, ipa, PAGE_SIZE,

View File

@@ -27,6 +27,7 @@
*/
#include <linux/arm_ffa.h>
#include <asm/kvm_hypevents.h>
#include <asm/kvm_pkvm.h>
#include <kvm/arm_hypercalls.h>
@@ -1107,8 +1108,7 @@ out_unlock:
hyp_spin_unlock(&kvm_ffa_hyp_lock);
}
static void do_ffa_direct_msg(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt,
static void do_ffa_direct_msg(struct kvm_cpu_context *ctxt,
u64 vm_handle)
{
DECLARE_REG(u32, func_id, ctxt, 0);
@@ -1120,14 +1120,38 @@ static void do_ffa_direct_msg(struct arm_smccc_res *res,
DECLARE_REG(u32, w6, ctxt, 6);
DECLARE_REG(u32, w7, ctxt, 7);
struct arm_smccc_1_2_regs req, resp;
if (FIELD_GET(FFA_SRC_ENDPOINT_MASK, endp) != vm_handle) {
ffa_to_smccc_res(res, FFA_RET_INVALID_PARAMETERS);
resp = (struct arm_smccc_1_2_regs) {
.a0 = FFA_ERROR,
.a2 = FFA_RET_INVALID_PARAMETERS,
};
return;
}
arm_smccc_1_1_smc(func_id, endp, msg_flags, w3,
w4, w5, w6, w7,
res);
req = (struct arm_smccc_1_2_regs) {
.a0 = func_id,
.a1 = endp,
.a2 = msg_flags,
.a3 = w3,
.a4 = w4,
.a5 = w5,
.a6 = w6,
.a7 = w7,
};
/*
* In case SMCCC 1.2 is not supported we should preserve the
* host registers.
*/
memcpy(&resp, &ctxt->regs.regs[0], sizeof(resp));
__hyp_exit();
arm_smccc_1_2_smc(&req, &resp);
__hyp_enter();
memcpy(&ctxt->regs.regs[0], &resp, sizeof(resp));
}
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
@@ -1198,8 +1222,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
goto out_handled;
case FFA_MSG_SEND_DIRECT_REQ:
case FFA_FN64_MSG_SEND_DIRECT_REQ:
do_ffa_direct_msg(&res, host_ctxt, HOST_FFA_ID);
goto out_handled;
do_ffa_direct_msg(host_ctxt, HOST_FFA_ID);
return true;
}
if (ffa_call_supported(func_id))
@@ -1273,8 +1297,8 @@ bool kvm_guest_ffa_handler(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code)
goto out_guest;
case FFA_MSG_SEND_DIRECT_REQ:
case FFA_FN64_MSG_SEND_DIRECT_REQ:
do_ffa_direct_msg(&res, ctxt, hyp_vcpu_to_ffa_handle(hyp_vcpu));
goto out_guest;
do_ffa_direct_msg(ctxt, hyp_vcpu_to_ffa_handle(hyp_vcpu));
return true;
default:
ret = -EOPNOTSUPP;
break;

View File

@@ -7,6 +7,7 @@
#include <kvm/arm_hypercalls.h>
#include <hyp/adjust_pc.h>
#include <hyp/switch.h>
#include <asm/pgtable-types.h>
#include <asm/kvm_asm.h>
@@ -545,7 +546,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
if (system_supports_sve())
__hyp_sve_restore_host();
else
__fpsimd_restore_state(*host_data_ptr(fpsimd_state));
__fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs));
if (has_fpmr)
write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR);
@@ -900,30 +901,6 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code)
hyp_vcpu->exit_code = *exit_code;
}
/*
 * Restore the host's FP/SIMD register state after a guest has used the FP
 * unit, handing FP ownership back to the host.
 */
static void fpsimd_host_restore(void)
{
	/* Enable FP/SVE access at EL2 before touching the FP registers. */
	cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN);
	isb();

	if (unlikely(is_protected_kvm_enabled())) {
		struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();

		/* Save the guest's FP/SVE state before clobbering the registers. */
		if (vcpu_has_sve(&hyp_vcpu->vcpu))
			__hyp_sve_save_guest(&hyp_vcpu->vcpu);
		else
			__fpsimd_save_state(&hyp_vcpu->vcpu.arch.ctxt.fp_regs);

		__fpsimd_restore_state(*host_data_ptr(fpsimd_state));

		/* The host now owns the FP registers again. */
		*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
	}

	/* Restore the maximum host SVE vector length at EL2. */
	if (system_supports_sve())
		sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1,
				       SYS_ZCR_EL2);
}
static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
@@ -952,8 +929,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
*last_ran = hyp_vcpu->vcpu.vcpu_id;
}
*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) {
/* Propagate WFx trapping flags */
hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI);
@@ -972,9 +947,6 @@ static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
if (hyp_vcpu) {
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
if (guest_owns_fp_regs())
fpsimd_host_restore();
if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) &&
!vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) {
__sync_hyp_vcpu(hyp_vcpu);
@@ -995,9 +967,6 @@ static void handle___pkvm_vcpu_sync_state(struct kvm_cpu_context *host_ctxt)
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
return;
if (guest_owns_fp_regs())
fpsimd_host_restore();
__sync_hyp_vcpu(hyp_vcpu);
}
@@ -1064,17 +1033,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
goto out;
flush_hyp_vcpu(hyp_vcpu);
ret = __kvm_vcpu_run(&hyp_vcpu->vcpu);
sync_hyp_vcpu(hyp_vcpu, &ret);
/* Trap host fpsimd/sve if the guest has used fpsimd/sve. */
if (guest_owns_fp_regs())
cpacr_clear_set(CPACR_ELx_FPEN | CPACR_ELx_ZEN, 0);
} else {
/* The host is fully trusted, run its vCPU directly. */
fpsimd_lazy_switch_to_guest(host_vcpu);
ret = __kvm_vcpu_run(host_vcpu);
fpsimd_lazy_switch_to_host(host_vcpu);
}
out:
cpu_reg(host_ctxt, 1) = ret;
@@ -2038,13 +2003,8 @@ inval:
static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, func_id, host_ctxt, 0);
struct pkvm_hyp_vcpu *hyp_vcpu;
bool handled;
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (hyp_vcpu && guest_owns_fp_regs())
fpsimd_host_restore();
func_id &= ~ARM_SMCCC_CALL_HINTS;
handled = kvm_host_psci_handler(host_ctxt, func_id);
@@ -2079,11 +2039,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
case ESR_ELx_EC_SMC64:
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_FP_ASIMD:
case ESR_ELx_EC_SVE:
case ESR_ELx_EC_SME:
fpsimd_host_restore();
break;
case ESR_ELx_EC_IABT_LOW:
case ESR_ELx_EC_DABT_LOW:
handle_host_mem_abort(host_ctxt);

View File

@@ -178,7 +178,7 @@ void *kvm_iommu_donate_pages_atomic(u8 order)
void kvm_iommu_reclaim_pages_atomic(void *p, u8 order)
{
__kvm_iommu_reclaim_pages(&iommu_atomic_pool, p, order);
hyp_put_page(&iommu_atomic_pool, p);
}
static struct kvm_hyp_iommu_domain *
@@ -274,7 +274,7 @@ int kvm_iommu_init(struct kvm_iommu_ops *ops,
!ops->alloc_domain ||
!ops->free_domain ||
!ops->get_iommu_by_id)
return 0;
return -ENODEV;
ret = hyp_pool_init_empty(&iommu_host_pool, 64);
if (ret)
@@ -609,7 +609,7 @@ bool kvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u64 esr, u64
bool ret = false;
if (kvm_iommu_ops && kvm_iommu_ops->dabt_handler)
ret = kvm_iommu_ops->dabt_handler(host_ctxt, esr, addr);
ret = kvm_iommu_ops->dabt_handler(&host_ctxt->regs, esr, addr);
if (ret)
kvm_skip_host_instr();

View File

@@ -1898,8 +1898,16 @@ static int __pkvm_use_dma_locked(phys_addr_t phys_addr, size_t size,
} else {
/* For VMs, we know if we reach this point the VM has access to the page. */
if (!hyp_vcpu) {
ret = ___host_check_page_state_range(phys_addr, size,
PKVM_PAGE_OWNED, reg, false);
for (i = 0; i < nr_pages; i++) {
enum pkvm_page_state state;
phys_addr_t this_addr = phys_addr + i * PAGE_SIZE;
state = hyp_phys_to_page(this_addr)->host_state;
if (state != PKVM_PAGE_OWNED) {
ret = -EPERM;
break;
}
}
if (ret)
return ret;
}
@@ -2346,6 +2354,83 @@ unlock:
return ret;
}
/*
 * Donate a scatter-gather list of host-owned pages to the hypervisor.
 *
 * Every entry is first validated (no size overflow, backed by memory,
 * host-owned and - under NVHE_EL2_DEBUG - not already mapped at EL2) before
 * any state is changed.  The second pass then maps each entry into the hyp
 * VA space and transfers stage-2 ownership; if a mapping fails, all
 * previously donated entries are rolled back to the host.
 *
 * Returns 0 on success or a negative error code.
 */
int __pkvm_host_donate_sglist_hyp(struct pkvm_sglist_page *sglist, size_t nr_pages)
{
	int p, ret = 0;	/* initialised: an empty sglist must return success */

	host_lock_component();
	hyp_lock_component();

	/* Checking we are reading hyp private memory */
	if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG))
		WARN_ON(__hyp_check_page_state_range((u64)sglist, nr_pages * sizeof(*sglist),
						     PKVM_PAGE_OWNED));

	/* Pass 1: validate every entry before touching any page state. */
	for (p = 0; p < nr_pages; p++) {
		u64 phys = hyp_pfn_to_phys(sglist[p].pfn);
		size_t size;

		if (check_shl_overflow(PAGE_SIZE, sglist[p].order, &size)) {
			ret = -EINVAL;
			goto unlock;
		}

		if (!addr_is_memory(phys)) {
			ret = -EINVAL;
			goto unlock;
		}

		ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
		if (ret)
			goto unlock;

		if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
			ret = __hyp_check_page_state_range((u64)__hyp_va(phys), size, PKVM_NOPAGE);
			if (ret)
				goto unlock;
		}
	}

	/* Pass 2: map each entry at EL2 and transfer ownership to hyp. */
	for (p = 0; p < nr_pages; p++) {
		size_t size = PAGE_SIZE << sglist[p].order;
		u64 phys = hyp_pfn_to_phys(sglist[p].pfn);
		enum kvm_pgtable_prot prot;

		prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
		ret = pkvm_create_mappings_locked(__hyp_va(phys), __hyp_va(phys) + size, prot);
		if (ret) {
			/* Only allocation failure is expected here. */
			WARN_ON(ret != -ENOMEM);
			kvm_iommu_host_stage2_idmap_complete(false);
			/* Rollback: return the already-donated entries to the host. */
			for (; p >= 0; p--) {
				phys = hyp_pfn_to_phys(sglist[p].pfn);
				size = PAGE_SIZE << sglist[p].order;
				WARN_ON(host_stage2_idmap_locked(phys, size,
								 PKVM_HOST_MEM_PROT, false));
				kvm_iommu_host_stage2_idmap(phys, phys + size, PKVM_HOST_MEM_PROT);
				pkvm_remove_mappings_locked(__hyp_va(phys), __hyp_va(phys) + size);
			}
			kvm_iommu_host_stage2_idmap_complete(true);
			break;
		}

		WARN_ON(__host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP, true, 0, false));
		kvm_iommu_host_stage2_idmap(phys, phys + size, 0);
	}

	kvm_iommu_host_stage2_idmap_complete(false);
unlock:
	hyp_unlock_component();
	host_unlock_component();

	return ret;
}
void hyp_poison_page(phys_addr_t phys, size_t size)
{
WARN_ON(!PAGE_ALIGNED(size));

View File

@@ -134,6 +134,7 @@ static int __hyp_smp_processor_id(void)
/* Categories of trap handlers that pKVM modules may register. */
enum mod_handler_type {
	HOST_FAULT_HANDLER = 0,		/* faults taken on host accesses */
	HOST_SMC_HANDLER,		/* SMCs issued by the host */
	GUEST_SMC_HANDLER,		/* SMCs issued by guests */
	NUM_MOD_HANDLER_TYPES,		/* number of types - keep last */
};
@@ -180,6 +181,13 @@ static int __register_host_smc_handler(bool (*cb)(struct user_pt_regs *))
return mod_handler_register(HOST_SMC_HANDLER, cb);
}
/*
 * Register a module callback that gets a chance to handle SMCs issued by
 * guests.  The callback receives the guest's SMC registers, a result
 * structure to fill in, and the pKVM handle of the calling VM, and returns
 * true if it handled the call.
 */
static int __register_guest_smc_handler(bool (*cb)(struct arm_smccc_1_2_regs *regs,
						   struct arm_smccc_1_2_regs *res,
						   pkvm_handle_t handle))
{
	return mod_handler_register(GUEST_SMC_HANDLER, cb);
}
bool module_handle_host_perm_fault(struct user_pt_regs *regs, u64 esr, u64 addr)
{
int (*cb)(struct user_pt_regs *regs, u64 esr, u64 addr);
@@ -206,6 +214,21 @@ bool module_handle_host_smc(struct user_pt_regs *regs)
return false;
}
/*
 * Offer a guest SMC to the module-registered guest SMC handlers.
 *
 * @regs:   guest register state at the time of the SMC.
 * @res:    result registers to be filled in by the claiming handler.
 * @handle: pKVM handle identifying the calling VM.
 *
 * Returns true as soon as one handler claims the call, false if none did.
 */
bool module_handle_guest_smc(struct arm_smccc_1_2_regs *regs, struct arm_smccc_1_2_regs *res,
			     pkvm_handle_t handle)
{
	bool (*cb)(struct arm_smccc_1_2_regs *regs, struct arm_smccc_1_2_regs *res,
		   pkvm_handle_t handle);
	int i;

	for_each_mod_handler(GUEST_SMC_HANDLER, cb, i) {
		if (cb(regs, res, handle))
			return true;
	}

	return false;
}
const struct pkvm_module_ops module_ops = {
.create_private_mapping = __pkvm_create_private_mapping,
.alloc_module_va = __pkvm_alloc_module_va,
@@ -229,6 +252,7 @@ const struct pkvm_module_ops module_ops = {
.host_stage2_enable_lazy_pte = host_stage2_enable_lazy_pte,
.host_stage2_disable_lazy_pte = host_stage2_disable_lazy_pte,
.register_host_smc_handler = __register_host_smc_handler,
.register_guest_smc_handler = __register_guest_smc_handler,
.register_default_trap_handler = __pkvm_register_default_trap_handler,
.register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier,
.register_psci_notifier = __pkvm_register_psci_notifier,
@@ -236,6 +260,7 @@ const struct pkvm_module_ops module_ops = {
.register_unmask_serror = __pkvm_register_unmask_serror,
.host_donate_hyp = ___pkvm_host_donate_hyp,
.host_donate_hyp_prot = ___pkvm_host_donate_hyp_prot,
.host_donate_sglist_hyp = __pkvm_host_donate_sglist_hyp,
.hyp_donate_host = __pkvm_hyp_donate_host,
.host_share_hyp = __pkvm_host_share_hyp,
.host_unshare_hyp = __pkvm_host_unshare_hyp,

View File

@@ -18,6 +18,7 @@
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
#include <nvhe/modules.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
#include <nvhe/pviommu.h>
@@ -593,6 +594,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = pvmfw_load_addr;
hyp_vm->kvm.arch.pkvm.ffa_support = READ_ONCE(host_kvm->arch.pkvm.ffa_support);
hyp_vm->kvm.arch.pkvm.smc_forwarded = READ_ONCE(host_kvm->arch.pkvm.smc_forwarded);
hyp_vm->kvm.arch.mmu.last_vcpu_ran = (int __percpu *)last_ran;
memset(last_ran, -1, pkvm_get_last_ran_size());
pkvm_init_features_from_host(hyp_vm, host_kvm);
@@ -1676,6 +1678,43 @@ static bool pkvm_forward_trng(struct kvm_vcpu *vcpu)
return true;
}
/*
 * Does @func_id fall in either the SMC32 or the SMC64 standard secure
 * service range (PSCI base up to the CCA function IDs)?
 */
static bool is_standard_secure_service_call(u64 func_id)
{
	if (func_id >= PSCI_0_2_FN_BASE && func_id <= ARM_CCA_FUNC_END)
		return true;

	return func_id >= PSCI_0_2_FN64_BASE && func_id <= ARM_CCA_64BIT_FUNC_END;
}
/*
 * Handle an SMC64 exit from a protected VM.
 *
 * Standard secure service calls (PSCI and friends) and SMCs from VMs
 * without SMC forwarding enabled are declined (return false) so the regular
 * handlers can process them.  Otherwise the call is offered to the
 * module-registered guest SMC handlers: on success the handler's result
 * registers are copied back into the vCPU context, on failure x0 is set to
 * -1 (SMCCC "not supported").  Once forwarding has been attempted the SMC
 * instruction is skipped regardless of the outcome.
 *
 * Returns true if a module handled the SMC.
 */
bool kvm_handle_pvm_smc64(struct kvm_vcpu *vcpu, u64 *exit_code)
{
	bool handled = false;
	struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
	struct pkvm_hyp_vm *vm;
	struct pkvm_hyp_vcpu *hyp_vcpu;
	struct arm_smccc_1_2_regs regs;
	struct arm_smccc_1_2_regs res;
	DECLARE_REG(u64, func_id, ctxt, 0);

	hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
	vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);

	if (is_standard_secure_service_call(func_id))
		return false;

	if (!vm->kvm.arch.pkvm.smc_forwarded)
		return false;

	/* Hand the handlers a copy so they cannot corrupt the vCPU state. */
	memcpy(&regs, &ctxt->regs, sizeof(regs));
	handled = module_handle_guest_smc(&regs, &res, vm->kvm.arch.pkvm.handle);

	if (handled)
		memcpy(&ctxt->regs.regs[0], &res, sizeof(res));
	else
		ctxt->regs.regs[0] = -1;	/* SMCCC "not supported" */

	__kvm_skip_instr(vcpu);
	return handled;
}
/*
* Handler for protected VM HVC calls.
*

View File

@@ -112,6 +112,9 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
if (!guest_owns_fp_regs())
__activate_traps_fpsimd32(vcpu);
if (has_hvhe()) {
val |= CPACR_ELx_TTA;
@@ -120,6 +123,8 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
if (vcpu_has_sve(vcpu))
val |= CPACR_ELx_ZEN;
}
write_sysreg(val, cpacr_el1);
} else {
val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
@@ -134,12 +139,32 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
if (!guest_owns_fp_regs())
val |= CPTR_EL2_TFP;
write_sysreg(val, cptr_el2);
}
}
if (!guest_owns_fp_regs())
__activate_traps_fpsimd32(vcpu);
/*
 * Reset the FP/SVE/SME trap controls to the host's default configuration
 * (no traps for features the CPU implements) on guest exit.
 */
static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
{
	if (has_hvhe()) {
		/* VHE-format (E2H) encoding: setting xEN bits disables traps. */
		u64 val = CPACR_ELx_FPEN;

		if (cpus_have_final_cap(ARM64_SVE))
			val |= CPACR_ELx_ZEN;
		if (cpus_have_final_cap(ARM64_SME))
			val |= CPACR_ELx_SMEN;

		write_sysreg(val, cpacr_el1);
	} else {
		/* nVHE encoding: setting TZ/TSM enables traps, so only trap
		 * the features this CPU does not implement. */
		u64 val = CPTR_NVHE_EL2_RES1;

		if (!cpus_have_final_cap(ARM64_SVE))
			val |= CPTR_EL2_TZ;
		if (!cpus_have_final_cap(ARM64_SME))
			val |= CPTR_EL2_TSM;

		write_sysreg(val, cptr_el2);
	}
}
static void __activate_traps(struct kvm_vcpu *vcpu)
@@ -205,7 +230,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
kvm_reset_cptr_el2(vcpu);
__deactivate_cptr_traps(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
@@ -278,34 +303,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
kvm_handle_pvm_sysreg(vcpu, exit_code));
}
/*
 * Save the host's FP/SIMD (and, where present, SVE and FPMR) state before
 * the guest gets to use the FP unit.
 */
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
{
	/*
	 * Non-protected kvm relies on the host restoring its sve state.
	 * Protected kvm restores the host's sve state as not to reveal that
	 * fpsimd was used by a guest nor leak upper sve bits.
	 */
	if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) {
		__hyp_sve_save_host();

		/* Re-enable SVE traps if not supported for the guest vcpu. */
		if (!vcpu_has_sve(vcpu))
			cpacr_clear_set(CPACR_ELx_ZEN, 0);
	} else {
		__fpsimd_save_state(*host_data_ptr(fpsimd_state));
	}

	if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) {
		u64 val = read_sysreg_s(SYS_FPMR);

		/* pKVM keeps FPMR at hyp; otherwise write through the host's pointer. */
		if (unlikely(is_protected_kvm_enabled()))
			*host_data_ptr(fpmr) = val;
		else
			**host_data_ptr(fpmr_ptr) = val;
	}
}
static const exit_handler_fn hyp_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
@@ -321,6 +318,7 @@ static const exit_handler_fn hyp_exit_handlers[] = {
static const exit_handler_fn pvm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
[ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64,
[ESR_ELx_EC_SMC64] = kvm_handle_pvm_smc64,
[ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
[ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
[ESR_ELx_EC_SME] = kvm_handle_pvm_restricted,
@@ -354,18 +352,25 @@ void vcpu_illegal_trap(struct kvm_vcpu *vcpu, u64 *exit_code)
*exit_code |= ARM_EXCEPTION_IL;
}
/*
* Some guests (e.g., protected VMs) are not be allowed to run in AArch32.
* The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
* guest from dropping to AArch32 EL0 if implemented by the CPU. If the
* hypervisor spots a guest in such a state ensure it is handled, and don't
* trust the host to spot or fix it. The check below is based on the one in
* kvm_arch_vcpu_ioctl_run().
*/
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
/*
 * Synchronise the vCPU's PSTATE view, run the early exit checks, and
 * dispatch to the per-exception-class exit handlers.  Returns the value of
 * __fixup_guest_exit(), i.e. whether the guest should be re-entered.
 */
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
	const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);

	synchronize_vcpu_pstate(vcpu, exit_code);

	/*
	 * Some guests (e.g., protected VMs) are not allowed to run in
	 * AArch32. The ARMv8 architecture does not give the hypervisor a
	 * mechanism to prevent a guest from dropping to AArch32 EL0 if
	 * implemented by the CPU. If the hypervisor spots a guest in such a
	 * state ensure it is handled, and don't trust the host to spot or fix
	 * it. The check below is based on the one in
	 * kvm_arch_vcpu_ioctl_run().
	 */
	if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu)))
		vcpu_illegal_trap(vcpu, exit_code);

	return __fixup_guest_exit(vcpu, exit_code, handlers);
}
/* Switch to the guest for legacy non-VHE systems */

View File

@@ -312,14 +312,6 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code)
return true;
}
/* Save the host's FP/SIMD state (and FPMR if the VM has it) on VHE. */
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
{
	__fpsimd_save_state(*host_data_ptr(fpsimd_state));

	if (kvm_has_fpmr(vcpu->kvm))
		**host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR);
}
static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
{
int ret = -EINVAL;
@@ -434,13 +426,10 @@ static const exit_handler_fn hyp_exit_handlers[] = {
[ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops,
};
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
return hyp_exit_handlers;
}
synchronize_vcpu_pstate(vcpu, exit_code);
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
/*
* If we were in HYP context on entry, adjust the PSTATE view
* so that the usual helpers work correctly.
@@ -460,6 +449,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
*vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
*vcpu_cpsr(vcpu) |= mode;
}
return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}
/* Switch to the guest for VHE systems running in EL2 */
@@ -474,6 +465,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
sysreg_save_host_state_vhe(host_ctxt);
fpsimd_lazy_switch_to_guest(vcpu);
/*
* Note that ARM erratum 1165522 requires us to configure both stage 1
* and stage 2 translation for the guest context before we clear
@@ -498,6 +491,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__deactivate_traps(vcpu);
fpsimd_lazy_switch_to_host(vcpu);
sysreg_restore_host_state_vhe(host_ctxt);
if (guest_owns_fp_regs())

View File

@@ -79,6 +79,7 @@ static bool kvm_smccc_default_allowed(u32 func_id)
case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID:
return true;
default:
/* PSCI 0.2 and up is in the 0:0x1f range */

View File

@@ -84,7 +84,7 @@ int kvm_iommu_init_driver(void)
{
if (!smp_load_acquire(&iommu_driver) || !iommu_driver->get_iommu_id_by_of) {
kvm_err("pKVM enabled without an IOMMU driver, do not run confidential workloads in virtual machines\n");
return 0;
return -ENODEV;
}
kvm_hyp_iommu_domains = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,

View File

@@ -753,6 +753,25 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
kfree(ppage);
}
/*
 * Enable forwarding of guest SMCs to pKVM module handlers for the VM behind
 * @kvm_file.
 *
 * NOTE(review): kvm_get_kvm_safe() takes a reference that is never dropped
 * here - presumably intentional to pin the VM while forwarding is enabled,
 * but worth confirming against the callers.
 *
 * Returns 0 on success, -EINVAL if @kvm_file is not a KVM file or the VM
 * is already being destroyed.
 */
int pkvm_enable_smc_forwarding(struct file *kvm_file)
{
	struct kvm *host_kvm;

	if (!file_is_kvm(kvm_file))
		return -EINVAL;

	/* Fails if the VM's reference count has already reached zero. */
	if (!kvm_get_kvm_safe(kvm_file->private_data))
		return -EINVAL;

	host_kvm = kvm_file->private_data;
	if (!host_kvm)
		return -EINVAL;

	host_kvm->arch.pkvm.smc_forwarded = true;
	return 0;
}
static int __init pkvm_firmware_rmem_err(struct reserved_mem *rmem,
const char *reason)
{
@@ -927,11 +946,28 @@ static int __init early_pkvm_modules_cfg(char *arg)
}
early_param("kvm-arm.protected_modules", early_pkvm_modules_cfg);
static void free_modprobe_argv(struct subprocess_info *info)
/* Cleanup callback for call_usermodehelper_setup(): frees the argv array. */
static void __init free_modprobe_argv(struct subprocess_info *info)
{
	kfree(info->argv);
}
/*
 * Init callback for the early-modprobe usermode helper: redirect the
 * child's standard file descriptors to /dev/kmsg so modprobe output lands
 * in the kernel log.  The three init_dup() calls presumably populate fds
 * 0, 1 and 2 of the fresh child - confirm against init_dup()'s contract.
 */
static int __init init_modprobe(struct subprocess_info *info, struct cred *new)
{
	struct file *file = filp_open("/dev/kmsg", O_RDWR, 0);

	if (IS_ERR(file)) {
		pr_warn("Warning: unable to open /dev/kmsg, modprobe will be silent.\n");
		return 0;	/* non-fatal: run modprobe without output */
	}

	init_dup(file);
	init_dup(file);
	init_dup(file);
	fput(file);

	return 0;
}
/*
* Heavily inspired by request_module(). The latest couldn't be reused though as
* the feature can be disabled depending on umh configuration. Here some
@@ -974,7 +1010,7 @@ static int __init __pkvm_request_early_module(char *module_name,
argv[idx++] = NULL;
info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
NULL, free_modprobe_argv, NULL);
init_modprobe, free_modprobe_argv, NULL);
if (!info)
goto err;

View File

@@ -107,3 +107,23 @@ WORKAROUND_REPEAT_TLBI
WORKAROUND_SPECULATIVE_AT
WORKAROUND_SPECULATIVE_SSBS
WORKAROUND_SPECULATIVE_UNPRIV_LOAD
ANDROID_KABI_RESERVE_01
ANDROID_KABI_RESERVE_02
ANDROID_KABI_RESERVE_03
ANDROID_KABI_RESERVE_04
ANDROID_KABI_RESERVE_05
ANDROID_KABI_RESERVE_06
ANDROID_KABI_RESERVE_07
ANDROID_KABI_RESERVE_08
ANDROID_KABI_RESERVE_09
ANDROID_KABI_RESERVE_10
ANDROID_KABI_RESERVE_11
ANDROID_KABI_RESERVE_12
ANDROID_KABI_RESERVE_13
ANDROID_KABI_RESERVE_14
ANDROID_KABI_RESERVE_15
ANDROID_KABI_RESERVE_16
ANDROID_KABI_RESERVE_17
ANDROID_KABI_RESERVE_18
ANDROID_KABI_RESERVE_19
ANDROID_KABI_RESERVE_20

View File

@@ -105,6 +105,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
/* compatibility flags */
#define MAP_FILE 0

View File

@@ -75,6 +75,9 @@
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
/* compatibility flags */
#define MAP_FILE 0

View File

@@ -70,6 +70,7 @@ CONFIG_NR_CPUS=32
CONFIG_EFI=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure bootconfig"
# CONFIG_CFI_AUTO_DEFAULT is not set
CONFIG_HIBERNATION=y
CONFIG_PM_WAKELOCKS=y
CONFIG_PM_WAKELOCKS_LIMIT=0
@@ -561,6 +562,7 @@ CONFIG_POWERCAP=y
CONFIG_IDLE_INJECT=y
CONFIG_ANDROID_BINDER_IPC=y
CONFIG_ANDROID_BINDERFS=y
CONFIG_ANDROID_BINDER_IPC_RUST=m
CONFIG_ANDROID_VENDOR_HOOKS=y
CONFIG_LIBNVDIMM=y
CONFIG_INTERCONNECT=y
@@ -696,6 +698,8 @@ CONFIG_UBSAN_TRAP=y
CONFIG_PAGE_OWNER=y
CONFIG_PAGE_PINNER=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEM_ALLOC_PROFILING=y
# CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT is not set
CONFIG_KFENCE=y
CONFIG_KFENCE_SAMPLE_INTERVAL=500
CONFIG_KFENCE_NUM_OBJECTS=63

View File

@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/vdso.h>
#include <linux/page_size_compat.h>
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
* its virtual address, and with only one read-only segment.
@@ -16,7 +18,7 @@ SECTIONS
* segment.
*/
vvar_start = . - 4 * PAGE_SIZE;
vvar_start = . - 4 * __MAX_PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -24,9 +26,9 @@ SECTIONS
#include <asm/vvar.h>
#undef EMIT_VVAR
pvclock_page = vvar_start + PAGE_SIZE;
hvclock_page = vvar_start + 2 * PAGE_SIZE;
timens_page = vvar_start + 3 * PAGE_SIZE;
pvclock_page = vvar_start + __MAX_PAGE_SIZE;
hvclock_page = vvar_start + 2 * __MAX_PAGE_SIZE;
timens_page = vvar_start + 3 * __MAX_PAGE_SIZE;
#undef _ASM_X86_VVAR_H
/* Place all vvars in timens too at the offsets in asm/vvar.h. */

View File

@@ -5,6 +5,16 @@
* are built for 32-bit userspace.
*/
/*
* For x86_64 16kB page size emulation
*
* The redefinition is needed here since, vdso2c is a program that runs
* on the host.
*
* It converts the vdso shared lib to a C array.
*/
#define __MAX_PAGE_SIZE 16384
static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
{
size_t i;
@@ -175,7 +185,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
return;
}
mapping_size = (stripped_len + 4095) / 4096 * 4096;
mapping_size = (stripped_len + __MAX_PAGE_SIZE-1) / __MAX_PAGE_SIZE * __MAX_PAGE_SIZE;
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
@@ -184,8 +194,8 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
"static unsigned char raw_data[%lu] __ro_after_init __aligned(%d) = {",
mapping_size, __MAX_PAGE_SIZE);
for (i = 0; i < stripped_len; i++) {
if (i % 10 == 0)
fprintf(outfile, "\n\t");

View File

@@ -32,6 +32,7 @@
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>
#include <linux/page_size_compat.h>
#include <asm/vsyscall.h>
#include <asm/unistd.h>
@@ -284,7 +285,7 @@ static const struct vm_operations_struct gate_vma_ops = {
};
static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_end = VSYSCALL_ADDR + __MAX_PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC,
.vm_ops = &gate_vma_ops,

View File

@@ -1797,6 +1797,8 @@ static __init int pt_init(void)
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
else
pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE;
pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
pt_pmu.pmu.attr_groups = pt_attr_groups;

View File

@@ -5,6 +5,7 @@
/*
* ELF register definitions..
*/
#include <linux/page_size_compat.h>
#include <linux/thread_info.h>
#include <asm/ia32.h>
@@ -228,7 +229,7 @@ extern int force_personality32;
#endif /* !CONFIG_X86_32 */
#define CORE_DUMP_USE_REGSET
#define ELF_EXEC_PAGESIZE 4096
#define ELF_EXEC_PAGESIZE __PAGE_SIZE
/*
* This is the base location for PIE (ET_DYN with INTERP) loads. On

View File

@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page_size_compat.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/prctl.h>
@@ -1006,7 +1007,7 @@ early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_u32_below(8192);
sp -= get_random_u32_below(__PAGE_SIZE << 1);
return sp & ~0xf;
}

View File

@@ -11,6 +11,7 @@
* Copyright 2007 Jiri Kosina, SUSE Labs.
*/
#include <linux/page_size_compat.h>
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
@@ -71,7 +72,7 @@ static unsigned long arch_rnd(unsigned int rndbits)
{
if (!(current->flags & PF_RANDOMIZE))
return 0;
return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
return (get_random_long() & ((1UL << rndbits) - 1)) << __PAGE_SHIFT;
}
unsigned long arch_mmap_rnd(void)

View File

@@ -113,6 +113,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
/* compatibility flags */
#define MAP_FILE 0

View File

@@ -882,8 +882,41 @@ end_io:
}
EXPORT_SYMBOL(submit_bio_noacct);
#ifdef CONFIG_BLK_DEV_ZONED
/**
* blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
* @bio: Bio to examine.
*
* Note: REQ_OP_ZONE_APPEND bios do not require serialization.
* Note: this function treats conventional zones on a zoned block device as
* sequential zones. This is fine since zoned UFS devices have no conventional
* zones.
*/
static bool blk_bio_is_seq_zoned_write(struct bio *bio)
{
if (!bdev_is_zoned(bio->bi_bdev))
return false;
return bio_op(bio) == REQ_OP_WRITE ||
bio_op(bio) == REQ_OP_WRITE_ZEROES;
}
#else
/* Without CONFIG_BLK_DEV_ZONED there are no zoned devices to serialize. */
static bool blk_bio_is_seq_zoned_write(struct bio *bio)
{
	return false;
}
#endif
static void bio_set_ioprio(struct bio *bio)
{
/*
* Do not set the I/O priority of sequential zoned write bios because
* this could lead to reordering by the mq-deadline I/O scheduler and
* hence to unaligned write errors.
*/
if (blk_bio_is_seq_zoned_write(bio))
return;
/* Nobody set ioprio so far? Initialize it based on task's nice value */
if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
bio->bi_ioprio = get_current_ioprio();
@@ -1127,8 +1160,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
return;
plug->cur_ktime = 0;
plug->mq_list = NULL;
plug->cached_rq = NULL;
rq_list_init(&plug->mq_list);
rq_list_init(&plug->cached_rqs);
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
plug->rq_count = 0;
plug->multiple_queues = false;
@@ -1224,7 +1257,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
* queue for cached requests, we don't want a blocked task holding
* up a queue freeze/quiesce event.
*/
if (unlikely(!rq_list_empty(plug->cached_rq)))
if (unlikely(!rq_list_empty(&plug->cached_rqs)))
blk_mq_free_plug_rqs(plug);
plug->cur_ktime = 0;

View File

@@ -218,9 +218,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count,
else
lim.integrity.flags |= flag;
blk_mq_freeze_queue(q);
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
err = queue_limits_commit_update_frozen(q, &lim);
if (err)
return err;
return count;

View File

@@ -1141,7 +1141,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct blk_plug *plug = current->plug;
struct request *rq;
if (!plug || rq_list_empty(plug->mq_list))
if (!plug || rq_list_empty(&plug->mq_list))
return false;
rq_list_for_each(&plug->mq_list, rq) {

View File

@@ -133,6 +133,10 @@ static bool blk_freeze_set_owner(struct request_queue *q,
if (!q->mq_freeze_depth) {
q->mq_freeze_owner = owner;
q->mq_freeze_owner_depth = 1;
q->mq_freeze_disk_dead = !q->disk ||
test_bit(GD_DEAD, &q->disk->state) ||
!blk_queue_registered(q);
q->mq_freeze_queue_dying = blk_queue_dying(q);
return true;
}
@@ -191,7 +195,7 @@ bool __blk_freeze_queue_start(struct request_queue *q,
void blk_freeze_queue_start(struct request_queue *q)
{
if (__blk_freeze_queue_start(q, current))
blk_freeze_acquire_lock(q, false, false);
blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -259,7 +263,7 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
void blk_mq_unfreeze_queue(struct request_queue *q)
{
if (__blk_mq_unfreeze_queue(q, false))
blk_unfreeze_release_lock(q, false, false);
blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
@@ -508,7 +512,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag);
rq_list_add(data->cached_rq, rq);
rq_list_add_head(data->cached_rqs, rq);
nr++;
}
if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -517,7 +521,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
return rq_list_pop(data->cached_rq);
return rq_list_pop(data->cached_rqs);
}
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -614,7 +618,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
.flags = flags,
.cmd_flags = opf,
.nr_tags = plug->nr_ios,
.cached_rq = &plug->cached_rq,
.cached_rqs = &plug->cached_rqs,
};
struct request *rq;
@@ -639,14 +643,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
if (rq_list_empty(plug->cached_rq)) {
if (rq_list_empty(&plug->cached_rqs)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
rq = rq_list_peek(&plug->cached_rq);
rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
@@ -655,7 +659,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
plug->cached_rq = rq_list_next(rq);
rq_list_pop(&plug->cached_rqs);
blk_mq_rq_time_init(rq, 0);
}
@@ -832,7 +836,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
struct request *rq;
while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
blk_mq_free_request(rq);
}
@@ -1388,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
@@ -2801,7 +2804,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
blk_status_t ret = BLK_STS_OK;
while ((rq = rq_list_pop(&plug->mq_list))) {
bool last = rq_list_empty(plug->mq_list);
bool last = rq_list_empty(&plug->mq_list);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2844,8 +2847,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
struct request *requeue_list = NULL;
struct request **requeue_lastp = &requeue_list;
struct rq_list requeue_list = {};
unsigned int depth = 0;
bool is_passthrough = false;
LIST_HEAD(list);
@@ -2859,12 +2861,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
is_passthrough = blk_rq_is_passthrough(rq);
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
is_passthrough != blk_rq_is_passthrough(rq)) {
rq_list_add_tail(&requeue_lastp, rq);
rq_list_add_tail(&requeue_list, rq);
continue;
}
list_add(&rq->queuelist, &list);
list_add_tail(&rq->queuelist, &list);
depth++;
} while (!rq_list_empty(plug->mq_list));
} while (!rq_list_empty(&plug->mq_list));
plug->mq_list = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
@@ -2919,19 +2921,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
if (rq_list_empty(plug->mq_list))
if (rq_list_empty(&plug->mq_list))
return;
}
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug));
if (rq_list_empty(plug->mq_list))
if (rq_list_empty(&plug->mq_list))
return;
}
do {
blk_mq_dispatch_plug_list(plug, from_schedule);
} while (!rq_list_empty(plug->mq_list));
} while (!rq_list_empty(&plug->mq_list));
}
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2996,7 +2998,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
data.cached_rq = &plug->cached_rq;
data.cached_rqs = &plug->cached_rqs;
}
rq = __blk_mq_alloc_requests(&data);
@@ -3019,7 +3021,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
if (!plug)
return NULL;
rq = rq_list_peek(&plug->cached_rq);
rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
@@ -3033,14 +3035,14 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
if (rq_list_pop(&plug->cached_rqs) != rq)
WARN_ON_ONCE(1);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
* before we throttle.
*/
plug->cached_rq = rq_list_next(rq);
rq_qos_throttle(rq->q, bio);
blk_mq_rq_time_init(rq, 0);

View File

@@ -155,7 +155,7 @@ struct blk_mq_alloc_data {
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
struct request **cached_rq;
struct rq_list *cached_rqs;
/* input & output parameter */
struct blk_mq_ctx *ctx;

View File

@@ -434,6 +434,30 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
 * queue_limits_commit_update_frozen - commit queue limits with queue frozen
 * @q: queue to update
 * @lim: limits to apply
 *
 * Apply the limits in @lim that were obtained from queue_limits_start_update()
 * and updated with the new values by the caller to @q, keeping the queue
 * frozen for the duration of the commit so no I/O observes a partial update.
 *
 * Returns 0 on success, or a negative error code on failure.
 */
int queue_limits_commit_update_frozen(struct request_queue *q,
		struct queue_limits *lim)
{
	int error;

	blk_mq_freeze_queue(q);
	error = queue_limits_commit_update(q, lim);
	blk_mq_unfreeze_queue(q);

	return error;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen);
/**
* queue_limits_set - apply queue limits to queue
* @q: queue to update

View File

@@ -25,6 +25,8 @@ struct queue_sysfs_entry {
ssize_t (*show)(struct gendisk *disk, char *page);
int (*load_module)(struct gendisk *disk, const char *page, size_t count);
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
int (*store_limit)(struct gendisk *disk, const char *page,
size_t count, struct queue_limits *lim);
};
static ssize_t
@@ -152,13 +154,11 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0)
QUEUE_SYSFS_SHOW_CONST(write_same_max, 0)
QUEUE_SYSFS_SHOW_CONST(poll_delay, -1)
static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
const char *page, size_t count)
static int queue_max_discard_sectors_store(struct gendisk *disk,
const char *page, size_t count, struct queue_limits *lim)
{
unsigned long max_discard_bytes;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
@@ -170,12 +170,8 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
lim = queue_limits_start_update(disk->queue);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(disk->queue, &lim);
if (err)
return err;
return ret;
lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
return 0;
}
/*
@@ -190,30 +186,24 @@ static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
SECTOR_SHIFT);
}
static ssize_t
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
static int
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
struct queue_limits *lim)
{
unsigned long max_sectors_kb;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
lim = queue_limits_start_update(disk->queue);
lim.max_user_sectors = max_sectors_kb << 1;
err = queue_limits_commit_update(disk->queue, &lim);
if (err)
return err;
return ret;
lim->max_user_sectors = max_sectors_kb << 1;
return 0;
}
static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
size_t count, blk_features_t feature)
size_t count, struct queue_limits *lim, blk_features_t feature)
{
struct queue_limits lim;
unsigned long val;
ssize_t ret;
@@ -221,15 +211,11 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
if (ret < 0)
return ret;
lim = queue_limits_start_update(disk->queue);
if (val)
lim.features |= feature;
lim->features |= feature;
else
lim.features &= ~feature;
ret = queue_limits_commit_update(disk->queue, &lim);
if (ret)
return ret;
return count;
lim->features &= ~feature;
return 0;
}
#define QUEUE_SYSFS_FEATURE(_name, _feature) \
@@ -238,10 +224,10 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
return sprintf(page, "%u\n", \
!!(disk->queue->limits.features & _feature)); \
} \
static ssize_t queue_##_name##_store(struct gendisk *disk, \
const char *page, size_t count) \
static int queue_##_name##_store(struct gendisk *disk, \
const char *page, size_t count, struct queue_limits *lim) \
{ \
return queue_feature_store(disk, page, count, _feature); \
return queue_feature_store(disk, page, count, lim, _feature); \
}
QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
@@ -381,12 +367,10 @@ static ssize_t queue_wc_show(struct gendisk *disk, char *page)
return sprintf(page, "write through\n");
}
static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
size_t count)
static int queue_wc_store(struct gendisk *disk, const char *page,
size_t count, struct queue_limits *lim)
{
struct queue_limits lim;
bool disable;
int err;
if (!strncmp(page, "write back", 10)) {
disable = false;
@@ -397,15 +381,11 @@ static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
return -EINVAL;
}
lim = queue_limits_start_update(disk->queue);
if (disable)
lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
else
lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
err = queue_limits_commit_update(disk->queue, &lim);
if (err)
return err;
return count;
lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
return 0;
}
#define QUEUE_RO_ENTRY(_prefix, _name) \
@@ -421,6 +401,13 @@ static struct queue_sysfs_entry _prefix##_entry = { \
.store = _prefix##_store, \
};
#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show = _prefix##_show, \
.store_limit = _prefix##_store, \
}
#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
@@ -431,7 +418,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
@@ -447,7 +434,7 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
@@ -470,7 +457,7 @@ QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
QUEUE_RW_ENTRY(queue_wc, "write_cache");
QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
QUEUE_RO_ENTRY(queue_fua, "fua");
QUEUE_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
@@ -483,10 +470,10 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.show = queue_logical_block_size_show,
};
QUEUE_RW_ENTRY(queue_rotational, "rotational");
QUEUE_RW_ENTRY(queue_iostats, "iostats");
QUEUE_RW_ENTRY(queue_add_random, "add_random");
QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats");
QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random");
QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes");
#ifdef CONFIG_BLK_WBT
static ssize_t queue_var_store64(s64 *var, const char *page)
@@ -683,7 +670,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->store)
if (!entry->store_limit && !entry->store)
return -EIO;
/*
@@ -697,11 +684,26 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
return res;
}
blk_mq_freeze_queue(q);
if (entry->store_limit) {
struct queue_limits lim = queue_limits_start_update(q);
res = entry->store_limit(disk, page, length, &lim);
if (res < 0) {
queue_limits_cancel_update(q);
return res;
}
res = queue_limits_commit_update_frozen(q, &lim);
if (res)
return res;
return length;
}
mutex_lock(&q->sysfs_lock);
blk_mq_freeze_queue(q);
res = entry->store(disk, page, length);
mutex_unlock(&q->sysfs_lock);
blk_mq_unfreeze_queue(q);
mutex_unlock(&q->sysfs_lock);
return res;
}

View File

@@ -1520,7 +1520,6 @@ static int disk_update_zone_resources(struct gendisk *disk,
unsigned int nr_seq_zones, nr_conv_zones;
unsigned int pool_size;
struct queue_limits lim;
int ret;
disk->nr_zones = args->nr_zones;
disk->zone_capacity = args->zone_capacity;
@@ -1571,11 +1570,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
}
commit:
blk_mq_freeze_queue(q);
ret = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
return ret;
return queue_limits_commit_update_frozen(q, &lim);
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
@@ -1678,6 +1673,25 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
if (zone->start == 0) {
if (zone->len == 0) {
pr_warn("%s: Invalid zero zone size", disk->disk_name);
return -ENODEV;
}
/*
* Non power-of-2 zone size support was added to remove the gap
* between zone capacity and zone size. Though it is technically
* possible to have gaps in a non power-of-2 device, Linux
* requires the zone size to be equal to zone capacity for non
* power-of-2 zoned devices.
*/
if (!is_power_of_2(zone->len) && zone->capacity < zone->len) {
pr_err("%s: Invalid zone capacity %lld with non power-of-2 zone size %lld",
disk->disk_name, zone->capacity, zone->len);
return -ENODEV;
}
}
/*
* All zones must have the same size, with the exception on an eventual
* smaller last zone.
@@ -1753,9 +1767,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
* Checks that the device driver indicated a valid zone size and that
* the max zone append limit is set.
*/
if (!zone_sectors || !is_power_of_2(zone_sectors)) {
pr_warn("%s: Invalid non power of two zone size (%llu)\n",
disk->disk_name, zone_sectors);
if (!zone_sectors) {
pr_warn("%s: Invalid zone size\n", disk->disk_name);
return -ENODEV;
}
@@ -1770,7 +1783,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
* GFP_NOIO was specified.
*/
args.disk = disk;
args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
args.nr_zones = div64_u64(capacity + zone_sectors - 1, zone_sectors);
noio_flag = memalloc_noio_save();
ret = disk_revalidate_zone_resources(disk, args.nr_zones);
if (ret) {

View File

@@ -729,22 +729,29 @@ void blk_integrity_verify(struct bio *bio);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
disk_dead, bool queue_dying)
#ifdef CONFIG_LOCKDEP
static inline void blk_freeze_acquire_lock(struct request_queue *q)
{
if (!disk_dead)
if (!q->mq_freeze_disk_dead)
rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
if (!queue_dying)
if (!q->mq_freeze_queue_dying)
rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
}
static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
disk_dead, bool queue_dying)
static inline void blk_unfreeze_release_lock(struct request_queue *q)
{
if (!queue_dying)
if (!q->mq_freeze_queue_dying)
rwsem_release(&q->q_lockdep_map, _RET_IP_);
if (!disk_dead)
if (!q->mq_freeze_disk_dead)
rwsem_release(&q->io_lockdep_map, _RET_IP_);
}
#else
static inline void blk_freeze_acquire_lock(struct request_queue *q)
{
}
static inline void blk_unfreeze_release_lock(struct request_queue *q)
{
}
#endif
#endif /* BLK_INTERNAL_H */

View File

@@ -601,16 +601,13 @@ void elevator_init_mq(struct request_queue *q)
*
* Disk isn't added yet, so verifying queue lock only manually.
*/
blk_freeze_queue_start_non_owner(q);
blk_freeze_acquire_lock(q, true, false);
blk_mq_freeze_queue_wait(q);
blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
blk_unfreeze_release_lock(q, true, false);
blk_mq_unfreeze_queue_non_owner(q);
blk_mq_unfreeze_queue(q);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "

View File

@@ -4,6 +4,7 @@
#include <linux/percpu.h>
#include <linux/hashtable.h>
#include <linux/android_kabi.h>
#include "blk-mq.h"
struct io_cq;
@@ -48,6 +49,11 @@ struct elevator_mq_ops {
struct request *(*next_request)(struct request_queue *, struct request *);
void (*init_icq)(struct io_cq *);
void (*exit_icq)(struct io_cq *);
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
ANDROID_KABI_RESERVE(3);
ANDROID_KABI_RESERVE(4);
};
#define ELV_NAME_MAX (16)
@@ -83,6 +89,9 @@ struct elevator_type
/* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */
struct list_head list;
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
};
static inline bool elevator_tryget(struct elevator_type *e)

View File

@@ -641,7 +641,7 @@ void del_gendisk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct block_device *part;
unsigned long idx;
bool start_drain, queue_dying;
bool start_drain;
might_sleep();
@@ -670,9 +670,8 @@ void del_gendisk(struct gendisk *disk)
*/
mutex_lock(&disk->open_mutex);
start_drain = __blk_mark_disk_dead(disk);
queue_dying = blk_queue_dying(q);
if (start_drain)
blk_freeze_acquire_lock(q, true, queue_dying);
blk_freeze_acquire_lock(q);
xa_for_each_start(&disk->part_tbl, idx, part, 1)
drop_partition(part);
mutex_unlock(&disk->open_mutex);
@@ -728,7 +727,7 @@ void del_gendisk(struct gendisk *disk)
blk_mq_exit_queue(q);
if (start_drain)
blk_unfreeze_release_lock(q, true, queue_dying);
blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL(del_gendisk);

View File

@@ -1,5 +1,5 @@
BRANCH=android16-6.12
KMI_GENERATION=4
KMI_GENERATION=5
CLANG_VERSION=r536225
RUSTC_VERSION=1.82.0
AARCH64_NDK_TRIPLE=aarch64-linux-android31

View File

@@ -23,6 +23,7 @@ config ACPI_APEI_GHES
select ACPI_HED
select IRQ_WORK
select GENERIC_ALLOCATOR
select ARM_SDE_INTERFACE if ARM64
help
Generic Hardware Error Source provides a way to report
platform hardware errors (such as that from chipset). It

View File

@@ -1612,7 +1612,7 @@ void __init acpi_ghes_init(void)
{
int rc;
sdei_init();
acpi_sdei_init();
if (acpi_disabled)
return;

View File

@@ -37,6 +37,18 @@ config ANDROID_BINDER_DEVICES
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
config ANDROID_BINDER_IPC_RUST
tristate "Rust version of Android Binder IPC Driver"
depends on RUST && ANDROID_BINDER_IPC && ANDROID_BINDERFS
help
This enables the Rust implementation of the Binder driver. At this
time, the Rust implementation can only be built as an alternative to
the C implementation. By default, the C implementation is used, but
if the binder.impl=rust kernel command-line parameter is provided,
then the Rust implementation is used instead.
To build this as a GKI module, choose m.
config ANDROID_BINDER_IPC_SELFTEST
bool "Android Binder IPC Driver Selftest"
depends on ANDROID_BINDER_IPC

View File

@@ -6,3 +6,7 @@ obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o
obj-$(CONFIG_ANDROID_DEBUG_KINFO) += debug_kinfo.o
obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += rust_binder.o
rust_binder-objs := binder/rust_binder.o binder/rust_binderfs.o binder/rust_binder_events.o
rust_binder-objs += binder/rust_binder_hooks.o binder/page_range_helper.o

View File

@@ -0,0 +1,611 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::mem::{size_of, size_of_val, MaybeUninit};
use core::ops::Range;
use kernel::{
bindings,
fs::file::{File, FileDescriptorReservation},
prelude::*,
sync::Arc,
types::{ARef, AsBytes, FromBytes},
uaccess::UserSliceReader,
uapi,
};
use crate::{
deferred_close::DeferredFdCloser,
defs::*,
node::{Node, NodeRef},
process::Process,
DArc,
};
/// Extra state attached to an [`Allocation`] that is carrying a transaction.
///
/// Every field is optional/defaulted so a plain buffer allocation can carry an
/// empty `AllocationInfo`.
#[derive(Default)]
pub(crate) struct AllocationInfo {
    /// Range within the allocation where we can find the offsets to the object descriptors.
    pub(crate) offsets: Option<Range<usize>>,
    /// The target node of the transaction this allocation is associated to.
    /// Not set for replies.
    pub(crate) target_node: Option<NodeRef>,
    /// When this allocation is dropped, call `pending_oneway_finished` on the node.
    ///
    /// This is used to serialize oneway transaction on the same node. Binder guarantees that
    /// oneway transactions to the same node are delivered sequentially in the order they are sent.
    pub(crate) oneway_node: Option<DArc<Node>>,
    /// Zero the data in the buffer on free.
    pub(crate) clear_on_free: bool,
    /// List of files embedded in this transaction.
    file_list: FileList,
}
/// Represents an allocation that the kernel is currently using.
///
/// When allocations are idle, the range allocator holds the data related to them.
///
/// # Invariants
///
/// This allocation corresponds to an allocation in the range allocator, so the relevant pages are
/// marked in use in the page range.
pub(crate) struct Allocation {
    /// Byte offset of this buffer within the owning process's page range.
    pub(crate) offset: usize,
    /// Length of the buffer in bytes; all reads and writes are bounds-checked
    /// against it (see `size_check`).
    size: usize,
    /// Address handed back to `Process::buffer_raw_free` when this drops.
    pub(crate) ptr: usize,
    /// The process whose pages back this allocation.
    pub(crate) process: Arc<Process>,
    /// Transaction-related metadata; `None` for plain buffers.
    allocation_info: Option<AllocationInfo>,
    /// Whether dropping this object should release the underlying range.
    /// Cleared by `keep_alive` when ownership moves back to the process.
    free_on_drop: bool,
    /// Set at construction; records whether oneway spam was detected when this
    /// buffer was allocated (detection itself happens in the caller).
    pub(crate) oneway_spam_detected: bool,
    /// Identifier used in trace events for this buffer.
    #[allow(dead_code)]
    pub(crate) debug_id: usize,
}
impl Allocation {
    /// Creates a new in-use allocation for `process` covering `size` bytes at
    /// `offset` within the process's page range.
    pub(crate) fn new(
        process: Arc<Process>,
        debug_id: usize,
        offset: usize,
        size: usize,
        ptr: usize,
        oneway_spam_detected: bool,
    ) -> Self {
        Self {
            process,
            offset,
            size,
            ptr,
            debug_id,
            oneway_spam_detected,
            allocation_info: None,
            // By default the range is returned to the allocator on drop.
            free_on_drop: true,
        }
    }
    /// Fails with `EFAULT` if `offset..offset+size` overflows or extends past
    /// the end of this allocation.
    fn size_check(&self, offset: usize, size: usize) -> Result {
        let overflow_fail = offset.checked_add(size).is_none();
        let cmp_size_fail = offset.wrapping_add(size) > self.size;
        if overflow_fail || cmp_size_fail {
            return Err(EFAULT);
        }
        Ok(())
    }
    /// Copies `size` bytes from the userspace `reader` into this buffer at
    /// `offset`, bounds-checked.
    pub(crate) fn copy_into(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        self.size_check(offset, size)?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe {
            self.process
                .pages
                .copy_from_user_slice(reader, self.offset + offset, size)
        }
    }
    /// Reads a `T` from this buffer at `offset`, bounds-checked.
    pub(crate) fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        self.size_check(offset, size_of::<T>())?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.read(self.offset + offset) }
    }
    /// Writes `obj` into this buffer at `offset`, bounds-checked.
    pub(crate) fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        self.size_check(offset, size_of_val::<T>(obj))?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.write(self.offset + offset, obj) }
    }
    /// Zeroes the entire buffer.
    pub(crate) fn fill_zero(&self) -> Result {
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.fill_zero(self.offset, self.size) }
    }
    /// Hands the buffer back to the owning process without freeing it,
    /// transferring any allocation info along with it (skips `Drop` cleanup).
    pub(crate) fn keep_alive(mut self) {
        self.process
            .buffer_make_freeable(self.offset, self.allocation_info.take());
        self.free_on_drop = false;
    }
    /// Replaces the allocation info wholesale.
    pub(crate) fn set_info(&mut self, info: AllocationInfo) {
        self.allocation_info = Some(info);
    }
    /// Returns the allocation info, creating a default one if none is present.
    pub(crate) fn get_or_init_info(&mut self) -> &mut AllocationInfo {
        self.allocation_info.get_or_insert_with(Default::default)
    }
    /// Records where the object-descriptor offsets live in this buffer.
    pub(crate) fn set_info_offsets(&mut self, offsets: Range<usize>) {
        self.get_or_init_info().offsets = Some(offsets);
    }
    /// Ties this allocation to `oneway_node` for oneway serialization (see
    /// `AllocationInfo::oneway_node`).
    pub(crate) fn set_info_oneway_node(&mut self, oneway_node: DArc<Node>) {
        self.get_or_init_info().oneway_node = Some(oneway_node);
    }
    /// Requests that the buffer contents be zeroed when freed.
    pub(crate) fn set_info_clear_on_drop(&mut self) {
        self.get_or_init_info().clear_on_free = true;
    }
    /// Records the target node of the transaction using this buffer.
    pub(crate) fn set_info_target_node(&mut self, target_node: NodeRef) {
        self.get_or_init_info().target_node = Some(target_node);
    }
    /// Reserve enough space to push at least `num_fds` fds.
    pub(crate) fn info_add_fd_reserve(&mut self, num_fds: usize) -> Result {
        self.get_or_init_info()
            .file_list
            .files_to_translate
            .reserve(num_fds, GFP_KERNEL)?;
        Ok(())
    }
    /// Queues `file` for fd translation; the reserved fd will later be written
    /// into the buffer at `buffer_offset` by `translate_fds`.
    pub(crate) fn info_add_fd(
        &mut self,
        file: ARef<File>,
        buffer_offset: usize,
        close_on_free: bool,
    ) -> Result {
        self.get_or_init_info().file_list.files_to_translate.push(
            FileEntry {
                file,
                buffer_offset,
                close_on_free,
            },
            GFP_KERNEL,
        )?;
        Ok(())
    }
    /// Sets the list of fds to close when this allocation is freed.
    pub(crate) fn set_info_close_on_free(&mut self, cof: FdsCloseOnFree) {
        self.get_or_init_info().file_list.close_on_free = cof.0;
    }
    /// Reserves an fd for every queued file and writes each reserved fd into
    /// the buffer. The files are not installed into the fd table until
    /// `TranslatedFds::commit` is called.
    pub(crate) fn translate_fds(&mut self) -> Result<TranslatedFds> {
        let file_list = match self.allocation_info.as_mut() {
            Some(info) => &mut info.file_list,
            None => return Ok(TranslatedFds::new()),
        };
        let files = core::mem::take(&mut file_list.files_to_translate);
        let num_close_on_free = files.iter().filter(|entry| entry.close_on_free).count();
        let mut close_on_free = KVec::with_capacity(num_close_on_free, GFP_KERNEL)?;
        let mut reservations = KVec::with_capacity(files.len(), GFP_KERNEL)?;
        for file_info in files {
            let res = FileDescriptorReservation::get_unused_fd_flags(bindings::O_CLOEXEC)?;
            let fd = res.reserved_fd();
            // Publish the reserved fd number at the recorded buffer offset.
            self.write::<u32>(file_info.buffer_offset, &fd)?;
            crate::trace::trace_transaction_fd_recv(self.debug_id, fd, file_info.buffer_offset);
            reservations.push(
                Reservation {
                    res,
                    file: file_info.file,
                },
                GFP_KERNEL,
            )?;
            if file_info.close_on_free {
                close_on_free.push(fd, GFP_KERNEL)?;
            }
        }
        Ok(TranslatedFds {
            reservations,
            close_on_free: FdsCloseOnFree(close_on_free),
        })
    }
    /// Should the looper return to userspace when freeing this allocation?
    pub(crate) fn looper_need_return_on_free(&self) -> bool {
        // Closing fds involves pushing task_work for execution when we return to userspace. Hence,
        // we should return to userspace asap if we are closing fds.
        match self.allocation_info {
            Some(ref info) => !info.file_list.close_on_free.is_empty(),
            None => false,
        }
    }
}
// Dropping an `Allocation` returns its range to the allocator and performs all
// deferred cleanup recorded in its `AllocationInfo`.
impl Drop for Allocation {
    fn drop(&mut self) {
        // `keep_alive` transferred ownership back to the process; nothing to do.
        if !self.free_on_drop {
            return;
        }
        if let Some(mut info) = self.allocation_info.take() {
            // Unblock the next queued oneway transaction on this node, if any.
            if let Some(oneway_node) = info.oneway_node.as_ref() {
                oneway_node.pending_oneway_finished();
            }
            info.target_node = None;
            // Walk the recorded descriptor offsets and release the references
            // taken when the objects were translated into this buffer.
            if let Some(offsets) = info.offsets.clone() {
                let view = AllocationView::new(self, offsets.start);
                for i in offsets.step_by(size_of::<usize>()) {
                    if view.cleanup_object(i).is_err() {
                        pr_warn!("Error cleaning up object at offset {}\n", i)
                    }
                }
            }
            for &fd in &info.file_list.close_on_free {
                let closer = match DeferredFdCloser::new(GFP_KERNEL) {
                    Ok(closer) => closer,
                    Err(kernel::alloc::AllocError) => {
                        // Ignore allocation failures.
                        break;
                    }
                };
                // Here, we ignore errors. The operation can fail if the fd is not valid, or if the
                // method is called from a kthread. However, this is always called from a syscall,
                // so the latter case cannot happen, and we don't care about the first case.
                let _ = closer.close_fd(fd);
            }
            if info.clear_on_free {
                if let Err(e) = self.fill_zero() {
                    pr_warn!("Failed to clear data on free: {:?}", e);
                }
            }
        }
        // Finally return the raw buffer range to the process's allocator.
        self.process.buffer_raw_free(self.ptr);
    }
}
/// A wrapper around `Allocation` that is being created.
///
/// If the allocation is destroyed while wrapped in this wrapper, then the allocation will be
/// considered to be part of a failed transaction. Successful transactions avoid that by calling
/// `success`, which skips the destructor.
#[repr(transparent)]
pub(crate) struct NewAllocation(pub(crate) Allocation);
impl NewAllocation {
    /// Unwraps the inner `Allocation`, skipping the failure trace emitted by
    /// this wrapper's destructor.
    pub(crate) fn success(self) -> Allocation {
        // This skips the destructor.
        //
        // SAFETY: This type is `#[repr(transparent)]`, so the layout matches.
        unsafe { core::mem::transmute(self) }
    }
}
// Allow a `NewAllocation` to be used wherever an `&Allocation` is expected.
impl core::ops::Deref for NewAllocation {
    type Target = Allocation;
    fn deref(&self) -> &Allocation {
        &self.0
    }
}
impl core::ops::DerefMut for NewAllocation {
    fn deref_mut(&mut self) -> &mut Allocation {
        &mut self.0
    }
}
impl Drop for NewAllocation {
    fn drop(&mut self) {
        // Reaching this destructor means `success` was never called, i.e. the
        // transaction failed; record that in the trace log.
        crate::trace::trace_transaction_failed_buffer_release(self.debug_id);
    }
}
/// A view into the beginning of an allocation.
///
/// All attempts to read or write outside of the view will fail. To intentionally access outside of
/// this view, use the `alloc` field of this struct directly.
pub(crate) struct AllocationView<'a> {
    pub(crate) alloc: &'a mut Allocation,
    /// Exclusive upper bound (in bytes) of the accessible region.
    limit: usize,
}
impl<'a> AllocationView<'a> {
    /// Creates a view over the first `limit` bytes of `alloc`.
    pub(crate) fn new(alloc: &'a mut Allocation, limit: usize) -> Self {
        AllocationView { alloc, limit }
    }
    /// Reads a `T` at `offset`, rejecting reads that cross `limit`.
    pub(crate) fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        if offset.checked_add(size_of::<T>()).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.read(offset)
    }
    /// Writes `obj` at `offset`, rejecting writes that cross `limit`.
    pub(crate) fn write<T: AsBytes>(&self, offset: usize, obj: &T) -> Result {
        if offset.checked_add(size_of::<T>()).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.write(offset, obj)
    }
    /// Copies `size` bytes from userspace into the view at `offset`, rejecting
    /// copies that cross `limit`.
    pub(crate) fn copy_into(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        if offset.checked_add(size).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.copy_into(reader, offset, size)
    }
    /// Serializes `node_ref` into the buffer at `offset` in the form the
    /// receiving process expects: a (weak) binder object when the receiver owns
    /// the node, otherwise a (weak) handle to it.
    pub(crate) fn transfer_binder_object(
        &self,
        offset: usize,
        obj: &uapi::flat_binder_object,
        strong: bool,
        node_ref: NodeRef,
    ) -> Result {
        let mut newobj = FlatBinderObject::default();
        let node = node_ref.node.clone();
        if Arc::ptr_eq(&node_ref.node.owner, &self.alloc.process) {
            // The receiving process is the owner of the node, so send it a binder object (instead
            // of a handle).
            let (ptr, cookie) = node.get_id();
            newobj.hdr.type_ = if strong {
                BINDER_TYPE_BINDER
            } else {
                BINDER_TYPE_WEAK_BINDER
            };
            newobj.flags = obj.flags;
            newobj.__bindgen_anon_1.binder = ptr as _;
            newobj.cookie = cookie as _;
            self.write(offset, &newobj)?;
            // Increment the user ref count on the node. It will be decremented as part of the
            // destruction of the buffer, when we see a binder or weak-binder object.
            node.update_refcount(true, 1, strong);
        } else {
            // The receiving process is different from the owner, so we need to insert a handle to
            // the binder object.
            let handle = self
                .alloc
                .process
                .as_arc_borrow()
                .insert_or_update_handle(node_ref, false)?;
            newobj.hdr.type_ = if strong {
                BINDER_TYPE_HANDLE
            } else {
                BINDER_TYPE_WEAK_HANDLE
            };
            newobj.flags = obj.flags;
            newobj.__bindgen_anon_1.handle = handle;
            if self.write(offset, &newobj).is_err() {
                // Decrement ref count on the handle we just created.
                let _ = self
                    .alloc
                    .process
                    .as_arc_borrow()
                    .update_ref(handle, false, strong);
                return Err(EINVAL);
            }
        }
        crate::trace::trace_transaction_node_send(self.alloc.debug_id, &node, obj, &newobj);
        Ok(())
    }
    /// Releases the reference taken when the object whose descriptor offset is
    /// stored at `index_offset` was translated into this buffer (see
    /// `transfer_binder_object`).
    fn cleanup_object(&self, index_offset: usize) -> Result {
        let offset = self.alloc.read(index_offset)?;
        let header = self.read::<BinderObjectHeader>(offset)?;
        match header.type_ {
            BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => {
                let obj = self.read::<FlatBinderObject>(offset)?;
                let strong = header.type_ == BINDER_TYPE_BINDER;
                // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is
                // populated.
                let ptr = unsafe { obj.__bindgen_anon_1.binder };
                let cookie = obj.cookie;
                self.alloc.process.update_node(ptr, cookie, strong);
                Ok(())
            }
            BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => {
                let obj = self.read::<FlatBinderObject>(offset)?;
                let strong = header.type_ == BINDER_TYPE_HANDLE;
                // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is
                // populated.
                let handle = unsafe { obj.__bindgen_anon_1.handle };
                self.alloc
                    .process
                    .as_arc_borrow()
                    .update_ref(handle, false, strong)
            }
            // Other object types need no refcount cleanup here.
            _ => Ok(()),
        }
    }
}
/// A binder object as it is serialized.
///
/// # Invariants
///
/// All bytes must be initialized, and the value of `self.hdr.type_` must be one of the allowed
/// types.
#[repr(C)]
pub(crate) union BinderObject {
hdr: uapi::binder_object_header,
fbo: uapi::flat_binder_object,
fdo: uapi::binder_fd_object,
bbo: uapi::binder_buffer_object,
fdao: uapi::binder_fd_array_object,
}
/// A view into a `BinderObject` that can be used in a match statement.
pub(crate) enum BinderObjectRef<'a> {
Binder(&'a mut uapi::flat_binder_object),
Handle(&'a mut uapi::flat_binder_object),
Fd(&'a mut uapi::binder_fd_object),
Ptr(&'a mut uapi::binder_buffer_object),
Fda(&'a mut uapi::binder_fd_array_object),
}
impl BinderObject {
pub(crate) fn read_from(reader: &mut UserSliceReader) -> Result<BinderObject> {
let object = Self::read_from_inner(|slice| {
let read_len = usize::min(slice.len(), reader.len());
reader.clone_reader().read_slice(&mut slice[..read_len])?;
Ok(())
})?;
// If we used a object type smaller than the largest object size, then we've read more
// bytes than we needed to. However, we used `.clone_reader()` to avoid advancing the
// original reader. Now, we call `skip` so that the caller's reader is advanced by the
// right amount.
//
// The `skip` call fails if the reader doesn't have `size` bytes available. This could
// happen if the type header corresponds to an object type that is larger than the rest of
// the reader.
//
// Any extra bytes beyond the size of the object are inaccessible after this call, so
// reading them again from the `reader` later does not result in TOCTOU bugs.
reader.skip(object.size())?;
Ok(object)
}
/// Use the provided reader closure to construct a `BinderObject`.
///
/// The closure should write the bytes for the object into the provided slice.
pub(crate) fn read_from_inner<R>(reader: R) -> Result<BinderObject>
where
R: FnOnce(&mut [u8; size_of::<BinderObject>()]) -> Result<()>,
{
let mut obj = MaybeUninit::<BinderObject>::zeroed();
// SAFETY: The lengths of `BinderObject` and `[u8; size_of::<BinderObject>()]` are equal,
// and the byte array has an alignment requirement of one, so the pointer cast is okay.
// Additionally, `obj` was initialized to zeros, so the byte array will not be
// uninitialized.
(reader)(unsafe { &mut *obj.as_mut_ptr().cast() })?;
// SAFETY: The entire object is initialized, so accessing this field is safe.
let type_ = unsafe { obj.assume_init_ref().hdr.type_ };
if Self::type_to_size(type_).is_none() {
// The value of `obj.hdr_type_` was invalid.
return Err(EINVAL);
}
// SAFETY: All bytes are initialized (since we zeroed them at the start) and we checked
// that `self.hdr.type_` is one of the allowed types, so the type invariants are satisfied.
unsafe { Ok(obj.assume_init()) }
}
/// View this object as the union variant selected by its header type.
pub(crate) fn as_ref(&mut self) -> BinderObjectRef<'_> {
    use BinderObjectRef::*;
    // SAFETY: The constructor ensures that all bytes of `self` are initialized, and all
    // variants of this union accept all initialized bit patterns.
    unsafe {
        match self.hdr.type_ {
            BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => Binder(&mut self.fbo),
            BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => Handle(&mut self.fbo),
            BINDER_TYPE_FD => Fd(&mut self.fdo),
            BINDER_TYPE_PTR => Ptr(&mut self.bbo),
            BINDER_TYPE_FDA => Fda(&mut self.fdao),
            // SAFETY: By the type invariant, the value of `self.hdr.type_` cannot have any
            // other value than the ones checked above.
            _ => core::hint::unreachable_unchecked(),
        }
    }
}
/// Return the serialized size in bytes of this object, determined by its header type.
pub(crate) fn size(&self) -> usize {
    // SAFETY: The entire object is initialized, so accessing this field is safe.
    let type_ = unsafe { self.hdr.type_ };
    // SAFETY: The type invariants guarantee that the type field is correct.
    unsafe { Self::type_to_size(type_).unwrap_unchecked() }
}
fn type_to_size(type_: u32) -> Option<usize> {
match type_ {
BINDER_TYPE_WEAK_BINDER => Some(size_of::<uapi::flat_binder_object>()),
BINDER_TYPE_BINDER => Some(size_of::<uapi::flat_binder_object>()),
BINDER_TYPE_WEAK_HANDLE => Some(size_of::<uapi::flat_binder_object>()),
BINDER_TYPE_HANDLE => Some(size_of::<uapi::flat_binder_object>()),
BINDER_TYPE_FD => Some(size_of::<uapi::binder_fd_object>()),
BINDER_TYPE_PTR => Some(size_of::<uapi::binder_buffer_object>()),
BINDER_TYPE_FDA => Some(size_of::<uapi::binder_fd_array_object>()),
_ => None,
}
}
}
/// Per-transaction bookkeeping for file descriptors being passed with a transaction.
#[derive(Default)]
struct FileList {
    /// Files that still need a descriptor created in the recipient process.
    files_to_translate: KVec<FileEntry>,
    /// Fd numbers that must be closed when the allocation is freed.
    close_on_free: KVec<u32>,
}

/// A single file to be translated into the recipient process.
struct FileEntry {
    /// The file for which a descriptor will be created in the recipient process.
    file: ARef<File>,
    /// The offset in the buffer where the file descriptor is stored.
    buffer_offset: usize,
    /// Whether this fd should be closed when the allocation is freed.
    close_on_free: bool,
}
/// Fd reservations made for a transaction that have not yet been committed.
pub(crate) struct TranslatedFds {
    /// Reserved fd slots paired with the files that will be installed into them.
    reservations: KVec<Reservation>,
    /// If commit is called, then these fds should be closed. (If commit is not called, then they
    /// shouldn't be closed.)
    close_on_free: FdsCloseOnFree,
}

/// A reserved fd slot together with the file that will be installed into it on commit.
struct Reservation {
    res: FileDescriptorReservation,
    file: ARef<File>,
}
impl TranslatedFds {
pub(crate) fn new() -> Self {
Self {
reservations: KVec::new(),
close_on_free: FdsCloseOnFree(KVec::new()),
}
}
pub(crate) fn commit(self) -> FdsCloseOnFree {
for entry in self.reservations {
entry.res.fd_install(entry.file);
}
self.close_on_free
}
}
/// A list of fd numbers that should be closed when the corresponding allocation is freed.
pub(crate) struct FdsCloseOnFree(KVec<u32>);

View File

@@ -0,0 +1,183 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
error::Error,
list::{List, ListArc, ListLinks},
prelude::*,
security,
str::{CStr, CString},
sync::{Arc, Mutex},
task::Kuid,
};
use crate::{error::BinderError, node::NodeRef, process::Process};
kernel::sync::global_lock! {
    // SAFETY: We call `init` in the module initializer, so it's initialized before first use.
    pub(crate) unsafe(uninit) static CONTEXTS: Mutex<ContextList> = ContextList {
        list: List::new(),
    };
}

/// The global list of all binder contexts, protected by the `CONTEXTS` mutex.
pub(crate) struct ContextList {
    list: List<Context>,
}
/// Take a snapshot of every context currently registered in the global list.
pub(crate) fn get_all_contexts() -> Result<KVec<Arc<Context>>> {
    let guard = CONTEXTS.lock();
    // Reserve the exact capacity up front so that the pushes below cannot fail due to growth.
    let mut snapshot = KVec::with_capacity(guard.list.iter().count(), GFP_KERNEL)?;
    for ctx in guard.list.iter() {
        snapshot.push(Arc::from(ctx), GFP_KERNEL)?;
    }
    Ok(snapshot)
}
/// This struct keeps track of the processes using this context, and which process is the context
/// manager.
struct Manager {
    /// Reference to the context manager's node, if one has been registered.
    node: Option<NodeRef>,
    /// The euid used when the context manager was first set; later registrations must match.
    uid: Option<Kuid>,
    /// All processes registered with this context.
    all_procs: List<Process>,
}

/// There is one context per binder file (/dev/binder, /dev/hwbinder, etc)
#[pin_data]
pub(crate) struct Context {
    /// Manager state, including the list of processes using this context.
    #[pin]
    manager: Mutex<Manager>,
    /// The name of the binder device this context corresponds to.
    pub(crate) name: CString,
    /// Links used to store this context in the global `CONTEXTS` list.
    #[pin]
    links: ListLinks,
}
// Boilerplate allowing `Context` to be stored in the intrusive `CONTEXTS` linked list.
kernel::list::impl_has_list_links! {
    impl HasListLinks<0> for Context { self.links }
}
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for Context { untracked; }
}
kernel::list::impl_list_item! {
    impl ListItem<0> for Context {
        using ListLinks;
    }
}
impl Context {
    /// Create a new context with the given name and insert it into the global context list.
    pub(crate) fn new(name: &CStr) -> Result<Arc<Self>> {
        let name = CString::try_from(name)?;
        let list_ctx = ListArc::pin_init::<Error>(
            try_pin_init!(Context {
                name,
                links <- ListLinks::new(),
                manager <- kernel::new_mutex!(Manager {
                    all_procs: List::new(),
                    node: None,
                    uid: None,
                }, "Context::manager"),
            }),
            GFP_KERNEL,
        )?;
        let ctx = list_ctx.clone_arc();
        CONTEXTS.lock().list.push_back(list_ctx);
        Ok(ctx)
    }

    /// Called when the file for this context is unlinked.
    ///
    /// No-op if called twice.
    pub(crate) fn deregister(&self) {
        // SAFETY: We never add the context to any other linked list than this one, so it is either
        // in this list, or not in any list.
        unsafe { CONTEXTS.lock().list.remove(self) };
    }

    /// Add `proc` to the list of processes using this context.
    ///
    /// Logs an error and does nothing if `proc` belongs to a different context.
    pub(crate) fn register_process(self: &Arc<Self>, proc: ListArc<Process>) {
        if !Arc::ptr_eq(self, &proc.ctx) {
            pr_err!("Context::register_process called on the wrong context.");
            return;
        }
        self.manager.lock().all_procs.push_back(proc);
    }

    /// Remove `proc` from the list of processes using this context.
    ///
    /// Logs an error and does nothing if `proc` belongs to a different context.
    pub(crate) fn deregister_process(self: &Arc<Self>, proc: &Process) {
        if !Arc::ptr_eq(self, &proc.ctx) {
            pr_err!("Context::deregister_process called on the wrong context.");
            return;
        }
        // SAFETY: We just checked that this is the right list.
        unsafe { self.manager.lock().all_procs.remove(proc) };
    }

    /// Register the context manager node (BINDER_SET_CONTEXT_MGR).
    ///
    /// Fails with `EBUSY` if a manager is already set, and with `EPERM` if a previous manager
    /// registration used a different euid.
    pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result {
        let mut manager = self.manager.lock();
        if manager.node.is_some() {
            pr_warn!("BINDER_SET_CONTEXT_MGR already set");
            return Err(EBUSY);
        }
        security::binder_set_context_mgr(&node_ref.node.owner.cred)?;

        // If the context manager has been set before, ensure that we use the same euid.
        let caller_uid = Kuid::current_euid();
        if let Some(ref uid) = manager.uid {
            if *uid != caller_uid {
                return Err(EPERM);
            }
        }

        manager.node = Some(node_ref);
        manager.uid = Some(caller_uid);
        Ok(())
    }

    /// Clear the context manager node.
    pub(crate) fn unset_manager_node(&self) {
        let node_ref = self.manager.lock().node.take();
        // The manager lock guard is a temporary, so `node_ref` is dropped after the lock has
        // already been released.
        drop(node_ref);
    }

    /// Get a reference to the context manager's node, or a dead-reply error if none is set.
    pub(crate) fn get_manager_node(&self, strong: bool) -> Result<NodeRef, BinderError> {
        self.manager
            .lock()
            .node
            .as_ref()
            .ok_or_else(BinderError::new_dead)?
            .clone(strong)
            .map_err(BinderError::from)
    }

    /// Invoke `func` for every process registered with this context.
    ///
    /// The manager lock is held for the duration of the iteration.
    pub(crate) fn for_each_proc<F>(&self, mut func: F)
    where
        F: FnMut(&Process),
    {
        let lock = self.manager.lock();
        for proc in &lock.all_procs {
            func(&proc);
        }
    }

    /// Take a snapshot of every process registered with this context.
    pub(crate) fn get_all_procs(&self) -> Result<KVec<Arc<Process>>> {
        let lock = self.manager.lock();
        let count = lock.all_procs.iter().count();
        let mut procs = KVec::with_capacity(count, GFP_KERNEL)?;
        for proc in &lock.all_procs {
            procs.push(Arc::from(proc), GFP_KERNEL)?;
        }
        Ok(procs)
    }

    /// Take a snapshot of the processes in this context whose pid matches `pid`.
    pub(crate) fn get_procs_with_pid(&self, pid: i32) -> Result<KVec<Arc<Process>>> {
        let orig = self.get_all_procs()?;
        let mut backing = KVec::with_capacity(orig.len(), GFP_KERNEL)?;
        for proc in orig.into_iter().filter(|proc| proc.task.pid() == pid) {
            backing.push(proc, GFP_KERNEL)?;
        }
        Ok(backing)
    }
}

View File

@@ -0,0 +1,202 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Logic for closing files in a deferred manner.
//!
//! This file could make sense to have in `kernel::fs`, but it was rejected for being too
//! Binder-specific.
use core::mem::MaybeUninit;
use kernel::{
alloc::{AllocError, Flags},
bindings,
prelude::*,
};
/// Helper used for closing file descriptors in a way that is safe even if the file is currently
/// held using `fdget`.
///
/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to
/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`.
pub(crate) struct DeferredFdCloser {
    // Allocated eagerly in `new` so that `close_fd` does not need to allocate.
    inner: KBox<DeferredFdCloserInner>,
}

/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with
/// moving it across threads.
unsafe impl Send for DeferredFdCloser {}
// SAFETY: As above — there is no shared mutable state to synchronize.
unsafe impl Sync for DeferredFdCloser {}

/// # Invariants
///
/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to
/// that file.
///
/// Note: `twork` must remain the first field; `close_fd` relies on a pointer to the struct
/// being a valid pointer to its `callback_head`.
#[repr(C)]
struct DeferredFdCloserInner {
    twork: MaybeUninit<bindings::callback_head>,
    file: *mut bindings::file,
}
impl DeferredFdCloser {
    /// Create a new [`DeferredFdCloser`].
    ///
    /// The inner allocation is performed here so that `close_fd` itself does not allocate.
    pub(crate) fn new(flags: Flags) -> Result<Self, AllocError> {
        Ok(Self {
            // INVARIANT: The `file` pointer is null, so the type invariant does not apply.
            inner: KBox::new(
                DeferredFdCloserInner {
                    twork: MaybeUninit::uninit(),
                    file: core::ptr::null_mut(),
                },
                flags,
            )?,
        })
    }

    /// Schedule a task work that closes the file descriptor when this task returns to userspace.
    ///
    /// Fails if this is called from a context where we cannot run work when returning to
    /// userspace. (E.g., from a kthread.)
    pub(crate) fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> {
        use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME;

        // In this method, we schedule the task work before closing the file. This is because
        // scheduling a task work is fallible, and we need to know whether it will fail before we
        // attempt to close the file.

        // Task works are not available on kthreads.
        let current = kernel::current!();

        // Check if this is a kthread.
        // SAFETY: Reading `flags` from a task is always okay.
        if unsafe { ((*current.as_ptr()).flags & bindings::PF_KTHREAD) != 0 } {
            return Err(DeferredFdCloseError::TaskWorkUnavailable);
        }

        // Transfer ownership of the box's allocation to a raw pointer. This disables the
        // destructor, so we must manually convert it back to a KBox to drop it.
        //
        // Until we convert it back to a `KBox`, there are no aliasing requirements on this
        // pointer.
        let inner = KBox::into_raw(self.inner);

        // The `callback_head` field is first in the struct, so this cast correctly gives us a
        // pointer to the field.
        let callback_head = inner.cast::<bindings::callback_head>();
        // SAFETY: This pointer offset operation does not go out-of-bounds.
        let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) };

        let current = current.as_ptr();

        // SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so
        // it is okay for us to perform unsynchronized writes to its `callback_head` field.
        unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) };

        // SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current
        // task. If this operation is successful, then this transfers exclusive ownership of the
        // `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or
        // invalidate the field during that time.
        //
        // When the C side calls `do_close_fd`, the safety requirements of that method are
        // satisfied because when a task work is executed, the callback is given ownership of the
        // pointer.
        //
        // The file pointer is currently null. If it is changed to be non-null before `do_close_fd`
        // is called, then that change happens due to the write at the end of this function, and
        // that write has a safety comment that explains why the refcount can be dropped when
        // `do_close_fd` runs.
        let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) };
        if res != 0 {
            // SAFETY: Scheduling the task work failed, so we still have ownership of the box, so
            // we may destroy it.
            unsafe { drop(KBox::from_raw(inner)) };
            return Err(DeferredFdCloseError::TaskWorkUnavailable);
        }

        // This removes the fd from the fd table in `current`. The file is not fully closed until
        // `filp_close` is called. We are given ownership of one refcount to the file.
        //
        // SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the
        // pointer is non-null), then we call `filp_close` on the returned pointer as required by
        // `file_close_fd`.
        let file = unsafe { bindings::file_close_fd(fd) };
        if file.is_null() {
            // We don't clean up the task work since that might be expensive if the task work queue
            // is long. Just let it execute and let it clean up for itself.
            return Err(DeferredFdCloseError::BadFd);
        }

        // Acquire a second refcount to the file.
        //
        // SAFETY: The `file` pointer points at a file with a non-zero refcount.
        unsafe { bindings::get_file(file) };

        // This method closes the fd, consuming one of our two refcounts. There could be active
        // light refcounts created from that fd, so we must ensure that the file has a positive
        // refcount for the duration of those active light refcounts. We do that by holding on to
        // the second refcount until the current task returns to userspace.
        //
        // SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close
        // it in is correct, since we just got the `fd` from `file_close_fd` which also uses
        // `current->files`.
        //
        // Note: fl_owner_t is currently a void pointer.
        unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) };

        // We update the file pointer that the task work is supposed to fput. This transfers
        // ownership of our last refcount.
        //
        // INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to
        // non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we
        // still own a refcount to the file, so we can pass ownership of that refcount to the
        // `DeferredFdCloserInner`.
        //
        // When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is
        // the case because all light refcounts that are associated with the fd we closed
        // previously must be dropped when `do_close_fd`, since light refcounts must be dropped
        // before returning to userspace.
        //
        // SAFETY: Task works are executed on the current thread right before we return to
        // userspace, so this write is guaranteed to happen before `do_close_fd` is called, which
        // means that a race is not possible here.
        unsafe { *file_field = file };

        Ok(())
    }

    /// The task work callback: releases the file refcount (if any) and frees the allocation.
    ///
    /// # Safety
    ///
    /// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in
    /// a `KBox`, and the caller must pass exclusive ownership of that `KBox`. Furthermore, if the
    /// file pointer is non-null, then it must be okay to release the refcount by calling `fput`.
    unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) {
        // SAFETY: The caller just passed us ownership of this box.
        let inner = unsafe { KBox::from_raw(inner.cast::<DeferredFdCloserInner>()) };
        if !inner.file.is_null() {
            // SAFETY: By the type invariants, we own a refcount to this file, and the caller
            // guarantees that dropping the refcount now is okay.
            unsafe { bindings::fput(inner.file) };
        }
        // The allocation is freed when `inner` goes out of scope.
    }
}
/// Represents a failure to close an fd in a deferred manner.
///
/// Returned by [`DeferredFdCloser::close_fd`].
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum DeferredFdCloseError {
    /// Closing the fd failed because we were unable to schedule a task work.
    TaskWorkUnavailable,
    /// Closing the fd failed because the fd does not exist.
    BadFd,
}
impl From<DeferredFdCloseError> for Error {
    /// Map each deferred-close failure onto the errno reported to the caller.
    fn from(err: DeferredFdCloseError) -> Error {
        if let DeferredFdCloseError::TaskWorkUnavailable = err {
            ESRCH
        } else {
            EBADF
        }
    }
}

View File

@@ -0,0 +1,182 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::mem::MaybeUninit;
use core::ops::{Deref, DerefMut};
use kernel::{
types::{AsBytes, FromBytes},
uapi::{self, *},
};
/// Re-export a list of uapi constants, stripping the given prefix from each name.
///
/// E.g. `pub_no_prefix!(binder_driver_return_protocol_, BR_OK)` defines `BR_OK` as an alias
/// for `binder_driver_return_protocol_BR_OK`.
macro_rules! pub_no_prefix {
    ($prefix:ident, $($newname:ident),+ $(,)?) => {
        $(pub(crate) const $newname: u32 = kernel::macros::concat_idents!($prefix, $newname);)+
    };
}
// Return codes delivered from the driver to userspace.
pub_no_prefix!(
    binder_driver_return_protocol_,
    BR_TRANSACTION,
    BR_TRANSACTION_SEC_CTX,
    BR_REPLY,
    BR_DEAD_REPLY,
    BR_FAILED_REPLY,
    BR_FROZEN_REPLY,
    BR_NOOP,
    BR_SPAWN_LOOPER,
    BR_TRANSACTION_COMPLETE,
    BR_TRANSACTION_PENDING_FROZEN,
    BR_ONEWAY_SPAM_SUSPECT,
    BR_OK,
    BR_ERROR,
    BR_INCREFS,
    BR_ACQUIRE,
    BR_RELEASE,
    BR_DECREFS,
    BR_DEAD_BINDER,
    BR_CLEAR_DEATH_NOTIFICATION_DONE,
);

// Commands submitted from userspace to the driver.
pub_no_prefix!(
    binder_driver_command_protocol_,
    BC_TRANSACTION,
    BC_TRANSACTION_SG,
    BC_REPLY,
    BC_REPLY_SG,
    BC_FREE_BUFFER,
    BC_ENTER_LOOPER,
    BC_EXIT_LOOPER,
    BC_REGISTER_LOOPER,
    BC_INCREFS,
    BC_ACQUIRE,
    BC_RELEASE,
    BC_DECREFS,
    BC_INCREFS_DONE,
    BC_ACQUIRE_DONE,
    BC_REQUEST_DEATH_NOTIFICATION,
    BC_CLEAR_DEATH_NOTIFICATION,
    BC_DEAD_BINDER_DONE,
);

// Bit shift for the scheduling policy stored in `flat_binder_object::flags`.
pub_no_prefix!(
    flat_binder_object_shifts_,
    FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT
);

// Flag bits for `flat_binder_object::flags`.
pub_no_prefix!(
    flat_binder_object_flags_,
    FLAT_BINDER_FLAG_ACCEPTS_FDS,
    FLAT_BINDER_FLAG_INHERIT_RT,
    FLAT_BINDER_FLAG_PRIORITY_MASK,
    FLAT_BINDER_FLAG_SCHED_POLICY_MASK,
    FLAT_BINDER_FLAG_TXN_SECURITY_CTX
);

// Transaction flags.
pub_no_prefix!(
    transaction_flags_,
    TF_ONE_WAY,
    TF_ACCEPT_FDS,
    TF_CLEAR_BUF,
    TF_UPDATE_TXN
);

// The object type tags keep their uapi names unchanged.
pub(crate) use uapi::{
    BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_FDA, BINDER_TYPE_HANDLE, BINDER_TYPE_PTR,
    BINDER_TYPE_WEAK_BINDER, BINDER_TYPE_WEAK_HANDLE,
};
/// Declare a `#[repr(transparent)]` wrapper around one of the binder uapi structs.
///
/// The wrapper implements byte-level conversion traits and derefs to the wrapped C struct.
macro_rules! decl_wrapper {
    ($newname:ident, $wrapped:ty) => {
        // Define a wrapper around the C type. Use `MaybeUninit` to enforce that the value of
        // padding bytes must be preserved.
        #[derive(Copy, Clone)]
        #[repr(transparent)]
        pub(crate) struct $newname(MaybeUninit<$wrapped>);

        // SAFETY: This macro is only used with types where this is ok.
        unsafe impl FromBytes for $newname {}
        // SAFETY: This macro is only used with types where this is ok.
        unsafe impl AsBytes for $newname {}

        impl Deref for $newname {
            type Target = $wrapped;
            fn deref(&self) -> &Self::Target {
                // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still
                // always be valid.
                unsafe { self.0.assume_init_ref() }
            }
        }

        impl DerefMut for $newname {
            fn deref_mut(&mut self) -> &mut Self::Target {
                // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still
                // always be valid.
                unsafe { self.0.assume_init_mut() }
            }
        }

        impl Default for $newname {
            fn default() -> Self {
                // Create a new value of this type where all bytes (including padding) are zeroed.
                Self(MaybeUninit::zeroed())
            }
        }
    };
}
// Transparent wrappers for every uapi struct that crosses the user/kernel boundary.
decl_wrapper!(BinderNodeDebugInfo, uapi::binder_node_debug_info);
decl_wrapper!(BinderNodeInfoForRef, uapi::binder_node_info_for_ref);
decl_wrapper!(FlatBinderObject, uapi::flat_binder_object);
decl_wrapper!(BinderFdObject, uapi::binder_fd_object);
decl_wrapper!(BinderFdArrayObject, uapi::binder_fd_array_object);
decl_wrapper!(BinderObjectHeader, uapi::binder_object_header);
decl_wrapper!(BinderBufferObject, uapi::binder_buffer_object);
decl_wrapper!(BinderTransactionData, uapi::binder_transaction_data);
decl_wrapper!(
    BinderTransactionDataSecctx,
    uapi::binder_transaction_data_secctx
);
decl_wrapper!(BinderTransactionDataSg, uapi::binder_transaction_data_sg);
decl_wrapper!(BinderWriteRead, uapi::binder_write_read);
decl_wrapper!(BinderVersion, uapi::binder_version);
decl_wrapper!(BinderFrozenStatusInfo, uapi::binder_frozen_status_info);
decl_wrapper!(BinderFreezeInfo, uapi::binder_freeze_info);
decl_wrapper!(ExtendedError, uapi::binder_extended_error);
impl BinderVersion {
    /// Return the binder protocol version implemented by this driver.
    pub(crate) fn current() -> Self {
        Self(MaybeUninit::new(uapi::binder_version {
            protocol_version: BINDER_CURRENT_PROTOCOL_VERSION as _,
        }))
    }
}
impl BinderTransactionData {
    /// Combine this transaction data with a scatter-gather buffers size, producing the `_sg`
    /// variant of the struct.
    pub(crate) fn with_buffers_size(self, buffers_size: u64) -> BinderTransactionDataSg {
        BinderTransactionDataSg(MaybeUninit::new(uapi::binder_transaction_data_sg {
            transaction_data: *self,
            buffers_size,
        }))
    }
}
impl BinderTransactionDataSecctx {
    /// View the inner data as wrapped in `BinderTransactionData`.
    pub(crate) fn tr_data(&mut self) -> &mut BinderTransactionData {
        // SAFETY: Transparent wrapper is safe to transmute (the wrapper is
        // `#[repr(transparent)]` over the uapi struct, so the layouts match).
        unsafe {
            &mut *(&mut self.transaction_data as *mut uapi::binder_transaction_data
                as *mut BinderTransactionData)
        }
    }
}
impl ExtendedError {
    /// Create a new extended error with the given id, command, and parameter.
    pub(crate) fn new(id: u32, command: u32, param: i32) -> Self {
        Self(MaybeUninit::new(uapi::binder_extended_error {
            id,
            command,
            param,
        }))
    }
}

View File

@@ -0,0 +1,99 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::prelude::*;
use crate::defs::*;
/// Result alias whose error type is delivered to userspace as a binder reply code instead of an
/// errno.
pub(crate) type BinderResult<T = ()> = core::result::Result<T, BinderError>;

/// An error that will be returned to userspace via the `BINDER_WRITE_READ` ioctl rather than via
/// errno.
pub(crate) struct BinderError {
    /// The `BR_*` reply code delivered to userspace.
    pub(crate) reply: u32,
    /// The errno that caused this error, if any. In the constructors visible here it is only
    /// set together with `BR_FAILED_REPLY`.
    source: Option<Error>,
}
impl BinderError {
    /// Internal constructor for replies that carry no source errno.
    fn with_reply(reply: u32) -> Self {
        Self {
            reply,
            source: None,
        }
    }

    /// Error indicating that the target is dead.
    pub(crate) fn new_dead() -> Self {
        Self::with_reply(BR_DEAD_REPLY)
    }

    /// Error indicating that the target process is frozen.
    pub(crate) fn new_frozen() -> Self {
        Self::with_reply(BR_FROZEN_REPLY)
    }

    /// Error indicating that a oneway transaction is pending on a frozen process.
    pub(crate) fn new_frozen_oneway() -> Self {
        Self::with_reply(BR_TRANSACTION_PENDING_FROZEN)
    }

    /// Whether this error is a dead reply.
    pub(crate) fn is_dead(&self) -> bool {
        matches!(self.reply, BR_DEAD_REPLY)
    }

    /// The errno to report for this error; defaults to EINVAL when no source is recorded.
    pub(crate) fn as_errno(&self) -> kernel::ffi::c_int {
        match self.source {
            Some(source) => source.to_errno(),
            None => EINVAL.to_errno(),
        }
    }

    /// Whether this error warrants a pr_warn (only errors with a recorded source errno do).
    pub(crate) fn should_pr_warn(&self) -> bool {
        !self.source.is_none()
    }
}
/// Convert an errno into a `BinderError` and store the errno used to construct it. The errno
/// should be stored as the thread's extended error when given to userspace.
impl From<Error> for BinderError {
    fn from(source: Error) -> Self {
        Self {
            reply: BR_FAILED_REPLY,
            source: Some(source),
        }
    }
}

impl From<kernel::fs::file::BadFdError> for BinderError {
    fn from(source: kernel::fs::file::BadFdError) -> Self {
        // Route through the `Error` conversion so the underlying errno is recorded as the source.
        BinderError::from(Error::from(source))
    }
}

impl From<kernel::alloc::AllocError> for BinderError {
    fn from(_: kernel::alloc::AllocError) -> Self {
        // `AllocError` carries no errno, so record ENOMEM explicitly.
        Self {
            reply: BR_FAILED_REPLY,
            source: Some(ENOMEM),
        }
    }
}
impl core::fmt::Debug for BinderError {
    /// Print well-known reply codes by their protocol name; a failed reply with a recorded
    /// source errno additionally shows that source.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match (self.reply, self.source.as_ref()) {
            (BR_FAILED_REPLY, Some(source)) => f
                .debug_struct("BR_FAILED_REPLY")
                .field("source", source)
                .finish(),
            (BR_FAILED_REPLY, None) => f.pad("BR_FAILED_REPLY"),
            (BR_DEAD_REPLY, _) => f.pad("BR_DEAD_REPLY"),
            (BR_FROZEN_REPLY, _) => f.pad("BR_FROZEN_REPLY"),
            (BR_TRANSACTION_PENDING_FROZEN, _) => f.pad("BR_TRANSACTION_PENDING_FROZEN"),
            (BR_TRANSACTION_COMPLETE, _) => f.pad("BR_TRANSACTION_COMPLETE"),
            (other, _) => f
                .debug_struct("BinderError")
                .field("reply", &other)
                .finish(),
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{list::ListArc, prelude::*, seq_file::SeqFile, seq_print, sync::UniqueArc};
use crate::{node::Node, thread::Thread, BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead};
use core::mem::MaybeUninit;
/// Preallocated storage for a `NodeWrapper` work item.
///
/// The allocation happens in `new` and the node is filled in later by `init`, so the push of
/// the work item itself cannot fail due to allocation.
pub(crate) struct CritIncrWrapper {
    inner: UniqueArc<MaybeUninit<DTRWrap<NodeWrapper>>>,
}
impl CritIncrWrapper {
    /// Preallocate the storage for the work item. May sleep due to `GFP_KERNEL`.
    pub(crate) fn new() -> Result<Self> {
        Ok(CritIncrWrapper {
            inner: UniqueArc::new_uninit(GFP_KERNEL)?,
        })
    }

    /// Initialize the preallocated storage with `node` and convert it into a deliverable work
    /// item.
    pub(super) fn init(self, node: DArc<Node>) -> DLArc<dyn DeliverToRead> {
        match self.inner.pin_init_with(DTRWrap::new(NodeWrapper { node })) {
            Ok(initialized) => ListArc::from(initialized) as _,
            // The error type is uninhabited, so initialization cannot actually fail here.
            Err(err) => match err {},
        }
    }
}
/// Work item that delivers pending work for the wrapped node.
struct NodeWrapper {
    node: DArc<Node>,
}

kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for NodeWrapper {
        untracked;
    }
}
impl DeliverToRead for NodeWrapper {
    /// Clear the wrapper-related delivery flags on the node and then run its pending node work.
    fn do_work(
        self: DArc<Self>,
        _thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool> {
        let node = &self.node;
        let mut owner_inner = node.owner.inner.lock();
        let inner = node.inner.access_mut(&mut owner_inner);

        // This wrapper is only delivered when both flags were set when it was pushed; reset
        // them now that the wrapper is being processed.
        let ds = &mut inner.delivery_state;
        assert!(ds.has_pushed_wrapper);
        assert!(ds.has_strong_zero2one);
        ds.has_pushed_wrapper = false;
        ds.has_strong_zero2one = false;

        node.do_work_locked(writer, owner_inner)
    }

    fn cancel(self: DArc<Self>) {}
    fn on_thread_selected(&self, _thread: &Thread) {}

    fn should_sync_wakeup(&self) -> bool {
        false
    }

    /// Debug output for this work item, printed via the context's seq_file interface.
    #[inline(never)]
    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
        seq_print!(
            m,
            "{}node work {}: u{:016x} c{:016x}\n",
            prefix,
            self.node.debug_id,
            self.node.ptr,
            self.node.cookie,
        );
        Ok(())
    }
}

View File

@@ -0,0 +1,783 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! This module has utilities for managing a page range where unused pages may be reclaimed by a
//! vma shrinker.
// To avoid deadlocks, locks are taken in the order:
//
// 1. mmap lock
// 2. spinlock
// 3. lru spinlock
//
// The shrinker will use trylock methods because it locks them in a different order.
use core::{
alloc::Layout,
marker::PhantomPinned,
mem::{size_of, size_of_val, MaybeUninit},
ptr::{self, NonNull},
};
use kernel::{
alloc::allocator::Kmalloc,
alloc::Allocator,
bindings,
error::Result,
ffi::{c_ulong, c_void},
mm::{virt, Mm, MmWithUser},
new_mutex, new_spinlock,
page::{Page, PAGE_SHIFT, PAGE_SIZE},
prelude::*,
str::CStr,
sync::{Mutex, SpinLock},
task::Pid,
types::ARef,
types::{FromBytes, Opaque},
uaccess::UserSliceReader,
};
/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
#[repr(C)]
pub(crate) struct Shrinker {
    /// Pointer to the kernel shrinker object; written by `register`.
    inner: Opaque<*mut bindings::shrinker>,
    /// The lru list holding pages that are available to be reclaimed.
    list_lru: Opaque<bindings::list_lru>,
}

// SAFETY: NOTE(review): relies on the C shrinker/list_lru APIs performing their own internal
// locking for cross-thread access — confirm before extending usage.
unsafe impl Send for Shrinker {}
// SAFETY: As above.
unsafe impl Sync for Shrinker {}
impl Shrinker {
/// Create a new shrinker.
///
/// # Safety
///
/// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
/// been called exactly once, and it must not have returned an error.
pub(crate) const unsafe fn new() -> Self {
Self {
inner: Opaque::uninit(),
list_lru: Opaque::uninit(),
}
}
/// Register this shrinker with the kernel.
pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
// SAFETY: These fields are not yet used, so it's okay to zero them.
unsafe {
self.inner.get().write(ptr::null_mut());
self.list_lru.get().write_bytes(0, 1);
}
// SAFETY: The field is not yet used, so we can initialize it.
let ret = unsafe {
bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut(), ptr::null_mut())
};
if ret != 0 {
return Err(Error::from_errno(ret));
}
// SAFETY: The `name` points at a valid c string.
let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
if shrinker.is_null() {
// SAFETY: We initialized it, so its okay to destroy it.
unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
return Err(Error::from_errno(ret));
}
// SAFETY: We're about to register the shrinker, and these are the fields we need to
// initialize. (All other fields are already zeroed.)
unsafe {
ptr::addr_of_mut!((*shrinker).count_objects).write(Some(rust_shrink_count));
ptr::addr_of_mut!((*shrinker).scan_objects).write(Some(rust_shrink_scan));
}
// SAFETY: The new shrinker has been fully initialized, so we can register it.
unsafe { bindings::shrinker_register(shrinker) };
// SAFETY: This initializes the pointer to the shrinker so that we can use it.
unsafe { self.inner.get().write(shrinker) };
Ok(())
}
}
/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans of whether the pages are usable. The
/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
    /// Shrinker object registered with the kernel.
    shrinker: &'static Shrinker,
    /// Pid using this page range. Only used as debugging information.
    pid: Pid,
    /// The mm for the relevant process.
    mm: ARef<Mm>,
    /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`.
    #[pin]
    mm_lock: Mutex<()>,
    /// Spinlock protecting changes to pages.
    #[pin]
    lock: SpinLock<Inner>,
    /// Must not move, since page info has pointers back.
    #[pin]
    _pin: PhantomPinned,
}

/// State protected by the `ShrinkablePageRange` spinlock.
struct Inner {
    /// Array of pages.
    ///
    /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
    /// ownership. To deal with that, we manage it using raw pointers.
    pages: *mut PageInfo,
    /// Length of the `pages` array.
    size: usize,
    /// The address of the vma to insert the pages into.
    vma_addr: usize,
}

// SAFETY: NOTE(review): relies on all access to the raw `pages` array being synchronized via the
// spinlock and the lru lock — confirm before relying on this elsewhere.
unsafe impl Send for ShrinkablePageRange {}
// SAFETY: As above.
unsafe impl Sync for ShrinkablePageRange {}

/// Guard type for `mm_lock`, with its lifetime extended to `'static` (see `stable_trylock_mm`
/// for the justification).
type StableMmGuard =
    kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;
/// An array element that describes the current state of a page.
///
/// There are three states:
///
/// * Free. The page is None. The `lru` element is not queued.
/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
/// * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
    lru: bindings::list_head,
    page: Option<Page>,
    /// Back-pointer to the owning range, for use by the shrinker.
    range: *const ShrinkablePageRange,
}

impl PageInfo {
    /// Returns whether this slot currently holds a page.
    ///
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok.
    unsafe fn has_page(me: *const PageInfo) -> bool {
        // SAFETY: This pointer offset is in bounds.
        let page = unsafe { ptr::addr_of!((*me).page) };
        // SAFETY: The caller guarantees that reading the `page` field is okay.
        unsafe { (*page).is_some() }
    }

    /// Store `page` in this slot.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
    unsafe fn set_page(me: *mut PageInfo, page: Page) {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };

        // SAFETY: The pointer is valid for writing, so also valid for reading.
        if unsafe { (*ptr).is_some() } {
            pr_err!("set_page called when there is already a page");
            // SAFETY: We will initialize the page again below.
            unsafe { ptr::drop_in_place(ptr) };
        }

        // SAFETY: The pointer is valid for writing.
        unsafe { ptr::write(ptr, Some(page)) };
    }

    /// Borrow the page in this slot, if any.
    ///
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok for the duration of 'a.
    unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of!((*me).page) };
        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).as_ref() }
    }

    /// Take the page out of this slot, leaving `None` behind.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok for the duration of 'a.
    unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };
        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).take() }
    }

    /// Add this page to the lru list, if not already in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_add(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_add_obj(shrinker.list_lru.get(), lru_ptr) };
    }

    /// Remove this page from the lru list, if it is in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_del(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_del_obj(shrinker.list_lru.get(), lru_ptr) };
    }
}
impl ShrinkablePageRange {
/// Create a new `ShrinkablePageRange` using the given shrinker.
///
/// Fails with `ESRCH` if the current task has no mm.
pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
    try_pin_init!(Self {
        shrinker,
        pid: kernel::current!().pid(),
        mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
        mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
        lock <- new_spinlock!(Inner {
            pages: ptr::null_mut(),
            size: 0,
            vma_addr: 0,
        }, "ShrinkablePageRange"),
        _pin: PhantomPinned,
    })
}
/// Try to take the `mm_lock` mutex without blocking, returning a guard with an extended
/// (`'static`) lifetime.
pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
    // SAFETY: This extends the duration of the reference. Since this call happens before
    // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
    // until the returned guard is dropped. This ensures that the guard is valid until dropped.
    let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };
    mm_lock.try_lock()
}
/// Register a vma with this page range. Returns the size of the region.
///
/// Fails with `EINVAL` if the vma belongs to a different mm or is empty, and with `EBUSY` if a
/// vma was already registered.
pub(crate) fn register_with_vma(&self, vma: &virt::VmAreaNew) -> Result<usize> {
    // The managed region is capped at 4 MiB.
    let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
    let num_pages = num_bytes >> PAGE_SHIFT;

    if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
        pr_debug!("Failed to register with vma: invalid vma->vm_mm");
        return Err(EINVAL);
    }
    if num_pages == 0 {
        pr_debug!("Failed to register with vma: size zero");
        return Err(EINVAL);
    }

    let layout = Layout::array::<PageInfo>(num_pages).map_err(|_| ENOMEM)?;
    // SAFETY: The layout has non-zero size.
    let pages = Kmalloc::alloc(layout, GFP_KERNEL)?.cast::<PageInfo>();

    // SAFETY: This just initializes the pages array.
    unsafe {
        let self_ptr = self as *const ShrinkablePageRange;
        for i in 0..num_pages {
            let info = pages.add(i).as_ptr();
            ptr::addr_of_mut!((*info).range).write(self_ptr);
            ptr::addr_of_mut!((*info).page).write(None);
            // Initialize the lru entry as an empty (self-linked) list head.
            let lru = ptr::addr_of_mut!((*info).lru);
            ptr::addr_of_mut!((*lru).next).write(lru);
            ptr::addr_of_mut!((*lru).prev).write(lru);
        }
    }

    let mut inner = self.lock.lock();
    if inner.size > 0 {
        pr_debug!("Failed to register with vma: already registered");
        drop(inner);
        // SAFETY: The `pages` array was allocated with the same layout.
        unsafe { Kmalloc::free(pages.cast(), layout) };
        return Err(EBUSY);
    }

    inner.pages = pages.as_ptr();
    inner.size = num_pages;
    inner.vma_addr = vma.start();

    Ok(num_pages)
}
/// Make sure that the given pages are allocated and mapped.
///
/// Must not be called from an atomic context.
pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
    crate::trace::trace_update_page_range(self.pid, true, start, end);
    if start >= end {
        return Ok(());
    }
    let mut inner = self.lock.lock();
    assert!(end <= inner.size);
    for i in start..end {
        // SAFETY: This pointer offset is in bounds.
        let page_info = unsafe { inner.pages.add(i) };
        // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
        if unsafe { PageInfo::has_page(page_info) } {
            crate::trace::trace_alloc_lru_start(self.pid, i);
            // Since we're going to use the page, we should remove it from the lru list so that
            // the shrinker will not free it.
            //
            // SAFETY: The pointer is valid, and this is the right shrinker.
            //
            // The shrinker can't free the page between the check and this call to
            // `list_lru_del` because we hold the lock.
            unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };
            crate::trace::trace_alloc_lru_end(self.pid, i);
        } else {
            // We have to allocate a new page. Use the slow path.
            // The spinlock is released while allocating and reacquired below
            // before the next loop iteration.
            drop(inner);
            crate::trace::trace_alloc_page_start(self.pid, i);
            match self.use_page_slow(i) {
                Ok(()) => {}
                Err(err) => {
                    pr_warn!("Error in use_page_slow: {:?}", err);
                    return Err(err);
                }
            }
            crate::trace::trace_alloc_page_end(self.pid, i);
            inner = self.lock.lock();
        }
    }
    Ok(())
}
/// Mark the given page as in use, slow path.
///
/// Must not be called from an atomic context.
///
/// Callers must ensure that `i` is in bounds: it is used below as an
/// unchecked pointer offset. NOTE(review): this is a safe `fn` with an
/// unsafe precondition — consider marking it `unsafe fn`.
#[cold]
fn use_page_slow(&self, i: usize) -> Result<()> {
    // Allocate before taking any locks.
    let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;
    // Lock order: `mm_lock` before the `lock` spinlock.
    let mm_mutex = self.mm_lock.lock();
    let inner = self.lock.lock();
    // SAFETY: This pointer offset is in bounds.
    let page_info = unsafe { inner.pages.add(i) };
    // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
    if unsafe { PageInfo::has_page(page_info) } {
        // The page was already there, or someone else added the page while we didn't hold the
        // spinlock.
        //
        // SAFETY: The pointer is valid, and this is the right shrinker.
        //
        // The shrinker can't free the page between the check and this call to
        // `list_lru_del` because we hold the lock.
        unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };
        return Ok(());
    }
    let vma_addr = inner.vma_addr;
    // Release the spinlock while we insert the page into the vma.
    drop(inner);
    // No overflow since we stay in bounds of the vma.
    let user_page_addr = vma_addr + (i << PAGE_SHIFT);
    // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
    // a remote process. If the call to `mmput` races with the process shutting down, then the
    // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
    // happen until it returns to userspace. However, the caller might instead go to sleep and
    // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
    // middle of a shutdown process that wont complete until the `mm` is dropped. This can
    // amount to a deadlock.
    //
    // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
    // workqueue.
    MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?)
        .mmap_read_lock()
        .vma_lookup(vma_addr)
        .ok_or(ESRCH)?
        .as_mixedmap_vma()
        .ok_or(ESRCH)?
        .vm_insert_page(user_page_addr, &new_page)
        .inspect_err(|err| {
            pr_warn!(
                "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}",
                user_page_addr,
                vma_addr,
                i,
                err
            )
        })?;
    let inner = self.lock.lock();
    // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
    // can be written to since we hold the lock.
    //
    // We released and reacquired the spinlock since we checked that the page is null, but we
    // always hold the mm_lock mutex when setting the page to a non-null value, so it's not
    // possible for someone else to have changed it since our check.
    unsafe { PageInfo::set_page(page_info, new_page) };
    drop(inner);
    drop(mm_mutex);
    Ok(())
}
/// If the given page is in use, then mark it as available so that the shrinker can free it.
///
/// May be called from an atomic context.
pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
    crate::trace::trace_update_page_range(self.pid, false, start, end);
    if start >= end {
        return;
    }
    let inner = self.lock.lock();
    assert!(end <= inner.size);
    // Iterate in reverse so the most recently used pages are shrunk last.
    for i in (start..end).rev() {
        // SAFETY: The pointer is in bounds.
        let page_info = unsafe { inner.pages.add(i) };
        // SAFETY: Okay for reading since we have the lock.
        if unsafe { PageInfo::has_page(page_info) } {
            crate::trace::trace_free_lru_start(self.pid, i);
            // SAFETY: The pointer is valid, and it's the right shrinker.
            unsafe { PageInfo::list_lru_add(page_info, self.shrinker) };
            crate::trace::trace_free_lru_end(self.pid, i);
        }
    }
}
/// Helper for reading or writing to a range of bytes that may overlap with several pages.
///
/// `cb` is invoked once per touched page with `(page, offset_in_page, length)`.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
where
    T: FnMut(&Page, usize, usize) -> Result,
{
    if size == 0 {
        return Ok(());
    }
    // SAFETY: The caller promises that the pages touched by this call are in use. It's only
    // possible for a page to be in use if we have already been registered with a vma, and we
    // only change the `pages` and `size` fields during registration with a vma, so there is no
    // race when we read them here without taking the lock.
    let (pages, num_pages) = {
        let inner = self.lock.lock();
        (inner.pages, inner.size)
    };
    let num_bytes = num_pages << PAGE_SHIFT;
    // Check that the request is within the buffer.
    if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
        return Err(EFAULT);
    }
    // Split `offset` into a page index and an offset within the first page.
    let mut page_index = offset >> PAGE_SHIFT;
    offset &= PAGE_SIZE - 1;
    while size > 0 {
        let available = usize::min(size, PAGE_SIZE - offset);
        // SAFETY: The pointer is in bounds.
        let page_info = unsafe { pages.add(page_index) };
        // SAFETY: The caller guarantees that this page is in the "in use" state for the
        // duration of this call to `iterate`, so nobody will change the page.
        let page = unsafe { PageInfo::get_page(page_info) };
        if page.is_none() {
            pr_warn!("Page is null!");
        }
        let page = page.ok_or(EFAULT)?;
        cb(page, offset, available)?;
        size -= available;
        page_index += 1;
        // Only the first page may start at a non-zero offset.
        offset = 0;
    }
    Ok(())
}
/// Copy from userspace into this page range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn copy_from_user_slice(
    &self,
    reader: &mut UserSliceReader,
    offset: usize,
    size: usize,
) -> Result {
    // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
    unsafe {
        self.iterate(offset, size, |page, offset, to_copy| {
            page.copy_from_user_slice_raw(reader, offset, to_copy)
        })
    }
}
/// Copy from this page range into kernel space.
///
/// Reads `size_of::<T>()` bytes starting at `offset` and reassembles them
/// into a `T`, which may span several pages.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
    let mut out = MaybeUninit::<T>::uninit();
    let mut out_offset = 0;
    // SAFETY: `self.iterate` has the same safety requirements as `read`.
    unsafe {
        self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
            // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
            let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
            // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid.
            page.read_raw(obj_ptr, offset, to_copy)?;
            out_offset += to_copy;
            Ok(())
        })?;
    }
    // SAFETY: We just initialised the data. `iterate` either fills every byte
    // of `out` or returns an error (propagated by the `?` above).
    Ok(unsafe { out.assume_init() })
}
/// Copy from kernel space into this page range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
    let mut obj_offset = 0;
    // SAFETY: `self.iterate` has the same safety requirements as `write`.
    unsafe {
        self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
            // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
            let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
            // SAFETY: We have a reference to the object, so the pointer is valid.
            page.write_raw(obj_ptr, offset, to_copy)?;
            obj_offset += to_copy;
            Ok(())
        })
    }
}
/// Write zeroes to the given range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
    // SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
    unsafe {
        self.iterate(offset, size, |page, offset, len| {
            page.fill_zero_raw(offset, len)
        })
    }
}
}
#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
    fn drop(self: Pin<&mut Self>) {
        let (pages, size) = {
            let lock = self.lock.lock();
            (lock.pages, lock.size)
        };
        // Nothing to free if `register_with_vma` never succeeded.
        if size == 0 {
            return;
        }
        // This is the destructor, so unlike the other methods, we only need to worry about races
        // with the shrinker here.
        for i in 0..size {
            // SAFETY: The pointer is valid and it's the right shrinker.
            unsafe { PageInfo::list_lru_del(pages.add(i), self.shrinker) };
            // SAFETY: If the shrinker was going to free this page, then it would have taken it
            // from the PageInfo before releasing the lru lock. Thus, the call to `list_lru_del`
            // will either remove it before the shrinker can access it, or the shrinker will
            // already have taken the page at this point.
            unsafe { drop(PageInfo::take_page(pages.add(i))) };
        }
        // Wait for users of the mutex to go away. This call is necessary for the safety of
        // `stable_trylock_mm`.
        drop(self.mm_lock.lock());
        let Some(pages) = NonNull::new(pages) else {
            return;
        };
        // SAFETY: This computation did not overflow when allocating the pages array, so it will
        // not overflow this time.
        let layout = unsafe { Layout::array::<PageInfo>(size).unwrap_unchecked() };
        // SAFETY: The `pages` array was allocated with the same layout.
        unsafe { Kmalloc::free(pages.cast(), layout) };
    }
}
/// Shrinker `count_objects` callback: report how many pages are reclaimable.
#[no_mangle]
unsafe extern "C" fn rust_shrink_count(
    shrink: *mut bindings::shrinker,
    _sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: This method is only used with the `Shrinker` type, and the cast is valid since
    // `shrinker` is the first field of a #[repr(C)] struct.
    let shrinker = unsafe { &*shrink.cast::<Shrinker>() };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe { bindings::list_lru_count(shrinker.list_lru.get()) }
}
/// Shrinker `scan_objects` callback: walk the lru and free up to `nr_to_scan` pages.
#[no_mangle]
unsafe extern "C" fn rust_shrink_scan(
    shrink: *mut bindings::shrinker,
    sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: This method is only used with the `Shrinker` type, and the cast is valid since
    // `shrinker` is the first field of a #[repr(C)] struct.
    let shrinker = unsafe { &*shrink.cast::<Shrinker>() };
    // SAFETY: Caller guarantees that it is safe to read this field.
    let nr_to_scan = unsafe { (*sc).nr_to_scan };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe {
        // The walk callback goes through a C trampoline so the function
        // pointer carries the `enum lru_status` CFI tag (see the C helper).
        extern "C" {
            fn rust_shrink_free_page_wrap(
                item: *mut bindings::list_head,
                list: *mut bindings::list_lru_one,
                lock: *mut bindings::spinlock_t,
                cb_arg: *mut kernel::ffi::c_void,
            ) -> bindings::lru_status;
        }
        bindings::list_lru_walk(
            shrinker.list_lru.get(),
            Some(rust_shrink_free_page_wrap),
            ptr::null_mut(),
            nr_to_scan,
        )
    }
}
const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP;
// `LRU_REMOVED_RETRY` (not plain `LRU_REMOVED`) because `rust_shrink_free_page`
// drops and re-acquires the lru lock, so the walker must restart its iteration.
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY;
/// Free one reclaimable page. Called by `list_lru_walk` with the lru lock held.
///
/// Returns `LRU_SKIP` if any required lock cannot be taken without blocking,
/// and `LRU_REMOVED_ENTRY` after successfully unmapping and freeing the page.
#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
    item: *mut bindings::list_head,
    lru: *mut bindings::list_lru_one,
    lru_lock: *mut bindings::spinlock_t,
    _cb_arg: *mut c_void,
) -> bindings::lru_status {
    // Fields that should survive after unlocking the lru lock.
    let pid;
    let page;
    let page_index;
    let mm;
    let mmap_read;
    let mm_mutex;
    let vma_addr;
    {
        // SAFETY: The `list_head` field is first in `PageInfo`.
        let info = item as *mut PageInfo;
        let range = unsafe { &*((*info).range) };
        // Every lock below is a trylock: we must not sleep under the lru lock.
        mm = match range.mm.mmget_not_zero() {
            Some(mm) => MmWithUser::into_mmput_async(mm),
            None => return LRU_SKIP,
        };
        mm_mutex = match range.stable_trylock_mm() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };
        mmap_read = match mm.mmap_read_trylock() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };
        // We can't lock it normally here, since we hold the lru lock.
        let inner = match range.lock.try_lock() {
            Some(inner) => inner,
            None => return LRU_SKIP,
        };
        // SAFETY: The item is in this lru list, so it's okay to remove it.
        unsafe { bindings::list_lru_isolate(lru, item) };
        // SAFETY: Both pointers are in bounds of the same allocation.
        page_index = unsafe { info.offset_from(inner.pages) } as usize;
        pid = range.pid;
        crate::trace::trace_unmap_kernel_start(pid, page_index);
        // SAFETY: We hold the spinlock, so we can take the page.
        //
        // This sets the page pointer to zero before we unmap it from the vma. However, we call
        // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to
        // insert a new page until after our call to `zap_page_range`.
        page = unsafe { PageInfo::take_page(info) };
        vma_addr = inner.vma_addr;
        crate::trace::trace_unmap_kernel_end(pid, page_index);
        // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
        // they can be freed at any point after we unlock `lru_lock`. This is with the exception of
        // `mm_mutex` which is kept alive by holding the lock.
    }
    // SAFETY: The lru lock is locked when this method is called.
    unsafe { bindings::spin_unlock(lru_lock) };
    if let Some(vma) = mmap_read.vma_lookup(vma_addr) {
        let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
        crate::trace::trace_unmap_user_start(pid, page_index);
        vma.zap_page_range_single(user_page_addr, PAGE_SIZE);
        crate::trace::trace_unmap_user_end(pid, page_index);
    }
    drop(mmap_read);
    drop(mm_mutex);
    drop(mm);
    drop(page);
    // SAFETY: We just unlocked the lru lock, but it should be locked when we return.
    unsafe { bindings::spin_lock(lru_lock) };
    LRU_REMOVED_ENTRY
}

View File

@@ -0,0 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* C helper for page_range.rs to work around a CFI violation.
*
* Bindgen currently pretends that `enum lru_status` is the same as an integer.
* This assumption is fine ABI-wise, but once you add CFI to the mix, it
* triggers a CFI violation because `enum lru_status` gets a different CFI tag.
*
* This file contains a workaround until bindgen can be fixed.
*
* Copyright (C) 2024 Google LLC.
*/
#include <linux/list_lru.h>
#include <linux/spinlock.h>
/*
 * Prototype for the Rust implementation. It returns the enum as a plain
 * unsigned int, matching bindgen's integer view of `enum lru_status`.
 */
unsigned int rust_shrink_free_page(struct list_head *item,
				   struct list_lru_one *list, spinlock_t *lock,
				   void *cb_arg);

/*
 * Thin trampoline with the exact `enum lru_status` return type so that the
 * function pointer passed to list_lru_walk() carries the correct CFI tag.
 */
enum lru_status
rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list,
			   spinlock_t *lock, void *cb_arg)
{
	return rust_shrink_free_page(item, list, lock, cb_arg);
}

View File

@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! This module defines the types and methods relevant to priority inheritance.
use kernel::bindings;
// C scalar aliases for scheduler policy, kernel priority, and nice values.
pub(crate) type Policy = kernel::ffi::c_uint;
pub(crate) type Priority = kernel::ffi::c_int;
pub(crate) type Nice = kernel::ffi::c_int;
// Scheduler policies and priority bounds, re-exported from the C bindings.
pub(crate) const SCHED_NORMAL: Policy = bindings::SCHED_NORMAL;
pub(crate) const SCHED_FIFO: Policy = bindings::SCHED_FIFO;
pub(crate) const MIN_NICE: Nice = bindings::MIN_NICE as _;
pub(crate) const MAX_NICE: Nice = bindings::MAX_NICE as _;
pub(crate) const DEFAULT_PRIO: Priority = bindings::DEFAULT_PRIO as _;
pub(crate) const MAX_RT_PRIO: Priority = bindings::MAX_RT_PRIO as _;
/// Scheduler policy and priority.
///
/// The binder driver supports inheriting the following scheduler policies:
/// * SCHED_NORMAL
/// * SCHED_BATCH
/// * SCHED_FIFO
/// * SCHED_RR
#[derive(Copy, Clone, Default)]
pub(crate) struct BinderPriority {
pub(crate) sched_policy: Policy,
pub(crate) prio: Priority,
}
// NOTE(review): variant semantics inferred from names — confirm against the
// code that drives priority inheritance.
#[derive(Copy, Clone, Eq, PartialEq)]
pub(crate) enum PriorityState {
    Set,
    Pending,
    Abort,
}
/// Derive the priority to inherit from `task`.
///
/// Falls back to `SCHED_NORMAL` / `DEFAULT_PRIO` when the task runs under a
/// policy that binder does not support inheriting.
pub(crate) fn get_default_prio_from_task(task: &kernel::task::Task) -> BinderPriority {
    if !is_supported_policy(task.policy()) {
        return BinderPriority {
            sched_policy: SCHED_NORMAL,
            prio: DEFAULT_PRIO,
        };
    }
    BinderPriority {
        sched_policy: task.policy(),
        prio: task.normal_prio(),
    }
}
/// Whether `policy` is one of the real-time classes binder can inherit.
pub(crate) fn is_rt_policy(policy: Policy) -> bool {
    matches!(policy, bindings::SCHED_FIFO | bindings::SCHED_RR)
}
/// Whether `policy` is one of the fair (CFS) classes binder can inherit.
pub(crate) fn is_fair_policy(policy: Policy) -> bool {
    matches!(policy, bindings::SCHED_NORMAL | bindings::SCHED_BATCH)
}
/// Whether binder supports inheriting `policy` at all.
pub(crate) fn is_supported_policy(policy: Policy) -> bool {
    is_rt_policy(policy) || is_fair_policy(policy)
}
/// Convert a kernel priority to its userspace representation.
///
/// Fair policies map to a nice value relative to `DEFAULT_PRIO`; everything
/// else uses the inverted real-time scale.
pub(crate) fn to_userspace_prio(policy: Policy, prio: Priority) -> Nice {
    if !is_fair_policy(policy) {
        MAX_RT_PRIO - 1 - prio
    } else {
        prio - DEFAULT_PRIO
    }
}
/// Convert a userspace priority back to the kernel representation.
///
/// Inverse of `to_userspace_prio` for both the fair and real-time scales.
pub(crate) fn to_kernel_prio(policy: Policy, prio: Nice) -> Priority {
    if !is_fair_policy(policy) {
        MAX_RT_PRIO - 1 - prio
    } else {
        prio + DEFAULT_PRIO
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,247 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
page::{PAGE_MASK, PAGE_SIZE},
prelude::*,
seq_file::SeqFile,
seq_print,
task::Pid,
};
use crate::range_alloc::{DescriptorState, FreedRange, Range};
/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(super) struct ArrayRangeAllocator<T> {
    /// This stores all ranges that are allocated. Unlike the tree based allocator, we do *not*
    /// store the free ranges.
    ///
    /// Sorted by offset.
    pub(super) ranges: KVec<Range<T>>,
    /// Total size in bytes of the managed region.
    size: usize,
    /// Bytes still available for oneway transactions (starts at half the size).
    free_oneway_space: usize,
}
/// Result of `ArrayRangeAllocator::find_empty_range`.
struct FindEmptyRes {
    /// Which index in `ranges` should we insert the new range at?
    ///
    /// Inserting the new range at this index keeps `ranges` sorted.
    insert_at_idx: usize,
    /// Which offset should we insert the new range at?
    insert_at_offset: usize,
}
impl<T> ArrayRangeAllocator<T> {
/// Build an allocator over a region of `size` bytes, backed by the
/// pre-allocated (empty) range array.
pub(crate) fn new(size: usize, alloc: EmptyArrayAlloc<T>) -> Self {
    // Oneway transactions may only ever consume half of the buffer.
    let free_oneway_space = size / 2;
    Self {
        ranges: alloc.ranges,
        size,
        free_oneway_space,
    }
}
/// Bytes still available to oneway transactions.
pub(crate) fn free_oneway_space(&self) -> usize {
    self.free_oneway_space
}

/// Number of currently tracked (reserved or allocated) ranges.
pub(crate) fn count_buffers(&self) -> usize {
    self.ranges.len()
}

/// Total size in bytes of the managed region.
pub(crate) fn total_size(&self) -> usize {
    self.size
}

/// True when the backing array has no spare capacity. The caller is expected
/// to switch to the tree-based allocator before the next insertion.
pub(crate) fn is_full(&self) -> bool {
    self.ranges.len() == self.ranges.capacity()
}
/// Dump all tracked ranges to the given seq_file, for debugfs.
pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
    for range in &self.ranges {
        seq_print!(
            m,
            " buffer {}: {} size {} pid {} oneway {}",
            // NOTE(review): the buffer id is printed as a literal `0`
            // placeholder here; the debug_id is not emitted.
            0,
            range.offset,
            range.size,
            range.state.pid(),
            range.state.is_oneway(),
        );
        if let DescriptorState::Reserved(_) = range.state {
            seq_print!(m, " reserved\n");
        } else {
            seq_print!(m, " allocated\n");
        }
    }
    Ok(())
}
/// Find somewhere to put a new range.
///
/// Unlike the tree implementation, we do not bother to find the smallest gap. The idea is that
/// fragmentation isn't a big issue when we don't have many ranges.
///
/// Returns the index that the new range should have in `self.ranges` after insertion.
fn find_empty_range(&self, size: usize) -> Option<FindEmptyRes> {
    let tail = self.ranges.last().map(Range::endpoint).unwrap_or(0);
    // Fast path: there is room after the last range, so append there.
    if size <= self.total_size() - tail {
        return Some(FindEmptyRes {
            insert_at_idx: self.ranges.len(),
            insert_at_offset: tail,
        });
    }
    // Otherwise scan the gaps between consecutive ranges for the first fit.
    let mut prev_end = 0;
    for (idx, range) in self.ranges.iter().enumerate() {
        if size <= range.offset - prev_end {
            return Some(FindEmptyRes {
                insert_at_idx: idx,
                insert_at_offset: prev_end,
            });
        }
        prev_end = range.endpoint();
    }
    None
}
/// Reserve a new range of `size` bytes, returning its offset.
///
/// Fails with `ENOSPC` when no gap is large enough, or when a oneway
/// reservation would exceed the oneway budget.
pub(crate) fn reserve_new(
    &mut self,
    debug_id: usize,
    size: usize,
    is_oneway: bool,
    pid: Pid,
) -> Result<usize> {
    // Compute new value of free_oneway_space, which is set only on success.
    let new_oneway_space = if is_oneway {
        match self.free_oneway_space.checked_sub(size) {
            Some(new_oneway_space) => new_oneway_space,
            None => return Err(ENOSPC),
        }
    } else {
        self.free_oneway_space
    };
    let FindEmptyRes {
        insert_at_idx,
        insert_at_offset,
    } = self.find_empty_range(size).ok_or(ENOSPC)?;
    self.free_oneway_space = new_oneway_space;
    let new_range = Range {
        offset: insert_at_offset,
        size,
        state: DescriptorState::new(is_oneway, debug_id, pid),
    };
    // Insert the value at the given index to keep the array sorted.
    // The unwrap cannot fail as long as the caller switched to the tree
    // allocator when `is_full()` — i.e. there is spare capacity here.
    self.ranges.insert_within_capacity(insert_at_idx, new_range).ok().unwrap();
    Ok(insert_at_offset)
}
/// Drop the reservation at `offset`, returning the pages that became unused.
///
/// Fails with `EINVAL` if no range starts at `offset`, and `EPERM` if the
/// range is in the allocated (not reserved) state.
pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
    // This could use a binary search, but linear scans are usually faster for small arrays.
    let i = self
        .ranges
        .iter()
        .position(|range| range.offset == offset)
        .ok_or(EINVAL)?;
    let range = &self.ranges[i];
    if let DescriptorState::Allocated(_) = range.state {
        return Err(EPERM);
    }
    let size = range.size;
    let offset = range.offset;
    if range.state.is_oneway() {
        self.free_oneway_space += size;
    }
    // This computes the range of pages that are no longer used by *any* allocated range. The
    // caller will mark them as unused, which means that they can be freed if the system comes
    // under memory pressure.
    let mut freed_range = FreedRange::interior_pages(offset, size);
    // Extend to the partial page at the start if no neighbour still uses it.
    if offset % PAGE_SIZE != 0 {
        if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) {
            freed_range.start_page_idx -= 1;
        }
    }
    // Likewise for the partial page at the end.
    if range.endpoint() % PAGE_SIZE != 0 {
        let page_after = (range.endpoint() & PAGE_MASK) + PAGE_SIZE;
        if i + 1 == self.ranges.len() || page_after <= self.ranges[i + 1].offset {
            freed_range.end_page_idx += 1;
        }
    }
    self.ranges.remove(i)?;
    Ok(freed_range)
}
/// Transition the range at `offset` from reserved to allocated, attaching
/// the optional `data` to it. Fails with `ENOENT` if there is no reserved
/// range at that offset.
pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
    // This could use a binary search, but linear scans are usually faster for small arrays.
    let range = self
        .ranges
        .iter_mut()
        .find(|r| r.offset == offset)
        .ok_or(ENOENT)?;
    let reservation = match &range.state {
        DescriptorState::Reserved(reservation) => reservation.clone(),
        _ => return Err(ENOENT),
    };
    range.state = DescriptorState::Allocated(reservation.allocate(data));
    Ok(())
}
/// Move the allocated range at `offset` back to the reserved state, handing
/// its size, debug_id, and attached data to the caller.
pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
    // This could use a binary search, but linear scans are usually faster for small arrays.
    let range = self
        .ranges
        .iter_mut()
        .find(|range| range.offset == offset)
        .ok_or(ENOENT)?;
    let DescriptorState::Allocated(allocation) = &mut range.state else {
        return Err(ENOENT);
    };
    let data = allocation.take();
    let debug_id = allocation.reservation.debug_id;
    range.state = DescriptorState::Reserved(allocation.reservation.clone());
    Ok((range.size, debug_id, data))
}
/// Invoke `callback` for every allocated range, moving each range's attached
/// data out (leaving `None` behind).
pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
    for range in &mut self.ranges {
        let DescriptorState::Allocated(allocation) = &mut range.state else {
            continue;
        };
        callback(
            range.offset,
            range.size,
            allocation.reservation.debug_id,
            allocation.data.take(),
        );
    }
}
}
/// A pre-allocated, empty range array, created outside the allocator lock and
/// consumed by `ArrayRangeAllocator::new`.
pub(crate) struct EmptyArrayAlloc<T> {
    ranges: KVec<Range<T>>,
}

impl<T> EmptyArrayAlloc<T> {
    /// Allocate capacity for `capacity` ranges up front; insertions later
    /// never reallocate.
    pub(crate) fn try_new(capacity: usize) -> Result<Self> {
        Ok(Self {
            ranges: KVec::with_capacity(capacity, GFP_KERNEL)?,
        })
    }
}

View File

@@ -0,0 +1,326 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{page::PAGE_SIZE, prelude::*, seq_file::SeqFile, task::Pid};
mod tree;
use self::tree::{FromArrayAllocs, ReserveNewTreeAlloc, TreeRangeAllocator};
mod array;
use self::array::{ArrayRangeAllocator, EmptyArrayAlloc};
/// Lifecycle state of a range: first reserved, then (optionally) allocated
/// with attached data.
enum DescriptorState<T> {
    Reserved(Reservation),
    Allocated(Allocation<T>),
}
impl<T> DescriptorState<T> {
    /// Create a fresh descriptor in the reserved state.
    fn new(is_oneway: bool, debug_id: usize, pid: Pid) -> Self {
        let reservation = Reservation {
            debug_id,
            is_oneway,
            pid,
        };
        DescriptorState::Reserved(reservation)
    }

    /// The pid recorded for this descriptor, regardless of state.
    fn pid(&self) -> Pid {
        match self {
            DescriptorState::Reserved(res) => res.pid,
            DescriptorState::Allocated(alloc) => alloc.reservation.pid,
        }
    }

    /// Whether this descriptor belongs to a oneway transaction.
    fn is_oneway(&self) -> bool {
        match self {
            DescriptorState::Reserved(res) => res.is_oneway,
            DescriptorState::Allocated(alloc) => alloc.reservation.is_oneway,
        }
    }
}
/// Metadata for a range in the reserved state.
#[derive(Clone)]
struct Reservation {
    debug_id: usize,
    is_oneway: bool,
    pid: Pid,
}

impl Reservation {
    /// Promote this reservation to an allocation carrying `data`.
    fn allocate<T>(self, data: Option<T>) -> Allocation<T> {
        Allocation {
            data,
            reservation: self,
        }
    }
}
/// Metadata for a range in the allocated state; keeps the originating
/// reservation plus optional attached data.
struct Allocation<T> {
    reservation: Reservation,
    data: Option<T>,
}

impl<T> Allocation<T> {
    /// Demote back to a reservation, returning any attached data.
    fn deallocate(self) -> (Reservation, Option<T>) {
        (self.reservation, self.data)
    }

    fn debug_id(&self) -> usize {
        self.reservation.debug_id
    }

    /// Move the attached data out, leaving `None`.
    fn take(&mut self) -> Option<T> {
        self.data.take()
    }
}
/// The array implementation must switch to the tree if it wants to go beyond this number of
/// ranges.
const TREE_THRESHOLD: usize = 8;
/// Represents a range of pages that have just become completely free.
#[derive(Copy, Clone)]
pub(crate) struct FreedRange {
    pub(crate) start_page_idx: usize,
    pub(crate) end_page_idx: usize,
}

impl FreedRange {
    /// The pages lying entirely inside `[offset, offset + size)`; partial
    /// pages at either edge are excluded (callers widen the range when the
    /// neighbouring allocation no longer uses them).
    fn interior_pages(offset: usize, size: usize) -> FreedRange {
        FreedRange {
            // Divide round up
            start_page_idx: (offset + (PAGE_SIZE - 1)) / PAGE_SIZE,
            // Divide round down
            end_page_idx: (offset + size) / PAGE_SIZE,
        }
    }
}
/// A single tracked byte range within the mmap.
struct Range<T> {
    offset: usize,
    size: usize,
    state: DescriptorState<T>,
}

impl<T> Range<T> {
    /// One-past-the-end offset of this range.
    fn endpoint(&self) -> usize {
        self.offset + self.size
    }
}
/// Facade over the two allocator implementations; starts empty, upgrades to
/// the array allocator on first use and to the tree allocator when the array
/// fills up.
pub(crate) struct RangeAllocator<T> {
    inner: Impl<T>,
}

enum Impl<T> {
    // The total size of the (not-yet-used) region.
    Empty(usize),
    Array(ArrayRangeAllocator<T>),
    Tree(TreeRangeAllocator<T>),
}
impl<T> RangeAllocator<T> {
/// Create an empty allocator over `size` bytes; no backing storage is
/// allocated until the first reservation.
pub(crate) fn new(size: usize) -> Self {
    Self {
        inner: Impl::Empty(size),
    }
}
/// Bytes still available to oneway transactions.
pub(crate) fn free_oneway_space(&self) -> usize {
    match &self.inner {
        // Matches the array/tree implementations: half the region.
        Impl::Empty(size) => size / 2,
        Impl::Array(array) => array.free_oneway_space(),
        Impl::Tree(tree) => tree.free_oneway_space(),
    }
}

/// Number of tracked ranges.
pub(crate) fn count_buffers(&self) -> usize {
    match &self.inner {
        Impl::Empty(_size) => 0,
        Impl::Array(array) => array.count_buffers(),
        Impl::Tree(tree) => tree.count_buffers(),
    }
}

/// Dump the allocator state to the given seq_file, for debugfs.
pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
    match &self.inner {
        Impl::Empty(_size) => Ok(()),
        Impl::Array(array) => array.debug_print(m),
        Impl::Tree(tree) => tree.debug_print(m),
    }
}
/// Try to reserve a new buffer, using the provided allocation if necessary.
///
/// This is the allocate-outside-the-lock protocol: when the current
/// representation needs memory (to create the array, or to grow into the
/// tree) and `args` does not already carry it, we return
/// `ReserveNew::NeedAlloc` instead of allocating here. The caller performs
/// the allocation via `make_alloc` without holding its lock, then retries.
pub(crate) fn reserve_new(&mut self, mut args: ReserveNewArgs<T>) -> Result<ReserveNew<T>> {
    match &mut self.inner {
        Impl::Empty(size) => {
            let empty_array = match args.empty_array_alloc.take() {
                Some(empty_array) => ArrayRangeAllocator::new(*size, empty_array),
                None => {
                    return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                        args,
                        need_empty_array_alloc: true,
                        need_new_tree_alloc: false,
                        need_tree_alloc: false,
                    }))
                }
            };
            self.inner = Impl::Array(empty_array);
            // Retry now that the array representation exists.
            self.reserve_new(args)
        }
        Impl::Array(array) if array.is_full() => {
            let allocs = match args.new_tree_alloc {
                Some(ref mut allocs) => allocs,
                None => {
                    return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                        args,
                        need_empty_array_alloc: false,
                        need_new_tree_alloc: true,
                        need_tree_alloc: true,
                    }))
                }
            };
            // Migrate the full array into the tree representation, then retry.
            let new_tree =
                TreeRangeAllocator::from_array(array.total_size(), &mut array.ranges, allocs);
            self.inner = Impl::Tree(new_tree);
            self.reserve_new(args)
        }
        Impl::Array(array) => {
            let offset =
                array.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid)?;
            Ok(ReserveNew::Success(ReserveNewSuccess {
                offset,
                oneway_spam_detected: false,
                // Unused allocations are handed back so the caller can free
                // them outside the lock.
                _empty_array_alloc: args.empty_array_alloc,
                _new_tree_alloc: args.new_tree_alloc,
                _tree_alloc: args.tree_alloc,
            }))
        }
        Impl::Tree(tree) => {
            let alloc = match args.tree_alloc {
                Some(alloc) => alloc,
                None => {
                    return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                        args,
                        need_empty_array_alloc: false,
                        need_new_tree_alloc: false,
                        need_tree_alloc: true,
                    }));
                }
            };
            let (offset, oneway_spam_detected) =
                tree.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid, alloc)?;
            Ok(ReserveNew::Success(ReserveNewSuccess {
                offset,
                oneway_spam_detected,
                _empty_array_alloc: args.empty_array_alloc,
                _new_tree_alloc: args.new_tree_alloc,
                // The tree alloc was consumed by `tree.reserve_new`.
                _tree_alloc: None,
            }))
        }
    }
}
/// Deletes the allocations at `offset`.
pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
    match &mut self.inner {
        Impl::Empty(_size) => Err(EINVAL),
        Impl::Array(array) => array.reservation_abort(offset),
        Impl::Tree(tree) => {
            let freed_range = tree.reservation_abort(offset)?;
            // Collapse back to the cheap empty representation when the last
            // range goes away.
            if tree.is_empty() {
                self.inner = Impl::Empty(tree.total_size());
            }
            Ok(freed_range)
        }
    }
}
/// Called when an allocation is no longer in use by the kernel.
pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
    match &mut self.inner {
        Impl::Empty(_size) => Err(EINVAL),
        Impl::Array(array) => array.reservation_commit(offset, data),
        Impl::Tree(tree) => tree.reservation_commit(offset, data),
    }
}

/// Called when the kernel starts using an allocation.
///
/// Returns the size of the existing entry and the data associated with it.
pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
    match &mut self.inner {
        Impl::Empty(_size) => Err(EINVAL),
        Impl::Array(array) => array.reserve_existing(offset),
        Impl::Tree(tree) => tree.reserve_existing(offset),
    }
}

/// Call the provided callback at every allocated region.
///
/// This destroys the range allocator. Used only during shutdown.
pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
    match &mut self.inner {
        Impl::Empty(_size) => {}
        Impl::Array(array) => array.take_for_each(callback),
        Impl::Tree(tree) => tree.take_for_each(callback),
    }
}
}
/// The arguments for `reserve_new`.
///
/// The three optional allocations are filled in by `make_alloc` when a
/// previous call returned `ReserveNew::NeedAlloc`.
#[derive(Default)]
pub(crate) struct ReserveNewArgs<T> {
    pub(crate) size: usize,
    pub(crate) is_oneway: bool,
    pub(crate) debug_id: usize,
    pub(crate) pid: Pid,
    pub(crate) empty_array_alloc: Option<EmptyArrayAlloc<T>>,
    pub(crate) new_tree_alloc: Option<FromArrayAllocs<T>>,
    pub(crate) tree_alloc: Option<ReserveNewTreeAlloc<T>>,
}
/// The return type of `reserve_new`.
pub(crate) enum ReserveNew<T> {
    Success(ReserveNewSuccess<T>),
    NeedAlloc(ReserveNewNeedAlloc<T>),
}

/// Returned by `reserve_new` when the reservation was successful.
pub(crate) struct ReserveNewSuccess<T> {
    pub(crate) offset: usize,
    pub(crate) oneway_spam_detected: bool,
    // If the user supplied an allocation that we did not end up using, then we return it here.
    // The caller will kfree it outside of the lock.
    _empty_array_alloc: Option<EmptyArrayAlloc<T>>,
    _new_tree_alloc: Option<FromArrayAllocs<T>>,
    _tree_alloc: Option<ReserveNewTreeAlloc<T>>,
}
/// Returned by `reserve_new` to request the caller to make an allocation before calling the method
/// again.
pub(crate) struct ReserveNewNeedAlloc<T> {
    args: ReserveNewArgs<T>,
    // Which of the three optional allocations in `args` must be populated.
    need_empty_array_alloc: bool,
    need_new_tree_alloc: bool,
    need_tree_alloc: bool,
}
impl<T> ReserveNewNeedAlloc<T> {
    /// Performs the allocations that `reserve_new` requested and returns the
    /// argument struct to retry the call with.
    ///
    /// Any allocation the caller already supplied is kept rather than
    /// replaced.
    pub(crate) fn make_alloc(mut self) -> Result<ReserveNewArgs<T>> {
        let args = &mut self.args;
        if args.empty_array_alloc.is_none() && self.need_empty_array_alloc {
            args.empty_array_alloc = Some(EmptyArrayAlloc::try_new(TREE_THRESHOLD)?);
        }
        if args.new_tree_alloc.is_none() && self.need_new_tree_alloc {
            args.new_tree_alloc = Some(FromArrayAllocs::try_new(TREE_THRESHOLD)?);
        }
        if args.tree_alloc.is_none() && self.need_tree_alloc {
            args.tree_alloc = Some(ReserveNewTreeAlloc::try_new()?);
        }
        Ok(self.args)
    }
}

View File

@@ -0,0 +1,500 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
page::PAGE_SIZE,
prelude::*,
rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
seq_file::SeqFile,
seq_print,
task::Pid,
};
use crate::range_alloc::{DescriptorState, FreedRange, Range};
/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(super) struct TreeRangeAllocator<T> {
    /// This collection contains descriptors for *both* ranges containing an allocation, *and* free
    /// ranges between allocations. The free ranges get merged, so there are never two free ranges
    /// next to each other.
    tree: RBTree<usize, Descriptor<T>>,
    /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size,
    /// letting us look up the smallest range whose size is at least some lower bound.
    free_tree: RBTree<FreeKey, ()>,
    /// Total size of the mmap region being managed.
    size: usize,
    /// Space still available to oneway transactions (starts at `size / 2`).
    free_oneway_space: usize,
}
impl<T> TreeRangeAllocator<T> {
    /// Builds a tree allocator holding the same ranges as an array allocator.
    ///
    /// `ranges` is drained in order; `alloc` must hold enough preallocated
    /// tree nodes (see `FromArrayAllocs::try_new`). Gaps between consecutive
    /// ranges become free descriptors, so the "no two adjacent free ranges"
    /// invariant holds from the start.
    pub(crate) fn from_array(
        size: usize,
        ranges: &mut KVec<Range<T>>,
        alloc: &mut FromArrayAllocs<T>,
    ) -> Self {
        let mut tree = TreeRangeAllocator {
            tree: RBTree::new(),
            free_tree: RBTree::new(),
            size,
            // Oneway transactions may use at most half of the buffer.
            free_oneway_space: size / 2,
        };
        let mut free_offset = 0;
        for range in ranges.drain_all() {
            // Insert a free descriptor for the gap before this range, if any.
            let free_size = range.offset - free_offset;
            if free_size > 0 {
                let free_node = alloc.free_tree.pop().unwrap();
                tree.free_tree
                    .insert(free_node.into_node((free_size, free_offset), ()));
                let tree_node = alloc.tree.pop().unwrap();
                tree.tree.insert(
                    tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)),
                );
            }
            free_offset = range.endpoint();
            if range.state.is_oneway() {
                tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size);
            }
            // The spare free-tree reservation travels with the descriptor so
            // that freeing the range later cannot fail with ENOMEM.
            let free_res = alloc.free_tree.pop().unwrap();
            let tree_node = alloc.tree.pop().unwrap();
            let mut desc = Descriptor::new(range.offset, range.size);
            desc.state = Some((range.state, free_res));
            tree.tree.insert(tree_node.into_node(range.offset, desc));
        }
        // After the last range, we may need a free range.
        if free_offset < size {
            let free_size = size - free_offset;
            let free_node = alloc.free_tree.pop().unwrap();
            tree.free_tree
                .insert(free_node.into_node((free_size, free_offset), ()));
            let tree_node = alloc.tree.pop().unwrap();
            tree.tree
                .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)));
        }
        tree
    }
    /// Returns `true` if there are no allocated ranges.
    pub(crate) fn is_empty(&self) -> bool {
        let mut tree_iter = self.tree.values();
        // There's always at least one range, because index zero is either the start of a free or
        // allocated range.
        let first_value = tree_iter.next().unwrap();
        if tree_iter.next().is_some() {
            // There are never two free ranges next to each other, so if there is more than one
            // descriptor, then at least one of them must hold an allocated range.
            return false;
        }
        // There is only one descriptor. Return true if it is for a free range.
        first_value.state.is_none()
    }
    /// Total size of the underlying mmap region.
    pub(crate) fn total_size(&self) -> usize {
        self.size
    }
    /// Space still available to oneway transactions.
    pub(crate) fn free_oneway_space(&self) -> usize {
        self.free_oneway_space
    }
    /// Number of allocated (non-free) ranges.
    pub(crate) fn count_buffers(&self) -> usize {
        self.tree
            .values()
            .filter(|desc| desc.state.is_some())
            .count()
    }
    /// Prints one line per allocated buffer to the seq file; free ranges are
    /// skipped.
    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => &state.0,
                None => continue,
            };
            seq_print!(
                m,
                " buffer: {} size {} pid {}",
                desc.offset,
                desc.size,
                state.pid(),
            );
            if state.is_oneway() {
                seq_print!(m, " oneway");
            }
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }
    /// Best-fit lookup: finds the smallest free range of at least `size`
    /// bytes and returns its descriptor in the main tree.
    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        // The smallest free-tree key `>= (size, 0)` is the best fit.
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), _) = free_cursor.current();
        self.tree.get_mut(offset)
    }
    /// Try to reserve a new buffer, using the provided allocation if necessary.
    ///
    /// On success returns the offset of the reservation and whether the
    /// caller was flagged as a oneway spammer.
    pub(crate) fn reserve_new(
        &mut self,
        debug_id: usize,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewTreeAlloc<T>,
    ) -> Result<(usize, bool)> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };
        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
        let oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);
        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;
                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);
                desc.state = Some((
                    DescriptorState::new(is_oneway, debug_id, pid),
                    desc_node_res,
                ));
                desc.size = size;
                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        self.free_tree.remove(&(found_size, found_off));
        // If the free range was larger than requested, insert the remainder
        // as a new free range after the reservation.
        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }
        Ok((found_off, oneway_spam_detected))
    }
    /// Frees a *reserved* range at `offset`, merging it with adjacent free
    /// ranges, and returns which whole pages became free.
    ///
    /// Fails with `EINVAL` when there is no entry starting at `offset` (or
    /// the entry is already free) and `EPERM` when the entry is allocated.
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;
        let (_, desc) = cursor.current_mut();
        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }
        let (reservation, free_node_res) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => {
                (None, Ok((reservation, free_node_res)))
            }
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;
        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };
        self.free_oneway_space += free_oneway_space_add;
        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };
        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };
        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }
        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };
        // Use the reservation carried by the descriptor, so this insert
        // cannot fail due to allocation.
        self.free_tree
            .insert(free_node_res.into_node((size, offset), ()));
        Ok(freed_range)
    }
    /// Transitions the entry at `offset` from reserved to allocated, storing
    /// `data` in it.
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reservation_commit - offset: {}",
                offset
            );
            ENOENT
        })?;
        desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => (
                Some((
                    DescriptorState::Allocated(reservation.allocate(data)),
                    free_node_res,
                )),
                Ok(()),
            ),
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reservation_commit - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })
    }
    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;
        let (debug_id, data) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Allocated(allocation), free_node_res)) => {
                let (reservation, data) = allocation.deallocate();
                let debug_id = reservation.debug_id;
                (
                    Some((DescriptorState::Reserved(reservation), free_node_res)),
                    Ok((debug_id, data)),
                )
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;
        Ok((desc.size, debug_id, data))
    }
    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state {
                callback(
                    desc.offset,
                    desc.size,
                    allocation.debug_id(),
                    allocation.take(),
                );
            }
        }
    }
    /// Find the amount and size of buffers allocated by the current caller.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act. This is more efficient
    /// than keeping a map per pid.
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some((state, _)) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }
        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}
/// State stored in a non-free `Descriptor`: the descriptor state plus a spare
/// free-tree node reservation, used when the range is freed again.
type TreeDescriptorState<T> = (DescriptorState<T>, FreeNodeRes);
/// A node in the main tree: one contiguous range of the mmap, either free
/// (`state == None`) or in use (`state == Some(..)`).
struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<TreeDescriptorState<T>>,
}
impl<T> Descriptor<T> {
fn new(offset: usize, size: usize) -> Self {
Self {
size,
offset,
state: None,
}
}
fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
where
F: FnOnce(Option<TreeDescriptorState<T>>) -> (Option<TreeDescriptorState<T>>, Result<Data>),
{
let (new_state, result) = f(self.state.take());
self.state = new_state;
result
}
}
// (Descriptor.size, Descriptor.offset)
type FreeKey = (usize, usize);
/// Reservation for a node in the free tree.
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;
/// An allocation for use by `reserve_new`.
pub(crate) struct ReserveNewTreeAlloc<T> {
    // Node used to insert the split-off remainder into the main tree.
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    // Node used to insert the split-off remainder into the free tree.
    free_tree_node_res: FreeNodeRes,
    // Spare reservation stored in the new descriptor for when it is freed.
    desc_node_res: FreeNodeRes,
}
impl<T> ReserveNewTreeAlloc<T> {
    /// Allocates the three tree-node reservations needed by `reserve_new`.
    pub(crate) fn try_new() -> Result<Self> {
        Ok(Self {
            tree_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
            free_tree_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
            desc_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
        })
    }

    /// Turns the reservations into nodes keyed for `desc`.
    ///
    /// Returns the main-tree node, the free-tree node, and the spare
    /// reservation that is stored inside the new descriptor.
    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let (range_size, range_offset) = (desc.size, desc.offset);
        let main_node = self.tree_node_res.into_node(range_offset, desc);
        let free_node = self
            .free_tree_node_res
            .into_node((range_size, range_offset), ());
        (main_node, free_node, self.desc_node_res)
    }
}
/// An allocation for creating a tree from an `ArrayRangeAllocator`.
pub(crate) struct FromArrayAllocs<T> {
    // Reservations for nodes in the main tree, one per descriptor.
    tree: KVec<RBTreeNodeReservation<usize, Descriptor<T>>>,
    // Reservations for nodes in the free tree, one per descriptor.
    free_tree: KVec<RBTreeNodeReservation<FreeKey, ()>>,
}
impl<T> FromArrayAllocs<T> {
    /// Preallocates every tree node needed to convert an array allocator with
    /// `len` ranges into a tree allocator.
    ///
    /// `2 * len + 1` reservations per tree cover each range plus every
    /// possible free gap around them.
    pub(crate) fn try_new(len: usize) -> Result<Self> {
        let count = 2 * len + 1;
        let mut tree = KVec::with_capacity(count, GFP_KERNEL)?;
        for _ in 0..count {
            tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }
        let mut free_tree = KVec::with_capacity(count, GFP_KERNEL)?;
        for _ in 0..count {
            free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }
        Ok(Self { tree, free_tree })
    }
}

View File

@@ -0,0 +1,142 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RUST_BINDER_H
#define _LINUX_RUST_BINDER_H
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
/*
* These symbols are exposed by `rust_binderfs.c` and exist here so that Rust
* Binder can call them.
*/
int init_rust_binderfs(void);
struct dentry;
struct inode;
struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid);
void rust_binderfs_remove_file(struct dentry *dentry);
typedef void *rust_binder_context;
/**
 * struct binder_device - information about a binder device node
 * @minor: the minor number used by this device
 * @ctx: the Rust Context used by this device, or null for binder-control
 *
 * This is used as the private data for files directly in binderfs, but not
 * files in the binder_logs subdirectory. This struct owns a refcount on `ctx`
 * and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is
 * null.
 *
 * Must stay layout-compatible with the Rust-side `binderfs::binder_device`
 * struct (repr(C)).
 */
struct binder_device {
	int minor;
	rust_binder_context ctx;
};
/*
* The internal data types in the Rust Binder driver are opaque to C, so we use
* void pointer typedefs for these types.
*/
typedef void *rust_binder_transaction;
typedef void *rust_binder_thread;
typedef void *rust_binder_process;
typedef void *rust_binder_node;
typedef void *rust_binder_ref_data;
/*
 * Byte offsets of fields inside the opaque Rust objects above. The Rust
 * driver exports one `RUST_BINDER_LAYOUT` instance of these structs so that
 * the inline accessors below can read fields without knowing the Rust type
 * layouts at compile time.
 */
struct rb_transaction_layout {
	size_t debug_id;
	size_t code;
	size_t flags;
	size_t from_thread;
	size_t to_proc;
	size_t target_node;
};
struct rb_thread_layout {
	/*
	 * Offset applied to stored object pointers by the accessors below.
	 * NOTE(review): presumably adjusts for the Arc header on the Rust
	 * side — confirm against the Rust layout definitions.
	 */
	size_t arc_offset;
	size_t process;
	size_t id;
};
struct rb_process_layout {
	size_t arc_offset;
	size_t task;
};
struct rb_node_layout {
	size_t arc_offset;
	size_t debug_id;
	size_t ptr;
};
struct rust_binder_layout {
	struct rb_transaction_layout t;
	struct rb_thread_layout th;
	struct rb_process_layout p;
	struct rb_node_layout n;
};
/* Defined with #[no_mangle] in the Rust driver. */
extern const struct rust_binder_layout RUST_BINDER_LAYOUT;
/*
 * Reads the transaction's debug_id via the exported layout table. The void
 * pointer arithmetic relies on the GNU extension (sizeof(void) == 1) used
 * throughout the kernel; this applies to all accessors below.
 */
static inline size_t rust_binder_transaction_debug_id(rust_binder_transaction t)
{
	return * (size_t *) (t + RUST_BINDER_LAYOUT.t.debug_id);
}
/* Reads the transaction's code field (stored as a u32 on the Rust side). */
static inline u32 rust_binder_transaction_code(rust_binder_transaction t)
{
	return * (u32 *) (t + RUST_BINDER_LAYOUT.t.code);
}
/* Reads the transaction's flags field (stored as a u32 on the Rust side). */
static inline u32 rust_binder_transaction_flags(rust_binder_transaction t)
{
	return * (u32 *) (t + RUST_BINDER_LAYOUT.t.flags);
}
/*
 * Returns the transaction's target node, or NULL when it has none. A
 * non-NULL pointer is adjusted by the node arc_offset so it matches the
 * other rust_binder_node values.
 */
static inline rust_binder_node rust_binder_transaction_target_node(rust_binder_transaction t)
{
	void *p = * (void **) (t + RUST_BINDER_LAYOUT.t.target_node);
	if (p)
		p = p + RUST_BINDER_LAYOUT.n.arc_offset;
	return p;
}
/*
 * Returns the sending thread. Unlike target_node there is no NULL check —
 * NOTE(review): assumes the Rust side guarantees this field is non-null;
 * confirm.
 */
static inline rust_binder_thread rust_binder_transaction_from_thread(rust_binder_transaction t)
{
	void *p = * (void **) (t + RUST_BINDER_LAYOUT.t.from_thread);
	return p + RUST_BINDER_LAYOUT.th.arc_offset;
}
/* Returns the destination process of the transaction (assumed non-null). */
static inline rust_binder_process rust_binder_transaction_to_proc(rust_binder_transaction t)
{
	void *p = * (void **) (t + RUST_BINDER_LAYOUT.t.to_proc);
	return p + RUST_BINDER_LAYOUT.p.arc_offset;
}
/* Returns the process that owns the given thread. */
static inline rust_binder_process rust_binder_thread_proc(rust_binder_thread t)
{
	void *p = * (void **) (t + RUST_BINDER_LAYOUT.th.process);
	return p + RUST_BINDER_LAYOUT.p.arc_offset;
}
/* Reads the thread's id field (stored as an s32 on the Rust side). */
static inline s32 rust_binder_thread_id(rust_binder_thread t)
{
	return * (s32 *) (t + RUST_BINDER_LAYOUT.th.id);
}
/* Returns the task_struct pointer stored in the Rust process object. */
static inline struct task_struct *rust_binder_process_task(rust_binder_process t)
{
	return * (struct task_struct **) (t + RUST_BINDER_LAYOUT.p.task);
}
/* Reads the node's debug_id field. */
static inline size_t rust_binder_node_debug_id(rust_binder_node t)
{
	return * (size_t *) (t + RUST_BINDER_LAYOUT.n.debug_id);
}
/* Reads the node's ptr field (a binder_uintptr_t userspace cookie value). */
static inline binder_uintptr_t rust_binder_node_ptr(rust_binder_node t)
{
	return * (binder_uintptr_t *) (t + RUST_BINDER_LAYOUT.n.ptr);
}
#endif

View File

@@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Binder -- the Android IPC mechanism.
#![recursion_limit = "256"]
use kernel::{
bindings::{self, seq_file},
fs::File,
list::{HasListLinks, ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc},
prelude::*,
seq_file::SeqFile,
seq_print,
sync::poll::PollTable,
sync::Arc,
task::Pid,
types::{AsBytes, ForeignOwnable},
uaccess::UserSliceWriter,
};
use crate::{context::Context, page_range::Shrinker, process::Process, thread::Thread};
use core::{
ptr::NonNull,
sync::atomic::{AtomicBool, AtomicUsize, Ordering},
};
mod allocation;
mod context;
mod deferred_close;
mod defs;
mod error;
mod node;
mod page_range;
mod prio;
mod process;
mod range_alloc;
mod stats;
mod thread;
mod trace;
mod transaction;
#[allow(warnings)] // generated bindgen code
mod binderfs {
    use kernel::bindings::{dentry, inode};
    // Symbols exported by the C side (`rust_binderfs.c`) for Rust to call.
    extern "C" {
        pub fn init_rust_binderfs() -> kernel::ffi::c_int;
    }
    extern "C" {
        pub fn rust_binderfs_create_proc_file(
            nodp: *mut inode,
            pid: kernel::ffi::c_int,
        ) -> *mut dentry;
    }
    extern "C" {
        pub fn rust_binderfs_remove_file(dentry: *mut dentry);
    }
    pub type rust_binder_context = *mut kernel::ffi::c_void;
    // Mirrors `struct binder_device` in rust_binder.h; must stay in sync.
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct binder_device {
        pub minor: kernel::ffi::c_int,
        pub ctx: rust_binder_context,
    }
    impl Default for binder_device {
        fn default() -> Self {
            // Zero-initialize, matching C's implicit zero default.
            let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
            unsafe {
                ::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
                s.assume_init()
            }
        }
    }
}
module! {
    type: BinderModule,
    name: "rust_binder",
    author: "Wedson Almeida Filho, Alice Ryhl",
    description: "Android Binder",
    license: "GPL",
}
use kernel::bindings::rust_binder_layout;
// Field-offset table consumed by the C accessors in rust_binder.h; the
// `#[no_mangle]` name must match the `extern` declaration on the C side.
#[no_mangle]
static RUST_BINDER_LAYOUT: rust_binder_layout = rust_binder_layout {
    t: transaction::TRANSACTION_LAYOUT,
    th: thread::THREAD_LAYOUT,
    p: process::PROCESS_LAYOUT,
    n: node::NODE_LAYOUT,
};
/// Returns a fresh, driver-unique debug id.
///
/// Ids start at zero and increase by one per call; the counter uses a
/// relaxed atomic, so ids are unique but carry no ordering guarantees.
fn next_debug_id() -> usize {
    static COUNTER: AtomicUsize = AtomicUsize::new(0);
    COUNTER.fetch_add(1, Ordering::Relaxed)
}
/// Provides a single place to write Binder return values via the
/// supplied `UserSliceWriter`.
pub(crate) struct BinderReturnWriter<'a> {
    // Destination buffer in userspace (the read half of BINDER_WRITE_READ).
    writer: UserSliceWriter,
    // Thread performing the read; used for per-process BR statistics.
    thread: &'a Thread,
}
impl<'a> BinderReturnWriter<'a> {
    /// Wraps `writer` so return codes are counted against `thread`'s process.
    fn new(writer: UserSliceWriter, thread: &'a Thread) -> Self {
        BinderReturnWriter { writer, thread }
    }
    /// Write a return code back to user space.
    /// Should be a `BR_` constant from [`defs`] e.g. [`defs::BR_TRANSACTION_COMPLETE`].
    ///
    /// Also emits a trace event and bumps the global and per-process `BR_`
    /// counters before writing.
    fn write_code(&mut self, code: u32) -> Result {
        crate::trace::trace_return(code);
        stats::GLOBAL_STATS.inc_br(code);
        self.thread.process.stats.inc_br(code);
        self.writer.write(&code)
    }
    /// Write something *other than* a return code to user space.
    fn write_payload<T: AsBytes>(&mut self, payload: &T) -> Result {
        self.writer.write(payload)
    }
    // Forwards to `UserSliceWriter::len` — presumably the number of bytes
    // remaining in the userspace buffer; confirm against that API.
    fn len(&self) -> usize {
        self.writer.len()
    }
}
/// Specifies how a type should be delivered to the read part of a BINDER_WRITE_READ ioctl.
///
/// When a value is pushed to the todo list for a process or thread, it is stored as a trait object
/// with the type `Arc<dyn DeliverToRead>`. Trait objects are a Rust feature that lets you
/// implement dynamic dispatch over many different types. This lets us store many different types
/// in the todo list.
trait DeliverToRead: ListArcSafe + Send + Sync {
    /// Performs work. Returns true if remaining work items in the queue should be processed
    /// immediately, or false if it should return to caller before processing additional work
    /// items.
    fn do_work(
        self: DArc<Self>,
        thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool>;
    /// Cancels the given work item. This is called instead of [`DeliverToRead::do_work`] when work
    /// won't be delivered.
    fn cancel(self: DArc<Self>);
    /// Called when a work item is delivered directly to a specific thread, rather than to the
    /// process work list.
    fn on_thread_selected(&self, _thread: &thread::Thread);
    /// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this
    /// work item?
    ///
    /// Generally only set to true for non-oneway transactions.
    fn should_sync_wakeup(&self) -> bool;
    /// Writes a description of this work item to the seq file, using the
    /// given line prefixes.
    fn debug_print(&self, m: &SeqFile, prefix: &str, transaction_prefix: &str) -> Result<()>;
}
// Wrapper around a `DeliverToRead` with linked list links.
#[pin_data]
struct DTRWrap<T: ?Sized> {
    #[pin]
    links: ListLinksSelfPtr<DTRWrap<dyn DeliverToRead>>,
    #[pin]
    wrapped: T,
}
// Let the linked-list machinery locate the links field inside the wrapper.
kernel::list::impl_has_list_links_self_ptr! {
    impl HasSelfPtr<DTRWrap<dyn DeliverToRead>> for DTRWrap<dyn DeliverToRead> { self.links }
}
// List-arc membership is tracked via the wrapped value.
kernel::list::impl_list_arc_safe! {
    impl{T: ListArcSafe + ?Sized} ListArcSafe<0> for DTRWrap<T> {
        tracked_by wrapped: T;
    }
}
kernel::list::impl_list_item! {
    impl ListItem<0> for DTRWrap<dyn DeliverToRead> {
        using ListLinksSelfPtr;
    }
}
// Transparent access to the wrapped work item.
impl<T: ?Sized> core::ops::Deref for DTRWrap<T> {
    type Target = T;
    fn deref(&self) -> &T {
        &self.wrapped
    }
}
/// A ref-counted work item.
type DArc<T> = kernel::sync::Arc<DTRWrap<T>>;
/// A ref-counted work item that may be inserted into a list.
type DLArc<T> = kernel::list::ListArc<DTRWrap<T>>;
impl<T: ListArcSafe> DTRWrap<T> {
    /// Returns an initializer that wraps `val` together with fresh list links.
    fn new(val: impl PinInit<T>) -> impl PinInit<Self> {
        pin_init!(Self {
            links <- ListLinksSelfPtr::new(),
            wrapped <- val,
        })
    }
    /// Allocates a wrapped work item from a plain value.
    fn arc_try_new(val: T) -> Result<DLArc<T>, kernel::alloc::AllocError> {
        ListArc::pin_init(
            try_pin_init!(Self {
                links <- ListLinksSelfPtr::new(),
                wrapped: val,
            }),
            GFP_KERNEL,
        )
        // Collapse the richer init error into a plain allocation error.
        .map_err(|_| kernel::alloc::AllocError)
    }
    /// Allocates a wrapped work item from a pinned initializer.
    fn arc_pin_init(init: impl PinInit<T>) -> Result<DLArc<T>, kernel::error::Error> {
        ListArc::pin_init(
            try_pin_init!(Self {
                links <- ListLinksSelfPtr::new(),
                wrapped <- init,
            }),
            GFP_KERNEL,
        )
    }
}
/// A work item that delivers a single `BR_` return code to userspace.
struct DeliverCode {
    code: u32,
    // When set, `do_work` becomes a no-op; see `skip`.
    skip: AtomicBool,
}
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for DeliverCode { untracked; }
}
impl DeliverCode {
    fn new(code: u32) -> Self {
        Self {
            code,
            skip: AtomicBool::new(false),
        }
    }
    /// Disable this DeliverCode and make it do nothing.
    ///
    /// This is used instead of removing it from the work list, since `LinkedList::remove` is
    /// unsafe, whereas this method is not.
    fn skip(&self) {
        self.skip.store(true, Ordering::Relaxed);
    }
}
impl DeliverToRead for DeliverCode {
    /// Writes the stored code unless this item was skipped; always lets the
    /// caller continue with further work items.
    fn do_work(
        self: DArc<Self>,
        _thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool> {
        if !self.skip.load(Ordering::Relaxed) {
            writer.write_code(self.code)?;
        }
        Ok(true)
    }
    // Nothing to release when a code delivery is cancelled.
    fn cancel(self: DArc<Self>) {}
    fn on_thread_selected(&self, _thread: &Thread) {}
    fn should_sync_wakeup(&self) -> bool {
        false
    }
    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
        seq_print!(m, "{}", prefix);
        if self.skip.load(Ordering::Relaxed) {
            seq_print!(m, "(skipped) ");
        }
        if self.code == defs::BR_TRANSACTION_COMPLETE {
            seq_print!(m, "transaction complete\n");
        } else {
            seq_print!(m, "transaction error: {}\n", self.code);
        }
        Ok(())
    }
}
/// Rounds `value` up to the next multiple of the native pointer size.
const fn ptr_align(value: usize) -> usize {
    let mask = core::mem::size_of::<usize>() - 1;
    (value + mask) & !mask
}
// SAFETY: We call register in `init`.
static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() };
/// Module type for `module!`; driver state lives in globals, so it is empty.
struct BinderModule {}
impl kernel::Module for BinderModule {
    /// Initializes Rust Binder: takes over from the C driver (when enabled),
    /// registers the shrinker, and mounts binderfs support.
    fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
        // SAFETY: The module initializer never runs twice, so we only call this once.
        unsafe { crate::context::CONTEXTS.init() };
        // SAFETY: This just accesses global booleans.
        unsafe {
            extern "C" {
                static mut binder_use_rust: i32;
                fn unload_binder() -> i32;
            }
            // If the C driver stays in charge, load as an inert module
            // instead of failing module init.
            if binder_use_rust == 0 {
                return Ok(Self {});
            }
            if unload_binder() != 0 {
                pr_err!("Failed to unload C Binder.");
                return Ok(Self {});
            }
        }
        pr_warn!("Loaded Rust Binder.");
        BINDER_SHRINKER.register(kernel::c_str!("android-binder"))?;
        // SAFETY: The module is being loaded, so we can initialize binderfs.
        unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? };
        Ok(Self {})
    }
}
/// Makes the inner type Sync.
#[repr(transparent)]
pub struct AssertSync<T>(T);
// SAFETY: Used only to insert `file_operations` into a global, which is safe.
unsafe impl<T> Sync for AssertSync<T> {}
/// File operations that rust_binderfs.c can use.
///
/// Only the callbacks listed below are implemented; everything else stays
/// zeroed (i.e. not provided).
#[no_mangle]
#[used]
pub static rust_binder_fops: AssertSync<kernel::bindings::file_operations> = {
    // SAFETY: All zeroes is safe for the `file_operations` type.
    let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() };
    let ops = kernel::bindings::file_operations {
        owner: THIS_MODULE.as_ptr(),
        poll: Some(rust_binder_poll),
        unlocked_ioctl: Some(rust_binder_unlocked_ioctl),
        compat_ioctl: Some(rust_binder_compat_ioctl),
        mmap: Some(rust_binder_mmap),
        open: Some(rust_binder_open),
        release: Some(rust_binder_release),
        flush: Some(rust_binder_flush),
        ..zeroed_ops
    };
    AssertSync(ops)
};
/// Exported to C: creates a new binder context named `name`.
///
/// Returns an owned `Arc<Context>` as an opaque pointer, or NULL on failure.
#[no_mangle]
unsafe extern "C" fn rust_binder_new_context(
    name: *const kernel::ffi::c_char,
) -> *mut kernel::ffi::c_void {
    // SAFETY: The caller will always provide a valid c string here.
    let name = unsafe { kernel::str::CStr::from_char_ptr(name) };
    match Context::new(name) {
        Ok(ctx) => Arc::into_foreign(ctx).cast_mut(),
        Err(_err) => core::ptr::null_mut(),
    }
}
/// Exported to C: destroys a context created by `rust_binder_new_context`.
/// A NULL `device` is a no-op.
#[no_mangle]
unsafe extern "C" fn rust_binder_remove_context(device: *mut kernel::ffi::c_void) {
    if !device.is_null() {
        // SAFETY: The caller ensures that the `device` pointer came from a previous call to
        // `rust_binder_new_device`.
        let ctx = unsafe { Arc::<Context>::from_foreign(device) };
        ctx.deregister();
        drop(ctx);
    }
}
/// `open` fop: creates a `Process` for this file and stashes it in
/// `private_data`; also creates the per-pid binderfs log file when possible.
unsafe extern "C" fn rust_binder_open(
    inode: *mut bindings::inode,
    file_ptr: *mut bindings::file,
) -> kernel::ffi::c_int {
    // SAFETY: The `rust_binderfs.c` file ensures that `i_private` is set to a
    // `struct binder_device`.
    let device = unsafe { (*inode).i_private } as *const binderfs::binder_device;
    assert!(!device.is_null());
    // SAFETY: The `rust_binderfs.c` file ensures that `device->ctx` holds a binder context when
    // using the rust binder fops.
    let ctx = unsafe { Arc::<Context>::borrow((*device).ctx) };
    // SAFETY: The caller provides a valid file pointer to a new `struct file`.
    let file = unsafe { File::from_raw_file(file_ptr) };
    let process = match Process::open(ctx, file) {
        Ok(process) => process,
        Err(err) => return err.to_errno(),
    };
    // SAFETY: This is an `inode` for a newly created binder file.
    match unsafe { BinderfsProcFile::new(inode, process.task.pid()) } {
        Ok(Some(file)) => process.inner.lock().binderfs_file = Some(file),
        Ok(None) => { /* pid already exists */ }
        Err(err) => return err.to_errno(),
    }
    // SAFETY: This file is associated with Rust binder, so we own the `private_data` field.
    unsafe { (*file_ptr).private_data = process.into_foreign().cast_mut() };
    0
}
/// `release` fop: reclaims the `Process` refcount stored in `private_data`
/// and runs process teardown.
unsafe extern "C" fn rust_binder_release(
    _inode: *mut bindings::inode,
    file: *mut bindings::file,
) -> kernel::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let process = unsafe { Arc::<Process>::from_foreign((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    let file = unsafe { File::from_raw_file(file) };
    Process::release(process, file);
    0
}
/// `compat_ioctl` fop: forwards to `Process::compat_ioctl`, converting
/// errors to negative errno values.
unsafe extern "C" fn rust_binder_compat_ioctl(
    file: *mut bindings::file,
    cmd: kernel::ffi::c_uint,
    arg: kernel::ffi::c_ulong,
) -> kernel::ffi::c_long {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::compat_ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) {
        Ok(()) => 0,
        Err(err) => err.to_errno() as isize,
    }
}
/// `unlocked_ioctl` fop: forwards to `Process::ioctl`, converting errors to
/// negative errno values.
unsafe extern "C" fn rust_binder_unlocked_ioctl(
    file: *mut bindings::file,
    cmd: kernel::ffi::c_uint,
    arg: kernel::ffi::c_ulong,
) -> kernel::ffi::c_long {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) {
        Ok(()) => 0,
        Err(err) => err.to_errno() as isize,
    }
}
/// `mmap` fop: forwards the vma to `Process::mmap`.
unsafe extern "C" fn rust_binder_mmap(
    file: *mut bindings::file,
    vma: *mut bindings::vm_area_struct,
) -> kernel::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the vma is valid.
    let area = unsafe { kernel::mm::virt::VmAreaNew::from_raw(vma) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::mmap(f, unsafe { File::from_raw_file(file) }, area) {
        Ok(()) => 0,
        Err(err) => err.to_errno(),
    }
}
/// `poll` fop: forwards to `Process::poll`; any error maps to `POLLERR`.
unsafe extern "C" fn rust_binder_poll(
    file: *mut bindings::file,
    wait: *mut bindings::poll_table_struct,
) -> bindings::__poll_t {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    let fileref = unsafe { File::from_raw_file(file) };
    // SAFETY: The caller ensures that the `PollTable` is valid.
    match Process::poll(f, fileref, unsafe { PollTable::from_ptr(wait) }) {
        Ok(v) => v,
        Err(_) => bindings::POLLERR,
    }
}
/// `flush` fop: forwards to `Process::flush`.
unsafe extern "C" fn rust_binder_flush(
    file: *mut bindings::file,
    _id: bindings::fl_owner_t,
) -> kernel::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    match Process::flush(f) {
        Ok(()) => 0,
        Err(err) => err.to_errno(),
    }
}
/// Exported seq_file show callback for the binderfs `stats` file. Errors are
/// reported into the file itself rather than failing the read.
#[no_mangle]
unsafe extern "C" fn rust_binder_stats_show(
    ptr: *mut seq_file,
    _: *mut kernel::ffi::c_void,
) -> kernel::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_stats_show_impl(m) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}
/// Exported seq_file show callback for the binderfs `state` file.
#[no_mangle]
unsafe extern "C" fn rust_binder_state_show(
    ptr: *mut seq_file,
    _: *mut kernel::ffi::c_void,
) -> kernel::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_state_show_impl(m) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}
#[no_mangle]
unsafe extern "C" fn rust_binder_proc_show(
    ptr: *mut seq_file,
    _: *mut kernel::ffi::c_void,
) -> kernel::ffi::c_int {
    // The target pid is smuggled through the seq_file private pointer.
    // SAFETY: Accessing the private field of `seq_file` is okay.
    let pid = (unsafe { (*ptr).private }) as usize as Pid;
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let seq = unsafe { SeqFile::from_raw(ptr) };
    // Failures are reported inline in the seq_file output; return 0 regardless.
    match rust_binder_proc_show_impl(seq, pid) {
        Ok(()) => {}
        Err(err) => seq_print!(seq, "failed to generate state: {:?}\n", err),
    }
    0
}
#[no_mangle]
unsafe extern "C" fn rust_binder_transactions_show(
    ptr: *mut seq_file,
    _: *mut kernel::ffi::c_void,
) -> kernel::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let seq = unsafe { SeqFile::from_raw(ptr) };
    // Failures are reported inline in the seq_file output; the seq_file
    // protocol still expects 0 from the show callback.
    match rust_binder_transactions_show_impl(seq) {
        Ok(()) => {}
        Err(err) => seq_print!(seq, "failed to generate state: {:?}\n", err),
    }
    0
}
/// Prints the transaction dump (`debug_print` with `print_all = false`) for
/// every process of every binder context to `m`.
fn rust_binder_transactions_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder transactions:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print(m, &ctx, false)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the global binder statistics followed by per-process statistics for
/// every process of every binder context.
fn rust_binder_stats_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder stats:\n");
    stats::GLOBAL_STATS.debug_print("", m);
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print_stats(m, &ctx)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the full state dump (`debug_print` with `print_all = true`) for
/// every process of every binder context to `m`.
fn rust_binder_state_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder state:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print(m, &ctx, true)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the full state dump for every process matching `pid`, across all
/// binder contexts.
fn rust_binder_proc_show_impl(m: &SeqFile, pid: Pid) -> Result<()> {
    seq_print!(m, "binder proc state:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_procs_with_pid(pid)? {
            proc.debug_print(m, &ctx, true)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Owned handle to a per-process log file created in binderfs.
///
/// Holds the dentry returned by `rust_binderfs_create_proc_file`; the file is
/// removed again in `Drop`.
struct BinderfsProcFile(NonNull<bindings::dentry>);

// SAFETY: It is safe to drop a `BinderfsProcFile` (and thus to call
// `rust_binderfs_remove_file`) from any thread.
unsafe impl Send for BinderfsProcFile {}
impl BinderfsProcFile {
    /// Creates the binderfs log file for the process with the given `pid`.
    ///
    /// Returns `Ok(Some(_))` when the file was created, `Ok(None)` when the
    /// file already exists (`EEXIST`) or when the C side returned a null
    /// dentry (it does so when the mount has no proc log directory), and
    /// `Err(_)` for any other failure.
    ///
    /// # Safety
    ///
    /// Takes an inode from a newly created binder file.
    unsafe fn new(nodp: *mut bindings::inode, pid: i32) -> Result<Option<Self>> {
        // SAFETY: The caller passes an `inode` for a newly created binder file.
        let dentry = unsafe { binderfs::rust_binderfs_create_proc_file(nodp, pid) };
        match kernel::error::from_err_ptr(dentry) {
            Ok(dentry) => Ok(NonNull::new(dentry).map(Self)),
            // An already-existing file is not treated as an error.
            Err(err) if err == EEXIST => Ok(None),
            Err(err) => Err(err),
        }
    }
}
impl Drop for BinderfsProcFile {
    fn drop(&mut self) {
        // SAFETY: This is a dentry obtained from `rust_binderfs_create_proc_file`
        // that has not been removed yet, so it is valid to pass to
        // `rust_binderfs_remove_file` exactly once here.
        unsafe { binderfs::rust_binderfs_remove_file(self.0.as_ptr()) };
    }
}

View File

@@ -0,0 +1,59 @@
// SPDX-License-Identifier: GPL-2.0-only
/* rust_binder_events.c
*
* Rust Binder tracepoints.
*
* Copyright 2024 Google LLC
*/
#include "rust_binder.h"
/*
 * Human-readable names for the BC_* commands, indexed by _IOC_NR() of the
 * command value (see the rust_binder_command tracepoint's TP_printk).
 */
const char * const binder_command_strings[] = {
	"BC_TRANSACTION",
	"BC_REPLY",
	"BC_ACQUIRE_RESULT",
	"BC_FREE_BUFFER",
	"BC_INCREFS",
	"BC_ACQUIRE",
	"BC_RELEASE",
	"BC_DECREFS",
	"BC_INCREFS_DONE",
	"BC_ACQUIRE_DONE",
	"BC_ATTEMPT_ACQUIRE",
	"BC_REGISTER_LOOPER",
	"BC_ENTER_LOOPER",
	"BC_EXIT_LOOPER",
	"BC_REQUEST_DEATH_NOTIFICATION",
	"BC_CLEAR_DEATH_NOTIFICATION",
	"BC_DEAD_BINDER_DONE",
	"BC_TRANSACTION_SG",
	"BC_REPLY_SG",
};
/*
 * Human-readable names for the BR_* return codes, indexed by _IOC_NR() of the
 * return value (see the rust_binder_return tracepoint's TP_printk).
 */
const char * const binder_return_strings[] = {
	"BR_ERROR",
	"BR_OK",
	"BR_TRANSACTION",
	"BR_REPLY",
	"BR_ACQUIRE_RESULT",
	"BR_DEAD_REPLY",
	"BR_TRANSACTION_COMPLETE",
	"BR_INCREFS",
	"BR_ACQUIRE",
	"BR_RELEASE",
	"BR_DECREFS",
	"BR_ATTEMPT_ACQUIRE",
	"BR_NOOP",
	"BR_SPAWN_LOOPER",
	"BR_FINISHED",
	"BR_DEAD_BINDER",
	"BR_CLEAR_DEATH_NOTIFICATION_DONE",
	"BR_FAILED_REPLY",
	"BR_FROZEN_REPLY",
	"BR_ONEWAY_SPAM_SUSPECT",
	"BR_TRANSACTION_PENDING_FROZEN"
};
#define CREATE_TRACE_POINTS
#define CREATE_RUST_TRACE_POINTS
#include "rust_binder_events.h"

View File

@@ -0,0 +1,387 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2024 Google, Inc.
*/
#undef TRACE_SYSTEM
#undef TRACE_INCLUDE_FILE
#undef TRACE_INCLUDE_PATH
#define TRACE_SYSTEM rust_binder
#define TRACE_INCLUDE_FILE rust_binder_events
#define TRACE_INCLUDE_PATH ../drivers/android/binder
#if !defined(_RUST_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _RUST_BINDER_TRACE_H
#include <linux/tracepoint.h>
/* Entry into the binder ioctl handler: raw command and argument. */
TRACE_EVENT(rust_binder_ioctl,
	TP_PROTO(unsigned int cmd, unsigned long arg),
	TP_ARGS(cmd, arg),
	TP_STRUCT__entry(
		__field(unsigned int, cmd)
		__field(unsigned long, arg)
	),
	TP_fast_assign(
		__entry->cmd = cmd;
		__entry->arg = arg;
	),
	TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg)
);

/* Shared shape for "operation finished" events: just the return value. */
DECLARE_EVENT_CLASS(rust_binder_function_return_class,
	TP_PROTO(int ret),
	TP_ARGS(ret),
	TP_STRUCT__entry(
		__field(int, ret)
	),
	TP_fast_assign(
		__entry->ret = ret;
	),
	TP_printk("ret=%d", __entry->ret)
);

/* Stamps out one event of the class above per traced operation. */
#define DEFINE_RBINDER_FUNCTION_RETURN_EVENT(name) \
DEFINE_EVENT(rust_binder_function_return_class, name, \
	TP_PROTO(int ret), \
	TP_ARGS(ret))

DEFINE_RBINDER_FUNCTION_RETURN_EVENT(rust_binder_ioctl_done);
DEFINE_RBINDER_FUNCTION_RETURN_EVENT(rust_binder_read_done);
DEFINE_RBINDER_FUNCTION_RETURN_EVENT(rust_binder_write_done);
/*
 * A binder-driven priority change of `thread`.
 * NOTE(review): the prio fields are stored as unsigned int but received and
 * printed as signed int — kept as-is; confirm against the C binder event.
 */
TRACE_EVENT(rust_binder_set_priority,
	TP_PROTO(struct task_struct *thread, int desired_prio, int new_prio),
	TP_ARGS(thread, desired_prio, new_prio),
	TP_STRUCT__entry(
		__field(int, proc)
		__field(int, thread)
		__field(unsigned int, old_prio)
		__field(unsigned int, new_prio)
		__field(unsigned int, desired_prio)
	),
	TP_fast_assign(
		__entry->proc = thread->tgid;
		__entry->thread = thread->pid;
		__entry->old_prio = thread->normal_prio;
		__entry->new_prio = new_prio;
		__entry->desired_prio = desired_prio;
	),
	TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
		  __entry->proc, __entry->thread, __entry->old_prio,
		  __entry->new_prio, __entry->desired_prio)
);

/* Snapshot of why a binder thread is (or is not) about to block for work. */
TRACE_EVENT(rust_binder_wait_for_work,
	TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
	TP_ARGS(proc_work, transaction_stack, thread_todo),
	TP_STRUCT__entry(
		__field(bool, proc_work)
		__field(bool, transaction_stack)
		__field(bool, thread_todo)
	),
	TP_fast_assign(
		__entry->proc_work = proc_work;
		__entry->transaction_stack = transaction_stack;
		__entry->thread_todo = thread_todo;
	),
	TP_printk("proc_work=%d transaction_stack=%d thread_todo=%d",
		  __entry->proc_work, __entry->transaction_stack,
		  __entry->thread_todo)
);
/*
 * A transaction (or reply) being sent; records source/destination pids and
 * the target node's debug id (0 when there is no target node, e.g. replies).
 */
TRACE_EVENT(rust_binder_transaction,
	TP_PROTO(bool reply, rust_binder_transaction t),
	TP_ARGS(reply, t),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(int, target_node)
		__field(int, from_proc)
		__field(int, to_proc)
		__field(int, reply)
		__field(unsigned int, code)
		__field(unsigned int, flags)
	),
	TP_fast_assign(
		rust_binder_thread from_thread = rust_binder_transaction_from_thread(t);
		rust_binder_process from = rust_binder_thread_proc(from_thread);
		rust_binder_process to = rust_binder_transaction_to_proc(t);
		rust_binder_node target_node = rust_binder_transaction_target_node(t);
		__entry->debug_id = rust_binder_transaction_debug_id(t);
		__entry->target_node = target_node ? rust_binder_node_debug_id(target_node) : 0;
		__entry->from_proc = rust_binder_process_task(from)->pid;
		__entry->to_proc = rust_binder_process_task(to)->pid;
		__entry->reply = reply;
		__entry->code = rust_binder_transaction_code(t);
		__entry->flags = rust_binder_transaction_flags(t);
	),
	TP_printk("transaction=%d target_node=%d dest_proc=%d from_proc=%d reply=%d flags=0x%x code=0x%x",
		  __entry->debug_id, __entry->target_node, __entry->to_proc,
		  __entry->from_proc, __entry->reply, __entry->flags,
		  __entry->code)
);

/* The destination thread chosen to service a transaction. */
TRACE_EVENT(rust_binder_transaction_thread_selected,
	TP_PROTO(rust_binder_transaction t, rust_binder_thread thread),
	TP_ARGS(t, thread),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(int, to_thread)
	),
	TP_fast_assign(
		__entry->debug_id = rust_binder_transaction_debug_id(t);
		__entry->to_thread = rust_binder_thread_id(thread);
	),
	TP_printk("transaction=%d thread=%d", __entry->debug_id, __entry->to_thread)
);

/* A transaction being received by the destination. */
TRACE_EVENT(rust_binder_transaction_received,
	TP_PROTO(rust_binder_transaction t),
	TP_ARGS(t),
	TP_STRUCT__entry(
		__field(int, debug_id)
	),
	TP_fast_assign(
		__entry->debug_id = rust_binder_transaction_debug_id(t);
	),
	TP_printk("transaction=%d", __entry->debug_id)
);
/*
 * Translation of a node object carried in a transaction: records how the
 * original flat_binder_object (from the sender) was translated for the
 * receiver (binder <-> handle, strong <-> weak), plus both handle values.
 */
TRACE_EVENT(rust_binder_transaction_node_send,
	TP_PROTO(int t_debug_id, rust_binder_node node,
		 const struct flat_binder_object *original,
		 const struct flat_binder_object *translated),
	TP_ARGS(t_debug_id, node, original, translated),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(int, node_debug_id)
		__field(binder_uintptr_t, node_ptr)
		__field(int, types)
		__field(int, original_handle)
		__field(int, translated_handle)
	),
	TP_fast_assign(
		int orig_is_handle = original->hdr.type == BINDER_TYPE_HANDLE || original->hdr.type == BINDER_TYPE_WEAK_HANDLE;
		int orig_is_strong = original->hdr.type == BINDER_TYPE_HANDLE || original->hdr.type == BINDER_TYPE_BINDER;
		int tran_is_handle = translated->hdr.type == BINDER_TYPE_HANDLE || translated->hdr.type == BINDER_TYPE_WEAK_HANDLE;
		int tran_is_strong = translated->hdr.type == BINDER_TYPE_HANDLE || translated->hdr.type == BINDER_TYPE_BINDER;
		__entry->debug_id = t_debug_id;
		__entry->node_debug_id = rust_binder_node_debug_id(node);
		/* NOTE(review): the "ptr" slot is filled with the node's debug
		 * id; if an accessor for the node's userspace pointer exists
		 * it should be used here instead — confirm.
		 */
		__entry->node_ptr = rust_binder_node_debug_id(node);
		/* Bit 0/1: original/translated is a handle; bit 2/3: strong. */
		__entry->types =
			(orig_is_handle << 0) |
			(tran_is_handle << 1) |
			(orig_is_strong << 2) |
			(tran_is_strong << 3);
		__entry->original_handle = orig_is_handle ? original->handle : 0;
		/* Fix: record the handle of the *translated* object, not a
		 * second copy of the original's handle.
		 */
		__entry->translated_handle = tran_is_handle ? translated->handle : 0;
	),
	TP_printk("transaction=%d node=%d ptr=0x%016llx: %s%s [%d] ==> %s%s [%d]",
		  __entry->debug_id, __entry->node_debug_id,
		  (u64)__entry->node_ptr,
		  (__entry->types & (1<<2)) ? "" : "weak ",
		  (__entry->types & (1<<0)) ? "handle" : "binder",
		  __entry->original_handle,
		  (__entry->types & (1<<3)) ? "" : "weak ",
		  (__entry->types & (1<<1)) ? "handle" : "binder",
		  __entry->translated_handle)
);
/* A file descriptor leaving the sender as part of a transaction. */
TRACE_EVENT(rust_binder_transaction_fd_send,
	TP_PROTO(int t_debug_id, int fd, size_t offset),
	TP_ARGS(t_debug_id, fd, offset),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(int, fd)
		__field(size_t, offset)
	),
	TP_fast_assign(
		__entry->debug_id = t_debug_id;
		__entry->fd = fd;
		__entry->offset = offset;
	),
	TP_printk("transaction=%d src_fd=%d offset=%zu",
		  __entry->debug_id, __entry->fd, __entry->offset)
);

/* The translated file descriptor installed in the receiver. */
TRACE_EVENT(rust_binder_transaction_fd_recv,
	TP_PROTO(int t_debug_id, int fd, size_t offset),
	TP_ARGS(t_debug_id, fd, offset),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(int, fd)
		__field(size_t, offset)
	),
	TP_fast_assign(
		__entry->debug_id = t_debug_id;
		__entry->fd = fd;
		__entry->offset = offset;
	),
	TP_printk("transaction=%d dest_fd=%d offset=%zu",
		  __entry->debug_id, __entry->fd, __entry->offset)
);

/* Buffer sizes requested when allocating space for a transaction. */
TRACE_EVENT(rust_binder_transaction_alloc_buf,
	TP_PROTO(int debug_id, const struct binder_transaction_data_sg *data),
	TP_ARGS(debug_id, data),
	TP_STRUCT__entry(
		__field(int, debug_id)
		__field(size_t, data_size)
		__field(size_t, offsets_size)
		__field(size_t, extra_buffers_size)
	),
	TP_fast_assign(
		__entry->debug_id = debug_id;
		__entry->data_size = data->transaction_data.data_size;
		__entry->offsets_size = data->transaction_data.offsets_size;
		__entry->extra_buffers_size = data->buffers_size;
	),
	TP_printk("transaction=%d data_size=%zd offsets_size=%zd extra_buffers_size=%zd",
		  __entry->debug_id, __entry->data_size, __entry->offsets_size,
		  __entry->extra_buffers_size)
);
/* Shared shape for buffer-release events: just the transaction debug id. */
DECLARE_EVENT_CLASS(rust_binder_buffer_release_class,
	TP_PROTO(int debug_id),
	TP_ARGS(debug_id),
	TP_STRUCT__entry(
		__field(int, debug_id)
	),
	TP_fast_assign(
		__entry->debug_id = debug_id;
	),
	TP_printk("transaction=%d", __entry->debug_id)
);

DEFINE_EVENT(rust_binder_buffer_release_class, rust_binder_transaction_buffer_release,
	TP_PROTO(int debug_id),
	TP_ARGS(debug_id));
DEFINE_EVENT(rust_binder_buffer_release_class, rust_binder_transaction_failed_buffer_release,
	TP_PROTO(int debug_id),
	TP_ARGS(debug_id));
DEFINE_EVENT(rust_binder_buffer_release_class, rust_binder_transaction_update_buffer_release,
	TP_PROTO(int debug_id),
	TP_ARGS(debug_id));

/* Mapping/unmapping of a range of pages in a process's binder buffer. */
TRACE_EVENT(rust_binder_update_page_range,
	TP_PROTO(int pid, bool allocate, size_t start, size_t end),
	TP_ARGS(pid, allocate, start, end),
	TP_STRUCT__entry(
		__field(int, proc)
		__field(bool, allocate)
		__field(size_t, offset)
		__field(size_t, size)
	),
	TP_fast_assign(
		__entry->proc = pid;
		__entry->allocate = allocate;
		__entry->offset = start;
		__entry->size = end - start;
	),
	TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
		  __entry->proc, __entry->allocate,
		  __entry->offset, __entry->size)
);
/*
 * Shared shape for per-page allocator events: owning pid and the page index
 * within the process's binder mapping. Start/end event pairs below bracket
 * each phase of page handling.
 */
DECLARE_EVENT_CLASS(rust_binder_lru_page_class,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index),
	TP_STRUCT__entry(
		__field(int, proc)
		__field(size_t, page_index)
	),
	TP_fast_assign(
		__entry->proc = pid;
		__entry->page_index = page_index;
	),
	TP_printk("proc=%d page_index=%zu",
		  __entry->proc, __entry->page_index)
);

DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_alloc_lru_start,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_alloc_lru_end,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_free_lru_start,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_free_lru_end,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_alloc_page_start,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_alloc_page_end,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_unmap_user_start,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_unmap_user_end,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_unmap_kernel_start,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
DEFINE_EVENT(rust_binder_lru_page_class, rust_binder_unmap_kernel_end,
	TP_PROTO(int pid, size_t page_index),
	TP_ARGS(pid, page_index));
/* A BC_* command read from userspace; named via binder_command_strings. */
TRACE_EVENT(rust_binder_command,
	TP_PROTO(uint32_t cmd),
	TP_ARGS(cmd),
	TP_STRUCT__entry(
		__field(uint32_t, cmd)
	),
	TP_fast_assign(
		__entry->cmd = cmd;
	),
	TP_printk("cmd=0x%x %s",
		  __entry->cmd,
		  _IOC_NR(__entry->cmd) < ARRAY_SIZE(binder_command_strings) ?
			  binder_command_strings[_IOC_NR(__entry->cmd)] :
			  "unknown")
);

/* A BR_* return code written to userspace; named via binder_return_strings. */
TRACE_EVENT(rust_binder_return,
	TP_PROTO(uint32_t ret),
	TP_ARGS(ret),
	TP_STRUCT__entry(
		__field(uint32_t, ret)
	),
	TP_fast_assign(
		__entry->ret = ret;
	),
	TP_printk("ret=0x%x %s",
		  __entry->ret,
		  _IOC_NR(__entry->ret) < ARRAY_SIZE(binder_return_strings) ?
			  binder_return_strings[_IOC_NR(__entry->ret)] :
			  "unknown")
);
#endif /* _RUST_BINDER_TRACE_H */
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0-only
/* rust_binder_events.c
*
* Rust Binder vendorhooks.
*
* Copyright 2024 Google LLC
*/
#include "rust_binder.h"
#define CREATE_TRACE_POINTS
#define CREATE_RUST_TRACE_POINTS
#include <trace/hooks/vendor_hooks.h>
#include <linux/tracepoint.h>
#include "rust_binder_hooks.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
*/
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rust_binder_set_priority);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rust_binder_restore_priority);

View File

@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2024 Google, Inc.
*/
#undef TRACE_SYSTEM
#undef TRACE_INCLUDE_FILE
#undef TRACE_INCLUDE_PATH
#define TRACE_SYSTEM rust_binder
#define TRACE_INCLUDE_FILE rust_binder_hooks
#define TRACE_INCLUDE_PATH ../drivers/android/binder
#if !defined(_RUST_BINDER_HOOK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _RUST_BINDER_HOOK_H
#include <trace/hooks/vendor_hooks.h>
/*
* Following tracepoints are not exported in tracefs and provide a
* mechanism for vendor modules to hook and extend functionality
*/
/* Vendor hook around a binder priority change for `task` on behalf of
 * transaction `t` — presumably fired from the set-priority path; confirm
 * against the Rust caller.
 */
DECLARE_HOOK(android_vh_rust_binder_set_priority,
	TP_PROTO(rust_binder_transaction t, struct task_struct *task),
	TP_ARGS(t, task));

/* Vendor hook around restoring `task`'s prior priority — confirm caller. */
DECLARE_HOOK(android_vh_rust_binder_restore_priority,
	TP_PROTO(struct task_struct *task),
	TP_ARGS(task));
#endif /* _RUST_BINDER_HOOK_H */
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@@ -0,0 +1,87 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* rust_binder_internal.h
*
* This file contains internal data structures used by Rust Binder. Mostly,
* these are type definitions used only by binderfs or things that Rust Binder
* define and export to binderfs.
*
* It does not include things exported by binderfs to Rust Binder since this
* file is not included as input to bindgen.
*
* Copyright (C) 2024 Google LLC.
*/
#ifndef _LINUX_RUST_BINDER_INTERNAL_H
#define _LINUX_RUST_BINDER_INTERNAL_H
#define RUST_BINDERFS_SUPER_MAGIC 0x6c6f6f71
#include <linux/seq_file.h>
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
/*
* The internal data types in the Rust Binder driver are opaque to C, so we use
* void pointer typedefs for these types.
*/
typedef void *rust_binder_context;
/**
* struct binder_device - information about a binder device node
* @minor: the minor number used by this device
* @ctx: the Rust Context used by this device, or null for binder-control
*
* This is used as the private data for files directly in binderfs, but not
* files in the binder_logs subdirectory. This struct owns a refcount on `ctx`
* and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is
* null.
*/
struct binder_device {
int minor;
rust_binder_context ctx;
};
/* seq_file "show" callbacks implemented on the Rust side (#[no_mangle]). */
int rust_binder_stats_show(struct seq_file *m, void *unused);
int rust_binder_state_show(struct seq_file *m, void *unused);
int rust_binder_transactions_show(struct seq_file *m, void *unused);
/* The target pid is carried through the seq_file private pointer. */
int rust_binder_proc_show(struct seq_file *m, void *pid);

/* File operations for binder device nodes, implemented in Rust. */
extern const struct file_operations rust_binder_fops;

/* Create/destroy a Rust binder context; the returned handle is opaque to C. */
rust_binder_context rust_binder_new_context(char *name);
void rust_binder_remove_context(rust_binder_context device);
/**
* binderfs_mount_opts - mount options for binderfs
* @max: maximum number of allocatable binderfs binder devices
* @stats_mode: enable binder stats in binderfs.
*/
struct binderfs_mount_opts {
int max;
int stats_mode;
};
/**
* binderfs_info - information about a binderfs mount
* @ipc_ns: The ipc namespace the binderfs mount belongs to.
* @control_dentry: This records the dentry of this binderfs mount
* binder-control device.
* @root_uid: uid that needs to be used when a new binder device is
* created.
* @root_gid: gid that needs to be used when a new binder device is
* created.
* @mount_opts: The mount options in use.
* @device_count: The current number of allocated binder devices.
* @proc_log_dir: Pointer to the directory dentry containing process-specific
* logs.
*/
struct binderfs_info {
struct ipc_namespace *ipc_ns;
struct dentry *control_dentry;
kuid_t root_uid;
kgid_t root_gid;
struct binderfs_mount_opts mount_opts;
int device_count;
struct dentry *proc_log_dir;
};
#endif /* _LINUX_RUST_BINDER_INTERNAL_H */

View File

@@ -0,0 +1,849 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler_types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/ipc_namespace.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/namei.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock_types.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/user_namespace.h>
#include <linux/xarray.h>
#include <uapi/asm-generic/errno-base.h>
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
#include "rust_binder_internal.h"
#define FIRST_INODE 1
#define SECOND_INODE 2
#define INODE_OFFSET 3
#define BINDERFS_MAX_MINOR (1U << MINORBITS)
/* Ensure that the initial ipc namespace always has devices available. */
#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4)
/* Generate <name>_fops wrappers for the Rust seq_file show callbacks. */
DEFINE_SHOW_ATTRIBUTE(rust_binder_stats);
DEFINE_SHOW_ATTRIBUTE(rust_binder_state);
DEFINE_SHOW_ATTRIBUTE(rust_binder_transactions);
DEFINE_SHOW_ATTRIBUTE(rust_binder_proc);

/* Initial binder device names, from CONFIG_ANDROID_BINDER_DEVICES. */
char *rust_binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
module_param_named(rust_devices, rust_binder_devices_param, charp, 0444);

/* Character device major reserved for binderfs device nodes. */
static dev_t binderfs_dev;
/* Protects binderfs_minors and the per-mount device counts. */
static DEFINE_MUTEX(binderfs_minors_mutex);
static DEFINE_IDA(binderfs_minors);
/* Mount parameter indices for fs_parse(). */
enum binderfs_param {
	Opt_max,
	Opt_stats_mode,
};

enum binderfs_stats_mode {
	binderfs_stats_mode_unset,
	binderfs_stats_mode_global,
};

/* Feature flags advertised to userspace via the binderfs "features" dir. */
struct binder_features {
	bool oneway_spam_detection;
	bool extended_error;
};

static const struct constant_table binderfs_param_stats[] = {
	{ "global", binderfs_stats_mode_global },
	{}
};

static const struct fs_parameter_spec binderfs_fs_parameters[] = {
	fsparam_u32("max", Opt_max),
	fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats),
	{}
};

/* Both features are statically enabled by this driver. */
static struct binder_features binder_features = {
	.oneway_spam_detection = true,
	.extended_error = true,
};
/* Returns the binderfs mount information stashed in the super block. */
static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb)
{
	return sb->s_fs_info;
}
/*
 * is_rust_binderfs_device - check whether an inode belongs to a Rust
 * binderfs mount, identified by the mount's super block magic.
 */
bool is_rust_binderfs_device(const struct inode *inode)
{
	/* Return the comparison directly instead of if-return-true/false. */
	return inode->i_sb->s_magic == RUST_BINDERFS_SUPER_MAGIC;
}
/**
* binderfs_binder_device_create - allocate inode from super block of a
* binderfs mount
* @ref_inode: inode from wich the super block will be taken
* @userp: buffer to copy information about new device for userspace to
* @req: struct binderfs_device as copied from userspace
*
* This function allocates a new binder_device and reserves a new minor
* number for it.
* Minor numbers are limited and tracked globally in binderfs_minors. The
* function will stash a struct binder_device for the specific binder
* device in i_private of the inode.
* It will go on to allocate a new inode from the super block of the
* filesystem mount, stash a struct binder_device in its i_private field
* and attach a dentry to that inode.
*
* Return: 0 on success, negative errno on failure
*/
static int binderfs_binder_device_create(struct inode *ref_inode,
					 struct binderfs_device __user *userp,
					 struct binderfs_device *req)
{
	int minor, ret;
	struct dentry *dentry, *root;
	struct binder_device *device = NULL;
	rust_binder_context ctx = NULL;
	struct inode *inode = NULL;
	struct super_block *sb = ref_inode->i_sb;
	struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
	bool use_reserve = (info->ipc_ns == &init_ipc_ns);
#else
	bool use_reserve = true;
#endif

	/* Reserve new minor number for the new device. */
	mutex_lock(&binderfs_minors_mutex);
	/* The device count is bumped optimistically and rolled back on error. */
	if (++info->device_count <= info->mount_opts.max)
		minor = ida_alloc_max(&binderfs_minors,
				      use_reserve ? BINDERFS_MAX_MINOR :
						    BINDERFS_MAX_MINOR_CAPPED,
				      GFP_KERNEL);
	else
		minor = -ENOSPC;
	if (minor < 0) {
		--info->device_count;
		mutex_unlock(&binderfs_minors_mutex);
		return minor;
	}
	mutex_unlock(&binderfs_minors_mutex);

	ret = -ENOMEM;
	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		goto err;

	/* The name came from userspace; it is not guaranteed terminated. */
	req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */
	ctx = rust_binder_new_context(req->name);
	if (!ctx)
		goto err;

	inode = new_inode(sb);
	if (!inode)
		goto err;

	inode->i_ino = minor + INODE_OFFSET;
	simple_inode_init_ts(inode);
	init_special_inode(inode, S_IFCHR | 0600,
			   MKDEV(MAJOR(binderfs_dev), minor));
	inode->i_fop = &rust_binder_fops;
	inode->i_uid = info->root_uid;
	inode->i_gid = info->root_gid;

	req->major = MAJOR(binderfs_dev);
	req->minor = minor;
	device->ctx = ctx;
	device->minor = minor;

	/* Report major/minor back to userspace before exposing the node. */
	if (userp && copy_to_user(userp, req, sizeof(*req))) {
		ret = -EFAULT;
		goto err;
	}

	root = sb->s_root;
	inode_lock(d_inode(root));

	/* look it up */
	dentry = lookup_one_len(req->name, root, strlen(req->name));
	if (IS_ERR(dentry)) {
		inode_unlock(d_inode(root));
		ret = PTR_ERR(dentry);
		goto err;
	}

	if (d_really_is_positive(dentry)) {
		/* already exists */
		dput(dentry);
		inode_unlock(d_inode(root));
		ret = -EEXIST;
		goto err;
	}

	/* From here on the inode owns `device`; it is freed in evict_inode. */
	inode->i_private = device;
	d_instantiate(dentry, inode);
	fsnotify_create(root->d_inode, dentry);
	inode_unlock(d_inode(root));

	return 0;

err:
	/* All of these helpers tolerate NULL/partial state. */
	kfree(device);
	rust_binder_remove_context(ctx);
	mutex_lock(&binderfs_minors_mutex);
	--info->device_count;
	ida_free(&binderfs_minors, minor);
	mutex_unlock(&binderfs_minors_mutex);
	iput(inode);

	return ret;
}
/**
* binder_ctl_ioctl - handle binder device node allocation requests
*
* The request handler for the binder-control device. All requests operate on
* the binderfs mount the binder-control device resides in:
* - BINDER_CTL_ADD
* Allocate a new binder device.
*
* Return: %0 on success, negative errno on failure.
*/
static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
			     unsigned long arg)
{
	int ret = -EINVAL;
	struct inode *inode = file_inode(file);
	struct binderfs_device __user *device = (struct binderfs_device __user *)arg;
	struct binderfs_device device_req;

	switch (cmd) {
	case BINDER_CTL_ADD:
		/* Copy the request in; the result (major/minor) is copied
		 * back to userspace inside binderfs_binder_device_create().
		 */
		ret = copy_from_user(&device_req, device, sizeof(device_req));
		if (ret) {
			ret = -EFAULT;
			break;
		}

		ret = binderfs_binder_device_create(inode, device, &device_req);
		break;
	default:
		break;
	}

	return ret;
}
static void binderfs_evict_inode(struct inode *inode)
{
	struct binder_device *device = inode->i_private;
	struct binderfs_info *info = BINDERFS_SB(inode->i_sb);

	clear_inode(inode);

	/* Only binder device nodes carry a binder_device in i_private. */
	if (!S_ISCHR(inode->i_mode) || !device)
		return;

	/* Return the minor number and drop the per-device state. */
	mutex_lock(&binderfs_minors_mutex);
	--info->device_count;
	ida_free(&binderfs_minors, device->minor);
	mutex_unlock(&binderfs_minors_mutex);

	/* ctx is null for binder-control, but this function ignores null pointers */
	rust_binder_remove_context(device->ctx);
	kfree(device);
}
/* Parse one mount option ("max=<n>" or "stats=global") into fc->fs_private. */
static int binderfs_fs_context_parse_param(struct fs_context *fc,
					   struct fs_parameter *param)
{
	int opt;
	struct binderfs_mount_opts *ctx = fc->fs_private;
	struct fs_parse_result result;

	opt = fs_parse(fc, binderfs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_max:
		if (result.uint_32 > BINDERFS_MAX_MINOR)
			return invalfc(fc, "Bad value for '%s'", param->key);
		ctx->max = result.uint_32;
		break;
	case Opt_stats_mode:
		/* Enabling stats exposes global state; keep it privileged. */
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		ctx->stats_mode = result.uint_32;
		break;
	default:
		return invalfc(fc, "Unsupported parameter '%s'", param->key);
	}

	return 0;
}
/* Apply remount options; the stats mode is fixed at mount time. */
static int binderfs_fs_context_reconfigure(struct fs_context *fc)
{
	struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb);
	struct binderfs_mount_opts *new_opts = fc->fs_private;

	if (new_opts->stats_mode != info->mount_opts.stats_mode)
		return invalfc(fc, "Binderfs stats mode cannot be changed during a remount");

	info->mount_opts.stats_mode = new_opts->stats_mode;
	info->mount_opts.max = new_opts->max;
	return 0;
}
static int binderfs_show_options(struct seq_file *seq, struct dentry *root)
{
struct binderfs_info *info = BINDERFS_SB(root->d_sb);
if (info->mount_opts.max <= BINDERFS_MAX_MINOR)
seq_printf(seq, ",max=%d", info->mount_opts.max);
switch (info->mount_opts.stats_mode) {
case binderfs_stats_mode_unset:
break;
case binderfs_stats_mode_global:
seq_printf(seq, ",stats=global");
break;
}
return 0;
}
/* Super block callbacks; evict_inode releases per-device state. */
static const struct super_operations binderfs_super_ops = {
	.evict_inode    = binderfs_evict_inode,
	.show_options	= binderfs_show_options,
	.statfs         = simple_statfs,
};
static inline bool is_binderfs_control_device(const struct dentry *dentry)
{
struct binderfs_info *info = dentry->d_sb->s_fs_info;
return info->control_dentry == dentry;
}
/* Like simple_rename(), but binder-control may never be renamed. */
static int binderfs_rename(struct mnt_idmap *idmap,
			   struct inode *old_dir, struct dentry *old_dentry,
			   struct inode *new_dir, struct dentry *new_dentry,
			   unsigned int flags)
{
	if (is_binderfs_control_device(old_dentry))
		return -EPERM;
	if (is_binderfs_control_device(new_dentry))
		return -EPERM;

	return simple_rename(idmap, old_dir, old_dentry, new_dir,
			     new_dentry, flags);
}
/* Like simple_unlink(), but binder-control may never be removed. */
static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
{
	if (is_binderfs_control_device(dentry))
		return -EPERM;

	return simple_unlink(dir, dentry);
}
/* File operations for the binder-control device node. */
static const struct file_operations binder_ctl_fops = {
	.owner = THIS_MODULE,
	.open = nonseekable_open,
	.unlocked_ioctl = binder_ctl_ioctl,
	.compat_ioctl = binder_ctl_ioctl,
	.llseek = noop_llseek,
};
/**
* binderfs_binder_ctl_create - create a new binder-control device
* @sb: super block of the binderfs mount
*
* This function creates a new binder-control device node in the binderfs mount
* referred to by @sb.
*
* Return: 0 on success, negative errno on failure
*/
static int binderfs_binder_ctl_create(struct super_block *sb)
{
int minor, ret;
struct dentry *dentry;
struct binder_device *device;
struct inode *inode = NULL;
struct dentry *root = sb->s_root;
struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
bool use_reserve = (info->ipc_ns == &init_ipc_ns);
#else
bool use_reserve = true;
#endif
device = kzalloc(sizeof(*device), GFP_KERNEL);
if (!device)
return -ENOMEM;
/* If we have already created a binder-control node, return. */
if (info->control_dentry) {
ret = 0;
goto out;
}
ret = -ENOMEM;
inode = new_inode(sb);
if (!inode)
goto out;
/* Reserve a new minor number for the new device. */
mutex_lock(&binderfs_minors_mutex);
minor = ida_alloc_max(&binderfs_minors,
use_reserve ? BINDERFS_MAX_MINOR :
BINDERFS_MAX_MINOR_CAPPED,
GFP_KERNEL);
mutex_unlock(&binderfs_minors_mutex);
if (minor < 0) {
ret = minor;
goto out;
}
inode->i_ino = SECOND_INODE;
simple_inode_init_ts(inode);
init_special_inode(inode, S_IFCHR | 0600,
MKDEV(MAJOR(binderfs_dev), minor));
inode->i_fop = &binder_ctl_fops;
inode->i_uid = info->root_uid;
inode->i_gid = info->root_gid;
device->minor = minor;
device->ctx = NULL;
dentry = d_alloc_name(root, "binder-control");
if (!dentry)
goto out;
inode->i_private = device;
info->control_dentry = dentry;
d_add(dentry, inode);
return 0;
out:
kfree(device);
iput(inode);
return ret;
}
/* Root directory i_ops: standard lookup, but rename/unlink protect
 * the binder-control node.
 */
static const struct inode_operations binderfs_dir_inode_operations = {
	.lookup = simple_lookup,
	.rename = binderfs_rename,
	.unlink = binderfs_unlink,
};
static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
{
struct inode *ret;
ret = new_inode(sb);
if (ret) {
ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
ret->i_mode = mode;
simple_inode_init_ts(ret);
}
return ret;
}
/* Look up (and implicitly allocate) a negative dentry for `name` under
 * `parent`; an existing entry yields ERR_PTR(-EEXIST).
 */
static struct dentry *binderfs_create_dentry(struct dentry *parent,
					     const char *name)
{
	struct dentry *dentry = lookup_one_len(name, parent, strlen(name));

	if (IS_ERR(dentry))
		return dentry;

	/* Refuse to create anything over an existing file or directory. */
	if (d_really_is_positive(dentry)) {
		dput(dentry);
		dentry = ERR_PTR(-EEXIST);
	}

	return dentry;
}
/*
 * Remove a file previously created with rust_binderfs_create_file().
 * Called from Rust when a BinderfsProcFile is dropped. A dentry that has
 * already been unlinked is silently skipped.
 */
void rust_binderfs_remove_file(struct dentry *dentry)
{
	struct inode *parent_inode;

	parent_inode = d_inode(dentry->d_parent);
	inode_lock(parent_inode);
	if (simple_positive(dentry)) {
		/* Hold an extra reference across unlink + d_delete. */
		dget(dentry);
		simple_unlink(parent_inode, dentry);
		d_delete(dentry);
		dput(dentry);
	}
	inode_unlock(parent_inode);
}
/*
 * Create a read-only regular file `name` under `parent`, with `fops` as its
 * file operations and `data` stashed in i_private. Returns the new dentry or
 * an ERR_PTR (-EEXIST if the name is taken, -ENOMEM on inode allocation
 * failure).
 */
struct dentry *rust_binderfs_create_file(struct dentry *parent, const char *name,
					 const struct file_operations *fops,
					 void *data)
{
	struct dentry *dentry;
	struct inode *new_inode, *parent_inode;
	struct super_block *sb;

	parent_inode = d_inode(parent);
	inode_lock(parent_inode);

	dentry = binderfs_create_dentry(parent, name);
	if (IS_ERR(dentry))
		goto out;

	sb = parent_inode->i_sb;
	new_inode = binderfs_make_inode(sb, S_IFREG | 0444);
	if (!new_inode) {
		dput(dentry);
		dentry = ERR_PTR(-ENOMEM);
		goto out;
	}

	new_inode->i_fop = fops;
	new_inode->i_private = data;
	d_instantiate(dentry, new_inode);
	fsnotify_create(parent_inode, dentry);

out:
	inode_unlock(parent_inode);
	return dentry;
}
/*
 * Create the per-process log file binder_logs/proc/<pid>. Returns NULL (not
 * an error) when the mount has no proc log directory — presumably when stats
 * were not enabled for this mount; the directory is created elsewhere.
 */
struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid)
{
	struct binderfs_info *info = nodp->i_sb->s_fs_info;
	struct dentry *dir = info->proc_log_dir;
	char strbuf[20 + 1]; /* wide enough for any 64-bit decimal value */
	/* The pid is handed to the show callback via i_private. */
	void *data = (void *)(unsigned long) pid;

	if (!dir)
		return NULL;

	/* NOTE(review): pid is int but formatted with %u — harmless as long
	 * as pids are non-negative, which they are in practice.
	 */
	snprintf(strbuf, sizeof(strbuf), "%u", pid);

	return rust_binderfs_create_file(dir, strbuf, &rust_binder_proc_fops, data);
}
/*
 * Create a directory @name under @parent. Returns the new dentry or an
 * ERR_PTR().
 */
static struct dentry *binderfs_create_dir(struct dentry *parent,
					  const char *name)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	struct inode *inode;

	inode_lock(dir);

	dentry = binderfs_create_dentry(parent, name);
	if (IS_ERR(dentry))
		goto unlock;

	inode = binderfs_make_inode(dir->i_sb, S_IFDIR | 0755);
	if (!inode) {
		dput(dentry);
		dentry = ERR_PTR(-ENOMEM);
		goto unlock;
	}

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &simple_dir_inode_operations;
	set_nlink(inode, 2);	/* "." plus the entry in the parent */

	d_instantiate(dentry, inode);
	inc_nlink(dir);		/* the new subdirectory's ".." */
	fsnotify_mkdir(dir, dentry);

unlock:
	inode_unlock(dir);
	return dentry;
}
/* seq_file show handler: print a single feature flag as "0" or "1". */
static int binder_features_show(struct seq_file *m, void *unused)
{
	const bool *feature = m->private;

	seq_printf(m, "%d\n", *feature ? 1 : 0);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(binder_features);
static int init_binder_features(struct super_block *sb)
{
struct dentry *dentry, *dir;
dir = binderfs_create_dir(sb->s_root, "features");
if (IS_ERR(dir))
return PTR_ERR(dir);
dentry = rust_binderfs_create_file(dir, "oneway_spam_detection",
&binder_features_fops,
&binder_features.oneway_spam_detection);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
dentry = rust_binderfs_create_file(dir, "extended_error",
&binder_features_fops,
&binder_features.extended_error);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
return 0;
}
static int init_binder_logs(struct super_block *sb)
{
struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir;
struct binderfs_info *info;
int ret = 0;
binder_logs_root_dir = binderfs_create_dir(sb->s_root,
"binder_logs");
if (IS_ERR(binder_logs_root_dir)) {
ret = PTR_ERR(binder_logs_root_dir);
goto out;
}
dentry = rust_binderfs_create_file(binder_logs_root_dir, "stats",
&rust_binder_stats_fops, NULL);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out;
}
dentry = rust_binderfs_create_file(binder_logs_root_dir, "state",
&rust_binder_state_fops, NULL);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out;
}
dentry = rust_binderfs_create_file(binder_logs_root_dir, "transactions",
&rust_binder_transactions_fops, NULL);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out;
}
proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc");
if (IS_ERR(proc_log_dir)) {
ret = PTR_ERR(proc_log_dir);
goto out;
}
info = sb->s_fs_info;
info->proc_log_dir = proc_log_dir;
out:
return ret;
}
/*
 * Fill in a freshly mounted binderfs super block: set up the root directory,
 * create the binder-control device, one device node per name in the module
 * parameter, the feature files, and (optionally) the binder_logs tree.
 *
 * NOTE(review): on error paths the allocations hung off @sb (s_fs_info, the
 * root inode) are released by binderfs_kill_super()/VFS teardown, so plain
 * returns here are intentional.
 */
static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	int ret;
	struct binderfs_info *info;
	struct binderfs_mount_opts *ctx = fc->fs_private;
	struct inode *inode = NULL;
	struct binderfs_device device_info = {};
	const char *name;
	size_t len;

	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;

	/*
	 * The binderfs filesystem can be mounted by userns root in a
	 * non-initial userns. By default such mounts have the SB_I_NODEV flag
	 * set in s_iflags to prevent security issues where userns root can
	 * just create random device nodes via mknod() since it owns the
	 * filesystem mount. But binderfs does not allow to create any files
	 * including devices nodes. The only way to create binder devices nodes
	 * is through the binder-control device which userns root is explicitly
	 * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both
	 * necessary and safe.
	 */
	sb->s_iflags &= ~SB_I_NODEV;
	sb->s_iflags |= SB_I_NOEXEC;
	sb->s_magic = RUST_BINDERFS_SUPER_MAGIC;
	sb->s_op = &binderfs_super_ops;
	sb->s_time_gran = 1;

	sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL);
	if (!sb->s_fs_info)
		return -ENOMEM;
	info = sb->s_fs_info;

	info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);

	/* Map root ids into the mounting user namespace where possible. */
	info->root_gid = make_kgid(sb->s_user_ns, 0);
	if (!gid_valid(info->root_gid))
		info->root_gid = GLOBAL_ROOT_GID;
	info->root_uid = make_kuid(sb->s_user_ns, 0);
	if (!uid_valid(info->root_uid))
		info->root_uid = GLOBAL_ROOT_UID;
	info->mount_opts.max = ctx->max;
	info->mount_opts.stats_mode = ctx->stats_mode;

	/* Build the root directory inode by hand (it is not a device node). */
	inode = new_inode(sb);
	if (!inode)
		return -ENOMEM;

	inode->i_ino = FIRST_INODE;
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | 0755;
	simple_inode_init_ts(inode);
	inode->i_op = &binderfs_dir_inode_operations;
	set_nlink(inode, 2);

	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;

	ret = binderfs_binder_ctl_create(sb);
	if (ret)
		return ret;

	/*
	 * Create one device node per name in the comma-separated module
	 * parameter; names were length-validated in init_rust_binderfs().
	 */
	name = rust_binder_devices_param;
	for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
		strscpy(device_info.name, name, len + 1);
		ret = binderfs_binder_device_create(inode, NULL, &device_info);
		if (ret)
			return ret;
		name += len;
		if (*name == ',')
			name++;
	}

	ret = init_binder_features(sb);
	if (ret)
		return ret;

	if (info->mount_opts.stats_mode == binderfs_stats_mode_global)
		return init_binder_logs(sb);

	return 0;
}
/* Each binderfs mount gets its own independent (nodev) super block. */
static int binderfs_fs_context_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, binderfs_fill_super);
}
/* Release the mount options allocated in binderfs_init_fs_context(). */
static void binderfs_fs_context_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}
/* fs_context hooks for parsing options, mounting and remounting binderfs. */
static const struct fs_context_operations binderfs_fs_context_ops = {
	.free = binderfs_fs_context_free,
	.get_tree = binderfs_fs_context_get_tree,
	.parse_param = binderfs_fs_context_parse_param,
	.reconfigure = binderfs_fs_context_reconfigure,
};
/*
 * Allocate per-mount option state with its defaults and attach it, together
 * with the binderfs context operations, to the new fs_context.
 */
static int binderfs_init_fs_context(struct fs_context *fc)
{
	struct binderfs_mount_opts *opts;

	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
	if (!opts)
		return -ENOMEM;

	opts->max = BINDERFS_MAX_MINOR;
	opts->stats_mode = binderfs_stats_mode_unset;

	fc->fs_private = opts;
	fc->ops = &binderfs_fs_context_ops;
	return 0;
}
/* Tear down a binderfs super block and its private state. */
static void binderfs_kill_super(struct super_block *sb)
{
	struct binderfs_info *info = sb->s_fs_info;

	/*
	 * During inode eviction struct binderfs_info is needed.
	 * So first wipe the super_block then free struct binderfs_info.
	 */
	kill_litter_super(sb);

	if (info && info->ipc_ns)
		put_ipc_ns(info->ipc_ns);
	kfree(info);
}
/* The "binder" filesystem type; mountable from user namespaces. */
static struct file_system_type binder_fs_type = {
	.name = "binder",
	.init_fs_context = binderfs_init_fs_context,
	.parameters = binderfs_fs_parameters,
	.kill_sb = binderfs_kill_super,
	.fs_flags = FS_USERNS_MOUNT,
};
/*
 * Module init for rust binderfs: validate the default device names, reserve a
 * char-device region, then register the "binder" filesystem type.
 */
int init_rust_binderfs(void)
{
	const char *name = rust_binder_devices_param;
	size_t len;
	int ret;

	/* Reject any default binderfs device name that exceeds the limit. */
	while ((len = strcspn(name, ",")) > 0) {
		if (len > BINDERFS_MAX_NAME)
			return -E2BIG;
		name += len;
		if (*name == ',')
			name++;
	}

	/* Allocate a fresh major number for binderfs device nodes. */
	ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
				  "rust_binder");
	if (ret)
		return ret;

	ret = register_filesystem(&binder_fs_type);
	if (ret)
		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);

	return ret;
}

View File

@@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Keep track of statistics for binder_logs.
use crate::defs::*;
use core::sync::atomic::{AtomicU32, Ordering::Relaxed};
use kernel::{ioctl::_IOC_NR, seq_file::SeqFile, seq_print};
/// Number of distinct BC_* command words (ioctl nr of the highest command + 1).
const BC_COUNT: usize = _IOC_NR(BC_REPLY_SG) as usize + 1;
/// Number of distinct BR_* return words (ioctl nr of the highest return + 1).
const BR_COUNT: usize = _IOC_NR(BR_TRANSACTION_PENDING_FROZEN) as usize + 1;

/// Driver-wide counters, printed via `debug_print` for binder_logs.
pub(crate) static GLOBAL_STATS: BinderStats = BinderStats::new();

/// Occurrence counters for each BC_* command and BR_* return word,
/// indexed by the word's ioctl nr.
pub(crate) struct BinderStats {
    bc: [AtomicU32; BC_COUNT],
    br: [AtomicU32; BR_COUNT],
}
impl BinderStats {
    /// Creates a statistics table with all counters at zero.
    pub(crate) const fn new() -> Self {
        const ZERO: AtomicU32 = AtomicU32::new(0);
        Self {
            bc: [ZERO; BC_COUNT],
            br: [ZERO; BR_COUNT],
        }
    }

    /// Records one occurrence of the BC_* command word `bc`.
    /// Out-of-range words are silently ignored.
    pub(crate) fn inc_bc(&self, bc: u32) {
        if let Some(counter) = self.bc.get(_IOC_NR(bc) as usize) {
            counter.fetch_add(1, Relaxed);
        }
    }

    /// Records one occurrence of the BR_* return word `br`.
    /// Out-of-range words are silently ignored.
    pub(crate) fn inc_br(&self, br: u32) {
        if let Some(counter) = self.br.get(_IOC_NR(br) as usize) {
            counter.fetch_add(1, Relaxed);
        }
    }

    /// Prints every non-zero counter to `m`, one `"{prefix}{name}: {count}"`
    /// line per command/return word.
    pub(crate) fn debug_print(&self, prefix: &str, m: &SeqFile) {
        for (nr, counter) in self.bc.iter().enumerate() {
            let count = counter.load(Relaxed);
            if count != 0 {
                seq_print!(m, "{}{}: {}\n", prefix, command_string(nr), count);
            }
        }
        for (nr, counter) in self.br.iter().enumerate() {
            let count = counter.load(Relaxed);
            if count != 0 {
                seq_print!(m, "{}{}: {}\n", prefix, return_string(nr), count);
            }
        }
    }
}
/// Helpers that borrow the human-readable command/return names from the
/// string tables defined in the C binder driver.
mod strings {
    use core::str::from_utf8_unchecked;
    use kernel::str::CStr;

    extern "C" {
        static binder_command_strings: [*const u8; super::BC_COUNT];
        static binder_return_strings: [*const u8; super::BR_COUNT];
    }

    /// Returns the C driver's name for the BC_* command with ioctl nr `i`.
    ///
    /// NOTE(review): indexing panics if `i >= BC_COUNT`; callers derive `i`
    /// by iterating the fixed-size counter arrays, so it stays in range.
    pub(super) fn command_string(i: usize) -> &'static str {
        // SAFETY: Accessing `binder_command_strings` is always safe.
        let c_str_ptr = unsafe { binder_command_strings[i] };
        // SAFETY: The `binder_command_strings` array only contains nul-terminated strings.
        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
        // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars.
        unsafe { from_utf8_unchecked(bytes) }
    }

    /// Returns the C driver's name for the BR_* return with ioctl nr `i`.
    pub(super) fn return_string(i: usize) -> &'static str {
        // SAFETY: Accessing `binder_return_strings` is always safe.
        let c_str_ptr = unsafe { binder_return_strings[i] };
        // SAFETY: The `binder_return_strings` array only contains nul-terminated strings.
        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
        // SAFETY: The `binder_return_strings` array only contains strings with ascii-chars.
        unsafe { from_utf8_unchecked(bytes) }
    }
}
use strings::{command_string, return_string};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,236 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use crate::{defs::BinderTransactionDataSg, node::Node, thread::Thread, transaction::Transaction};
use kernel::bindings::{
binder_transaction_data_sg, flat_binder_object, rust_binder_node, rust_binder_thread,
rust_binder_transaction, task_struct,
};
use kernel::error::Result;
use kernel::ffi::{c_int, c_uint, c_ulong};
use kernel::task::{Pid, Task};
use kernel::tracepoint::declare_trace;
use kernel::uapi;
// Declarations of the C tracepoints and Android vendor hooks that the Rust
// binder driver fires. These mirror the events emitted by the C driver so
// existing tooling keeps working. Each is `unsafe`: callers must uphold the
// pointer-validity requirements stated on the safe wrappers in this file.
declare_trace! {
    unsafe fn rust_binder_ioctl(cmd: c_uint, arg: c_ulong);
    unsafe fn rust_binder_ioctl_done(ret: c_int);
    unsafe fn rust_binder_read_done(ret: c_int);
    unsafe fn rust_binder_write_done(ret: c_int);
    unsafe fn rust_binder_set_priority(thread: *mut task_struct, desired_prio: c_int, new_prio: c_int);
    unsafe fn android_vh_rust_binder_set_priority(t: rust_binder_transaction, task: *mut task_struct);
    unsafe fn android_vh_rust_binder_restore_priority(task: *mut task_struct);
    unsafe fn rust_binder_wait_for_work(proc_work: bool, transaction_stack: bool, thread_todo: bool);
    unsafe fn rust_binder_transaction(reply: bool, t: rust_binder_transaction);
    unsafe fn rust_binder_transaction_received(t: rust_binder_transaction);
    unsafe fn rust_binder_transaction_thread_selected(t: rust_binder_transaction, thread: rust_binder_thread);
    unsafe fn rust_binder_transaction_node_send(t_debug_id: c_int, n: rust_binder_node,
                                                orig: *const flat_binder_object,
                                                trans: *const flat_binder_object);
    unsafe fn rust_binder_transaction_fd_send(t_debug_id: c_int, fd: c_int, offset: usize);
    unsafe fn rust_binder_transaction_fd_recv(t_debug_id: c_int, fd: c_int, offset: usize);
    unsafe fn rust_binder_transaction_alloc_buf(debug_id: c_int, data: *const binder_transaction_data_sg);
    unsafe fn rust_binder_transaction_buffer_release(debug_id: c_int);
    unsafe fn rust_binder_transaction_failed_buffer_release(debug_id: c_int);
    unsafe fn rust_binder_transaction_update_buffer_release(debug_id: c_int);
    unsafe fn rust_binder_update_page_range(pid: c_int, allocate: bool, start: usize, end: usize);
    unsafe fn rust_binder_alloc_lru_start(pid: c_int, page_index: usize);
    unsafe fn rust_binder_alloc_lru_end(pid: c_int, page_index: usize);
    unsafe fn rust_binder_free_lru_start(pid: c_int, page_index: usize);
    unsafe fn rust_binder_free_lru_end(pid: c_int, page_index: usize);
    unsafe fn rust_binder_alloc_page_start(pid: c_int, page_index: usize);
    unsafe fn rust_binder_alloc_page_end(pid: c_int, page_index: usize);
    unsafe fn rust_binder_unmap_user_start(pid: c_int, page_index: usize);
    unsafe fn rust_binder_unmap_user_end(pid: c_int, page_index: usize);
    unsafe fn rust_binder_unmap_kernel_start(pid: c_int, page_index: usize);
    unsafe fn rust_binder_unmap_kernel_end(pid: c_int, page_index: usize);
    unsafe fn rust_binder_command(cmd: u32);
    unsafe fn rust_binder_return(ret: u32);
}
/// Converts a transaction reference to the opaque pointer type the C
/// tracepoints expect. The pointer is only valid for the borrow's duration.
#[inline]
fn raw_transaction(t: &Transaction) -> rust_binder_transaction {
    t as *const Transaction as rust_binder_transaction
}

/// Converts a thread reference to the opaque pointer type used by tracepoints.
#[inline]
fn raw_thread(t: &Thread) -> rust_binder_thread {
    t as *const Thread as rust_binder_thread
}

/// Converts a node reference to the opaque pointer type used by tracepoints.
#[inline]
fn raw_node(n: &Node) -> rust_binder_node {
    n as *const Node as rust_binder_node
}

/// Maps `Ok(())` to 0 and `Err` to its (negative) errno value.
#[inline]
fn to_errno(ret: Result) -> i32 {
    match ret {
        Ok(()) => 0,
        Err(err) => err.to_errno(),
    }
}

// Safe wrappers around the raw tracepoints declared above. Each wrapper's
// safety argument is given inline.

#[inline]
pub(crate) fn trace_ioctl(cmd: u32, arg: usize) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_ioctl(cmd, arg as c_ulong) }
}

#[inline]
pub(crate) fn trace_ioctl_done(ret: Result) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_ioctl_done(to_errno(ret)) }
}

#[inline]
pub(crate) fn trace_read_done(ret: Result) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_read_done(to_errno(ret)) }
}

#[inline]
pub(crate) fn trace_write_done(ret: Result) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_write_done(to_errno(ret)) }
}

#[inline]
pub(crate) fn trace_set_priority(thread: &Task, desired_prio: c_int, new_prio: c_int) {
    // SAFETY: The pointer to the task is valid for the duration of this call.
    unsafe { rust_binder_set_priority(thread.as_ptr(), desired_prio, new_prio) }
}

#[inline]
pub(crate) fn vh_set_priority(t: &Transaction, task: &Task) {
    // SAFETY: The pointers to `t` and `task` are valid.
    unsafe { android_vh_rust_binder_set_priority(raw_transaction(t), task.as_ptr()) }
}

#[inline]
pub(crate) fn vh_restore_priority(task: &Task) {
    // SAFETY: The pointer to `task` is valid.
    unsafe { android_vh_rust_binder_restore_priority(task.as_ptr()) }
}

#[inline]
pub(crate) fn trace_wait_for_work(proc_work: bool, transaction_stack: bool, thread_todo: bool) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_wait_for_work(proc_work, transaction_stack, thread_todo) }
}

#[inline]
pub(crate) fn trace_transaction(reply: bool, t: &Transaction) {
    // SAFETY: The raw transaction is valid for the duration of this call.
    unsafe { rust_binder_transaction(reply, raw_transaction(t)) }
}

#[inline]
pub(crate) fn trace_transaction_received(t: &Transaction) {
    // SAFETY: The raw transaction is valid for the duration of this call.
    unsafe { rust_binder_transaction_received(raw_transaction(t)) }
}

#[inline]
pub(crate) fn trace_transaction_thread_selected(t: &Transaction, th: &Thread) {
    // SAFETY: The raw transaction is valid for the duration of this call.
    unsafe { rust_binder_transaction_thread_selected(raw_transaction(t), raw_thread(th)) }
}

#[inline]
pub(crate) fn trace_transaction_node_send(
    t_debug_id: usize,
    n: &Node,
    orig: &uapi::flat_binder_object,
    trans: &uapi::flat_binder_object,
) {
    // CAST: Types are identical.
    let orig = orig as *const uapi::flat_binder_object as *const flat_binder_object;
    // CAST: Types are identical.
    let trans = trans as *const uapi::flat_binder_object as *const flat_binder_object;
    // SAFETY: The pointers are valid for the duration of this call.
    unsafe { rust_binder_transaction_node_send(t_debug_id as c_int, raw_node(n), orig, trans) }
}

#[inline]
pub(crate) fn trace_transaction_fd_send(t_debug_id: usize, fd: u32, offset: usize) {
    // SAFETY: This function is always safe to call.
    unsafe { rust_binder_transaction_fd_send(t_debug_id as c_int, fd as c_int, offset) }
}

#[inline]
pub(crate) fn trace_transaction_fd_recv(t_debug_id: usize, fd: u32, offset: usize) {
    // SAFETY: This function is always safe to call.
    unsafe { rust_binder_transaction_fd_recv(t_debug_id as c_int, fd as c_int, offset) }
}

#[inline]
pub(crate) fn trace_transaction_alloc_buf(debug_id: usize, data: &BinderTransactionDataSg) {
    let data = data as *const BinderTransactionDataSg;
    // SAFETY: The `data` pointer is valid.
    unsafe { rust_binder_transaction_alloc_buf(debug_id as c_int, data.cast()) }
}

#[inline]
pub(crate) fn trace_transaction_buffer_release(debug_id: usize) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_transaction_buffer_release(debug_id as c_int) }
}

#[inline]
pub(crate) fn trace_transaction_failed_buffer_release(debug_id: usize) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_transaction_failed_buffer_release(debug_id as c_int) }
}

#[inline]
pub(crate) fn trace_transaction_update_buffer_release(debug_id: usize) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_transaction_update_buffer_release(debug_id as c_int) }
}

#[inline]
pub(crate) fn trace_update_page_range(pid: Pid, allocate: bool, start: usize, end: usize) {
    // SAFETY: Always safe to call.
    unsafe { rust_binder_update_page_range(pid as c_int, allocate, start, end) }
}
/// Generates a `trace_<name>(pid, page_index)` wrapper around the
/// corresponding `rust_binder_<name>` tracepoint for each listed event.
macro_rules! define_wrapper_lru_page_class {
    ($(fn $name:ident;)*) => {$(
        kernel::macros::paste! {
            #[inline]
            pub(crate) fn [< trace_ $name >](pid: Pid, page_index: usize) {
                // SAFETY: Always safe to call.
                unsafe { [< rust_binder_ $name >](pid as c_int, page_index) }
            }
        }
    )*}
}

define_wrapper_lru_page_class! {
    fn alloc_lru_start;
    fn alloc_lru_end;
    fn free_lru_start;
    fn free_lru_end;
    fn alloc_page_start;
    fn alloc_page_end;
    fn unmap_user_start;
    fn unmap_user_end;
    fn unmap_kernel_start;
    fn unmap_kernel_end;
}

/// Fires the tracepoint recording an incoming BC_* command word.
#[inline]
pub(crate) fn trace_command(cmd: u32) {
    // SAFETY: Trivially safe to call with primitive u32.
    unsafe { rust_binder_command(cmd) }
}

/// Fires the tracepoint recording an outgoing BR_* return word.
#[inline]
pub(crate) fn trace_return(ret: u32) {
    // SAFETY: Trivially safe to call with primitive u32.
    unsafe { rust_binder_return(ret) }
}

View File

@@ -0,0 +1,557 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::sync::atomic::{AtomicBool, Ordering};
use kernel::{
prelude::*,
seq_file::SeqFile,
seq_print,
sync::{Arc, SpinLock},
task::Kuid,
time::{ktime_ms_delta, Ktime},
types::ScopeGuard,
};
use crate::{
allocation::{Allocation, TranslatedFds},
defs::*,
error::{BinderError, BinderResult},
node::{Node, NodeRef},
prio::{self, BinderPriority, PriorityState},
process::{Process, ProcessInner},
ptr_align,
thread::{PushWorkRes, Thread},
BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead,
};
use core::mem::offset_of;
use kernel::bindings::rb_transaction_layout;
/// Field offsets of `Transaction`, exported so the C tracepoint code can read
/// these fields through a raw `rust_binder_transaction` pointer.
pub(crate) const TRANSACTION_LAYOUT: rb_transaction_layout = rb_transaction_layout {
    debug_id: offset_of!(Transaction, debug_id),
    code: offset_of!(Transaction, code),
    flags: offset_of!(Transaction, flags),
    from_thread: offset_of!(Transaction, from),
    to_proc: offset_of!(Transaction, to),
    target_node: offset_of!(Transaction, target_node),
};
/// An in-flight binder transaction (or reply) from `from` to `to`.
#[pin_data(PinnedDrop)]
pub(crate) struct Transaction {
    /// Unique id used in logs and tracepoints.
    pub(crate) debug_id: usize,
    /// Node this transaction targets; `None` for replies.
    target_node: Option<DArc<Node>>,
    /// Previous entry on the sender's transaction stack, if any.
    pub(crate) from_parent: Option<DArc<Transaction>>,
    /// Thread that issued this transaction.
    pub(crate) from: Arc<Thread>,
    /// Destination process.
    pub(crate) to: Arc<Process>,
    /// Buffer holding the copied payload; taken (set to `None`) on delivery
    /// or cancellation.
    #[pin]
    allocation: SpinLock<Option<Allocation>>,
    /// Whether this transaction is counted in `to`'s outstanding-txn count.
    is_outstanding: AtomicBool,
    /// Guards `on_thread_selected`'s priority update so it runs at most once.
    set_priority_called: AtomicBool,
    /// Priority the handling thread should be raised to.
    priority: BinderPriority,
    /// Handling thread's prior priority, saved so it can be restored later
    /// (restore path not visible in this file).
    #[pin]
    saved_priority: SpinLock<BinderPriority>,
    code: u32,
    pub(crate) flags: u32,
    data_size: usize,
    offsets_size: usize,
    data_address: usize,
    sender_euid: Kuid,
    /// Offset of the security context within the buffer, when the target node
    /// requested one.
    txn_security_ctx_off: Option<usize>,
    pub(crate) oneway_spam_detected: bool,
    /// Creation time; used to report elapsed ms in debug output.
    start_time: Ktime,
}

kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for Transaction { untracked; }
}
impl Transaction {
    /// Creates a new transaction targeting the node in `node_ref`, copying the
    /// payload described by `tr` out of the sender's address space.
    pub(crate) fn new(
        node_ref: NodeRef,
        from_parent: Option<DArc<Transaction>>,
        from: &Arc<Thread>,
        tr: &BinderTransactionDataSg,
    ) -> BinderResult<DLArc<Self>> {
        let debug_id = super::next_debug_id();
        let trd = &tr.transaction_data;
        let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0;
        let txn_security_ctx = node_ref.node.flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX != 0;
        let mut txn_security_ctx_off = if txn_security_ctx { Some(0) } else { None };
        let to = node_ref.node.owner.clone();
        let mut alloc = match from.copy_transaction_data(
            to.clone(),
            tr,
            debug_id,
            allow_fds,
            txn_security_ctx_off.as_mut(),
        ) {
            Ok(alloc) => alloc,
            Err(err) => {
                // A dead target is a routine outcome, so don't log it.
                if !err.is_dead() {
                    pr_warn!("Failure in copy_transaction_data: {:?}", err);
                }
                return Err(err);
            }
        };
        let oneway_spam_detected = alloc.oneway_spam_detected;
        if trd.flags & TF_ONE_WAY != 0 {
            if from_parent.is_some() {
                pr_warn!("Oneway transaction should not be in a transaction stack.");
                return Err(EINVAL.into());
            }
            alloc.set_info_oneway_node(node_ref.node.clone());
        }
        if trd.flags & TF_CLEAR_BUF != 0 {
            alloc.set_info_clear_on_drop();
        }
        let target_node = node_ref.node.clone();
        alloc.set_info_target_node(node_ref);
        let data_address = alloc.ptr;

        // Synchronous transactions inherit the sender's scheduling attributes
        // when its policy is supported; otherwise fall back to the process
        // default priority.
        let priority =
            if (trd.flags & TF_ONE_WAY == 0) && prio::is_supported_policy(from.task.policy()) {
                BinderPriority {
                    sched_policy: from.task.policy(),
                    prio: from.task.normal_prio(),
                }
            } else {
                from.process.default_priority
            };

        Ok(DTRWrap::arc_pin_init(pin_init!(Transaction {
            debug_id,
            target_node: Some(target_node),
            from_parent,
            sender_euid: from.process.cred.euid(),
            from: from.clone(),
            to,
            code: trd.code,
            flags: trd.flags,
            data_size: trd.data_size as _,
            offsets_size: trd.offsets_size as _,
            data_address,
            allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"),
            is_outstanding: AtomicBool::new(false),
            priority,
            saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
            set_priority_called: AtomicBool::new(false),
            txn_security_ctx_off,
            oneway_spam_detected,
            start_time: Ktime::ktime_get(),
        }))?)
    }

    /// Creates a reply transaction addressed to the process `to`, copying the
    /// payload described by `tr`. Replies have no target node and no parent.
    pub(crate) fn new_reply(
        from: &Arc<Thread>,
        to: Arc<Process>,
        tr: &BinderTransactionDataSg,
        allow_fds: bool,
    ) -> BinderResult<DLArc<Self>> {
        let debug_id = super::next_debug_id();
        let trd = &tr.transaction_data;
        let mut alloc = match from.copy_transaction_data(to.clone(), tr, debug_id, allow_fds, None)
        {
            Ok(alloc) => alloc,
            Err(err) => {
                pr_warn!("Failure in copy_transaction_data: {:?}", err);
                return Err(err);
            }
        };
        let oneway_spam_detected = alloc.oneway_spam_detected;
        if trd.flags & TF_CLEAR_BUF != 0 {
            alloc.set_info_clear_on_drop();
        }
        Ok(DTRWrap::arc_pin_init(pin_init!(Transaction {
            debug_id,
            target_node: None,
            from_parent: None,
            sender_euid: from.process.task.euid(),
            from: from.clone(),
            to,
            code: trd.code,
            flags: trd.flags,
            data_size: trd.data_size as _,
            offsets_size: trd.offsets_size as _,
            data_address: alloc.ptr,
            allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"),
            is_outstanding: AtomicBool::new(false),
            priority: BinderPriority::default(),
            saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
            set_priority_called: AtomicBool::new(false),
            txn_security_ctx_off: None,
            oneway_spam_detected,
            start_time: Ktime::ktime_get(),
        }))?)
    }

    /// Writes a one-line description of this transaction to `m`, prefixed by
    /// `prefix` (used by the binder_logs debug files).
    #[inline(never)]
    pub(crate) fn debug_print_inner(&self, m: &SeqFile, prefix: &str) {
        seq_print!(
            m,
            "{}{}: from {}:{} to {} code {:x} flags {:x} pri {}:{} elapsed {}ms",
            prefix,
            self.debug_id,
            self.from.process.task.pid(),
            self.from.id,
            self.to.task.pid(),
            self.code,
            self.flags,
            self.priority.sched_policy,
            self.priority.prio,
            ktime_ms_delta(Ktime::ktime_get(), self.start_time),
        );
        if let Some(target_node) = &self.target_node {
            seq_print!(m, " node {}", target_node.debug_id);
        }
        seq_print!(m, " size {}:{}\n", self.data_size, self.offsets_size);
    }

    /// Returns a copy of the priority saved in `on_thread_selected`.
    pub(crate) fn saved_priority(&self) -> BinderPriority {
        *self.saved_priority.lock()
    }

    /// Determines if the transaction is stacked on top of the given transaction.
    pub(crate) fn is_stacked_on(&self, onext: &Option<DArc<Self>>) -> bool {
        match (&self.from_parent, onext) {
            (None, None) => true,
            (Some(from_parent), Some(next)) => Arc::ptr_eq(from_parent, next),
            _ => false,
        }
    }

    /// Returns a pointer to the next transaction on the transaction stack, if there is one.
    pub(crate) fn clone_next(&self) -> Option<DArc<Self>> {
        Some(self.from_parent.as_ref()?.clone())
    }

    /// Searches in the transaction stack for a thread that belongs to the target process. This is
    /// useful when finding a target for a new transaction: if the node belongs to a process that
    /// is already part of the transaction stack, we reuse the thread.
    fn find_target_thread(&self) -> Option<Arc<Thread>> {
        let mut it = &self.from_parent;
        while let Some(transaction) = it {
            if Arc::ptr_eq(&transaction.from.process, &self.to) {
                return Some(transaction.from.clone());
            }
            it = &transaction.from_parent;
        }
        None
    }

    /// Searches in the transaction stack for a transaction originating at the given thread.
    pub(crate) fn find_from(&self, thread: &Thread) -> Option<DArc<Transaction>> {
        let mut it = &self.from_parent;
        while let Some(transaction) = it {
            if core::ptr::eq(thread, transaction.from.as_ref()) {
                return Some(transaction.clone());
            }
            it = &transaction.from_parent;
        }
        None
    }

    /// Counts this transaction towards the target process's outstanding
    /// transactions. Idempotent via the `is_outstanding` flag.
    pub(crate) fn set_outstanding(&self, to_process: &mut ProcessInner) {
        // No race because this method is only called once.
        if !self.is_outstanding.load(Ordering::Relaxed) {
            self.is_outstanding.store(true, Ordering::Relaxed);
            to_process.add_outstanding_txn();
        }
    }

    /// Decrement `outstanding_txns` in `to` if it hasn't already been decremented.
    fn drop_outstanding_txn(&self) {
        // No race because this is called at most twice, and one of the calls are in the
        // destructor, which is guaranteed to not race with any other operations on the
        // transaction. It also cannot race with `set_outstanding`, since submission happens
        // before delivery.
        if self.is_outstanding.load(Ordering::Relaxed) {
            self.is_outstanding.store(false, Ordering::Relaxed);
            self.to.drop_outstanding_txn();
        }
    }

    /// Submits the transaction to a work queue. Uses a thread if there is one in the transaction
    /// stack, otherwise uses the destination process.
    ///
    /// Not used for replies.
    pub(crate) fn submit(self: DLArc<Self>) -> BinderResult {
        crate::trace::trace_transaction(false, &self);
        // Defined before `process_inner` so that the destructor runs after releasing the lock.
        let mut _t_outdated;
        let oneway = self.flags & TF_ONE_WAY != 0;
        let process = self.to.clone();
        let mut process_inner = process.inner.lock();
        self.set_outstanding(&mut process_inner);
        if oneway {
            if let Some(target_node) = self.target_node.clone() {
                if process_inner.is_frozen {
                    process_inner.async_recv = true;
                    // TF_UPDATE_TXN lets a newer oneway transaction replace a
                    // queued, superseded one while the process is frozen.
                    if self.flags & TF_UPDATE_TXN != 0 {
                        if let Some(t_outdated) =
                            target_node.take_outdated_transaction(&self, &mut process_inner)
                        {
                            crate::trace::trace_transaction_update_buffer_release(
                                t_outdated.debug_id,
                            );
                            // Save the transaction to be dropped after locks are released.
                            _t_outdated = t_outdated;
                        }
                    }
                }
                match target_node.submit_oneway(self, &mut process_inner) {
                    Ok(()) => {}
                    Err((err, work)) => {
                        drop(process_inner);
                        // Drop work after releasing process lock.
                        drop(work);
                        return Err(err);
                    }
                }
                if process_inner.is_frozen {
                    return Err(BinderError::new_frozen_oneway());
                } else {
                    return Ok(());
                }
            } else {
                pr_err!("Failed to submit oneway transaction to node.");
            }
        }
        if process_inner.is_frozen {
            process_inner.sync_recv = true;
            return Err(BinderError::new_frozen());
        }
        let res = if let Some(thread) = self.find_target_thread() {
            match thread.push_work(self) {
                PushWorkRes::Ok => Ok(()),
                PushWorkRes::FailedDead(me) => Err((BinderError::new_dead(), me)),
            }
        } else {
            process_inner.push_work(self)
        };
        drop(process_inner);
        match res {
            Ok(()) => Ok(()),
            Err((err, work)) => {
                // Drop work after releasing process lock.
                drop(work);
                Err(err)
            }
        }
    }

    /// Check whether one oneway transaction can supersede another.
    pub(crate) fn can_replace(&self, old: &Transaction) -> bool {
        if self.from.process.task.pid() != old.from.process.task.pid() {
            return false;
        }
        if self.flags & old.flags & (TF_ONE_WAY | TF_UPDATE_TXN) != (TF_ONE_WAY | TF_UPDATE_TXN) {
            return false;
        }
        let target_node_match = match (self.target_node.as_ref(), old.target_node.as_ref()) {
            (None, None) => true,
            (Some(tn1), Some(tn2)) => Arc::ptr_eq(tn1, tn2),
            _ => false,
        };
        self.code == old.code && self.flags == old.flags && target_node_match
    }

    /// Translates the file descriptors embedded in the payload. On failure the
    /// allocation is dropped (freed) rather than put back. Returns ESRCH if
    /// the allocation was already taken.
    fn prepare_file_list(&self) -> Result<TranslatedFds> {
        let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?;
        match alloc.translate_fds() {
            Ok(translated) => {
                *self.allocation.lock() = Some(alloc);
                Ok(translated)
            }
            Err(err) => {
                // Free the allocation eagerly.
                drop(alloc);
                Err(err)
            }
        }
    }
}
impl DeliverToRead for Transaction {
fn do_work(
self: DArc<Self>,
thread: &Thread,
writer: &mut BinderReturnWriter<'_>,
) -> Result<bool> {
let send_failed_reply = ScopeGuard::new(|| {
if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
let reply = Err(BR_FAILED_REPLY);
self.from.deliver_reply(reply, &self);
}
self.drop_outstanding_txn();
});
// Update thread priority. This only has an effect if the transaction is delivered via the
// process work list, since the priority has otherwise already been updated.
self.on_thread_selected(thread);
let files = if let Ok(list) = self.prepare_file_list() {
list
} else {
// On failure to process the list, we send a reply back to the sender and ignore the
// transaction on the recipient.
return Ok(true);
};
let mut tr_sec = BinderTransactionDataSecctx::default();
let tr = tr_sec.tr_data();
if let Some(target_node) = &self.target_node {
let (ptr, cookie) = target_node.get_id();
tr.target.ptr = ptr as _;
tr.cookie = cookie as _;
};
tr.code = self.code;
tr.flags = self.flags;
tr.data_size = self.data_size as _;
tr.data.ptr.buffer = self.data_address as _;
tr.offsets_size = self.offsets_size as _;
if tr.offsets_size > 0 {
tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size)) as _;
}
tr.sender_euid = self.sender_euid.into_uid_in_current_ns();
tr.sender_pid = 0;
if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
// Not a reply and not one-way.
tr.sender_pid = self.from.process.task.pid_in_current_ns();
}
let code = if self.target_node.is_none() {
BR_REPLY
} else if self.txn_security_ctx_off.is_some() {
BR_TRANSACTION_SEC_CTX
} else {
BR_TRANSACTION
};
// Write the transaction code and data to the user buffer.
writer.write_code(code)?;
if let Some(off) = self.txn_security_ctx_off {
tr_sec.secctx = (self.data_address + off) as u64;
writer.write_payload(&tr_sec)?;
} else {
writer.write_payload(&*tr)?;
}
let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?;
// Dismiss the completion of transaction with a failure. No failure paths are allowed from
// here on out.
send_failed_reply.dismiss();
// Commit files, and set FDs in FDA to be closed on buffer free.
let close_on_free = files.commit();
alloc.set_info_close_on_free(close_on_free);
// It is now the user's responsibility to clear the allocation.
alloc.keep_alive();
self.drop_outstanding_txn();
crate::trace::trace_transaction_received(&self);
// When this is not a reply and not a oneway transaction, update `current_transaction`. If
// it's a reply, `current_transaction` has already been updated appropriately.
if self.target_node.is_some() && tr_sec.transaction_data.flags & TF_ONE_WAY == 0 {
thread.set_current_transaction(self);
}
Ok(false)
}
fn cancel(self: DArc<Self>) {
let allocation = self.allocation.lock().take();
drop(allocation);
// If this is not a reply or oneway transaction, then send a dead reply.
if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
let reply = Err(BR_DEAD_REPLY);
self.from.deliver_reply(reply, &self);
}
self.drop_outstanding_txn();
}
/// Applies priority inheritance to the thread selected to service this
/// transaction: boosts `to_thread` to the transaction's desired priority
/// (possibly raised to the target node's minimum priority) and records the
/// thread's prior priority so it can be restored later.
fn on_thread_selected(&self, to_thread: &Thread) {
    // Return immediately if reply.
    let target_node = match self.target_node.as_ref() {
        Some(target_node) => target_node,
        None => return,
    };

    // We only need to do this once.
    if self.set_priority_called.swap(true, Ordering::Relaxed) {
        return;
    }

    crate::trace::trace_transaction_thread_selected(self, to_thread);

    let node_prio = target_node.node_prio();
    // Start from the priority carried by the transaction itself.
    let mut desired = self.priority;

    // If the target node does not allow inheriting real-time policies,
    // demote an RT request to the default normal-policy priority.
    if !target_node.inherit_rt() && prio::is_rt_policy(desired.sched_policy) {
        desired.prio = prio::DEFAULT_PRIO;
        desired.sched_policy = prio::SCHED_NORMAL;
    }

    if node_prio.prio < self.priority.prio
        || (node_prio.prio == self.priority.prio && node_prio.sched_policy == prio::SCHED_FIFO)
    {
        // In case the minimum priority on the node is
        // higher (lower value), use that priority. If
        // the priority is the same, but the node uses
        // SCHED_FIFO, prefer SCHED_FIFO, since it can
        // run unbounded, unlike SCHED_RR.
        desired = node_prio;
    }

    let mut prio_state = to_thread.prio_lock.lock();
    if prio_state.state == PriorityState::Pending {
        // Task is in the process of changing priorities
        // saving its current values would be incorrect.
        // Instead, save the pending priority and signal
        // the task to abort the priority restore.
        prio_state.state = PriorityState::Abort;
        *self.saved_priority.lock() = prio_state.next;
    } else {
        // No change in flight: snapshot the task's current policy and
        // normal priority as the values to restore.
        let task = &*self.to.task;
        let mut saved_priority = self.saved_priority.lock();
        saved_priority.sched_policy = task.policy();
        saved_priority.prio = task.normal_prio();
    }
    // Release prio_lock before changing the priority to avoid holding it
    // across set_priority.
    drop(prio_state);

    to_thread.set_priority(&desired, self);
}
fn should_sync_wakeup(&self) -> bool {
    // A synchronous wakeup is only appropriate when a sender is blocked
    // waiting for the result, i.e. for non-oneway transactions.
    let oneway = self.flags & TF_ONE_WAY != 0;
    !oneway
}
/// Writes this transaction's debug state to `m`. Delegates to the shared
/// inner printer; `_prefix` is unused for transactions. Always succeeds.
fn debug_print(&self, m: &SeqFile, _prefix: &str, tprefix: &str) -> Result<()> {
    self.debug_print_inner(m, tprefix);
    Ok(())
}
}
#[pinned_drop]
impl PinnedDrop for Transaction {
    fn drop(self: Pin<&mut Self>) {
        // Ensure the outstanding-transaction accounting is released when
        // the transaction object is destroyed, regardless of which path
        // (delivery, cancel, or error) led here.
        self.drop_outstanding_txn();
    }
}

View File

@@ -109,6 +109,7 @@ struct binder_alloc {
int pid;
size_t pages_high;
bool oneway_spam_detected;
ANDROID_OEM_DATA(1);
};
#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST

View File

@@ -105,6 +105,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpu_idle_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mpam_set);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_wq_lockup_pool);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_and_link_pwqs);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_workqueue);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_create_worker);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ipi_stop);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sysrq_crash);
@@ -130,6 +131,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_si_mem_available_adjust);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_si_meminfo_adjust);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_si_meminfo_adjust_shmem);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_fill_prdt);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_complete_init);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ufs_reprogram_all_keys);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_prepare_command);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_update_sysfs);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_command);
@@ -151,6 +154,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_f2fs_restore_priority);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_f2fs_printk);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_f2fs_create);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_io_statistics);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_dpm_prepare);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ogki_check_vip_status);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ogki_task_util);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_ogki_uclamp_task_util);
@@ -399,6 +403,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_count_workingset_refault);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sk_clone_lock);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_free_unref_folios_to_pcp_bypass);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_fail);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_finish);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cma_alloc_busy_info);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_vmalloc_node_bypass);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_vfree_bypass);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_modify_scan_control);
@@ -527,3 +534,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_direct_reclaim_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_may_oom_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_calculate_totalreserve_pages);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_cold_pageout_skip);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rmqueue_pcplist_override_batch);

View File

@@ -1869,7 +1869,9 @@ int dpm_prepare(pm_message_t state)
* disable probing of devices. This sync point is important at least
* at boot time + hibernation restore.
*/
trace_android_rvh_dpm_prepare(0);
wait_for_device_probe();
trace_android_rvh_dpm_prepare(1);
/*
* It is unsafe if probing of devices will happen during suspend or
* hibernation and system behavior will be unpredictable in this case.

View File

@@ -311,6 +311,13 @@ static void loop_clear_limits(struct loop_device *lo, int mode)
lim.discard_granularity = 0;
}
/*
* XXX: this updates the queue limits without freezing the queue, which
* is against the locking protocol and dangerous. But we can't just
* freeze the queue as we're inside the ->queue_rq method here. So this
* should move out into a workqueue unless we get the file operations to
* advertise if they support specific fallocate operations.
*/
queue_limits_commit_update(lo->lo_queue, &lim);
}
@@ -770,12 +777,11 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group);
}
static void loop_config_discard(struct loop_device *lo,
struct queue_limits *lim)
static void loop_get_discard_config(struct loop_device *lo,
u32 *granularity, u32 *max_discard_sectors)
{
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
u32 granularity = 0, max_discard_sectors = 0;
struct kstatfs sbuf;
/*
@@ -786,27 +792,19 @@ static void loop_config_discard(struct loop_device *lo,
* file-backed loop devices: discarded regions read back as zero.
*/
if (S_ISBLK(inode->i_mode)) {
struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
struct block_device *bdev = I_BDEV(inode);
max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
granularity = bdev_discard_granularity(I_BDEV(inode)) ?:
queue_physical_block_size(backingq);
*max_discard_sectors = bdev_write_zeroes_sectors(bdev);
*granularity = bdev_discard_granularity(bdev);
/*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard.
*/
} else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
max_discard_sectors = UINT_MAX >> 9;
granularity = sbuf.f_bsize;
*max_discard_sectors = UINT_MAX >> 9;
*granularity = sbuf.f_bsize;
}
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_write_zeroes_sectors = max_discard_sectors;
if (max_discard_sectors)
lim->discard_granularity = granularity;
else
lim->discard_granularity = 0;
}
struct loop_worker {
@@ -986,12 +984,13 @@ static unsigned int loop_default_blocksize(struct loop_device *lo,
return SECTOR_SIZE;
}
static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
unsigned int bsize)
{
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
struct block_device *backing_bdev = NULL;
struct queue_limits lim;
u32 granularity = 0, max_discard_sectors = 0;
if (S_ISBLK(inode->i_mode))
backing_bdev = I_BDEV(inode);
@@ -1001,17 +1000,22 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
if (!bsize)
bsize = loop_default_blocksize(lo, backing_bdev);
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
lim.io_min = bsize;
lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
loop_get_discard_config(lo, &granularity, &max_discard_sectors);
lim->logical_block_size = bsize;
lim->physical_block_size = bsize;
lim->io_min = bsize;
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
lim.features |= BLK_FEAT_WRITE_CACHE;
lim->features |= BLK_FEAT_WRITE_CACHE;
if (backing_bdev && !bdev_nonrot(backing_bdev))
lim.features |= BLK_FEAT_ROTATIONAL;
loop_config_discard(lo, &lim);
return queue_limits_commit_update(lo->lo_queue, &lim);
lim->features |= BLK_FEAT_ROTATIONAL;
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_write_zeroes_sectors = max_discard_sectors;
if (max_discard_sectors)
lim->discard_granularity = granularity;
else
lim->discard_granularity = 0;
}
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
@@ -1020,6 +1024,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
{
struct file *file = fget(config->fd);
struct address_space *mapping;
struct queue_limits lim;
int error;
loff_t size;
bool partscan;
@@ -1091,7 +1096,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
error = loop_reconfigure_limits(lo, config->block_size);
lim = queue_limits_start_update(lo->lo_queue);
loop_update_limits(lo, &lim, config->block_size);
/* No need to freeze the queue as the device isn't bound yet. */
error = queue_limits_commit_update(lo->lo_queue, &lim);
if (error)
goto out_unlock;
@@ -1151,7 +1159,12 @@ static void __loop_clr_fd(struct loop_device *lo)
lo->lo_sizelimit = 0;
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
/* reset the block size to the default */
/*
* Reset the block size to the default.
*
* No queue freezing needed because this is called from the final
* ->release call only, so there can't be any outstanding I/O.
*/
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = SECTOR_SIZE;
lim.physical_block_size = SECTOR_SIZE;
@@ -1459,6 +1472,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
{
struct queue_limits lim;
int err = 0;
if (lo->lo_state != Lo_bound)
@@ -1470,8 +1484,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
sync_blockdev(lo->lo_device);
invalidate_bdev(lo->lo_device);
lim = queue_limits_start_update(lo->lo_queue);
loop_update_limits(lo, &lim, arg);
blk_mq_freeze_queue(lo->lo_queue);
err = loop_reconfigure_limits(lo, arg);
err = queue_limits_commit_update(lo->lo_queue, &lim);
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue);

View File

@@ -327,8 +327,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
{
struct queue_limits lim;
int error;
@@ -368,7 +367,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
if (error)
return error;
@@ -379,18 +378,6 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
return 0;
}
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
int error;
blk_mq_freeze_queue(nbd->disk->queue);
error = __nbd_set_size(nbd, bytesize, blksize);
blk_mq_unfreeze_queue(nbd->disk->queue);
return error;
}
static void nbd_complete_rq(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);

Some files were not shown because too many files have changed in this diff Show More