ANDROID: rust_binder: move Rust Binder in preparation for GKI module

This moves the code from the common-modules/binder branch verbatim. The
code is not yet added to the build system, as it requires some adjustments
before it can be built as a GKI module.

Bug: 388786466
Change-Id: I6b49b1b6ff0e35fbae2b9efc13d6bbde984b5196
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
This commit is contained in:
Alice Ryhl
2025-03-13 09:17:10 +00:00
committed by Matthew Maurer
parent 8313296331
commit dac7c66bc9
23 changed files with 10159 additions and 0 deletions

View File

@@ -0,0 +1,611 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::mem::{size_of, size_of_val, MaybeUninit};
use core::ops::Range;
use kernel::{
bindings,
fs::file::{File, FileDescriptorReservation},
prelude::*,
sync::Arc,
types::{ARef, AsBytes, FromBytes},
uaccess::UserSliceReader,
uapi,
};
use crate::{
deferred_close::DeferredFdCloser,
defs::*,
node::{Node, NodeRef},
process::Process,
DArc,
};
/// Metadata attached to an in-use [`Allocation`] describing the cleanup work that must happen
/// when the buffer is eventually freed.
#[derive(Default)]
pub(crate) struct AllocationInfo {
    /// Range within the allocation where we can find the offsets to the object descriptors.
    pub(crate) offsets: Option<Range<usize>>,
    /// The target node of the transaction this allocation is associated to.
    /// Not set for replies.
    pub(crate) target_node: Option<NodeRef>,
    /// When this allocation is dropped, call `pending_oneway_finished` on the node.
    ///
    /// This is used to serialize oneway transaction on the same node. Binder guarantees that
    /// oneway transactions to the same node are delivered sequentially in the order they are sent.
    pub(crate) oneway_node: Option<DArc<Node>>,
    /// Zero the data in the buffer on free.
    pub(crate) clear_on_free: bool,
    /// List of files embedded in this transaction.
    file_list: FileList,
}
/// Represents an allocation that the kernel is currently using.
///
/// When allocations are idle, the range allocator holds the data related to them.
///
/// # Invariants
///
/// This allocation corresponds to an allocation in the range allocator, so the relevant pages are
/// marked in use in the page range.
pub(crate) struct Allocation {
    /// Byte offset of this allocation inside the process's buffer.
    pub(crate) offset: usize,
    /// Length of the allocation in bytes.
    size: usize,
    /// Userspace address of the buffer (used as a key when freeing the raw buffer).
    pub(crate) ptr: usize,
    /// The process that owns the buffer this allocation lives in.
    pub(crate) process: Arc<Process>,
    /// Cleanup metadata; `None` until any of the `set_info*` methods is called.
    allocation_info: Option<AllocationInfo>,
    /// Whether `drop` should release the buffer back to the process. Cleared by `keep_alive`.
    free_on_drop: bool,
    /// Whether the oneway spam detection logic flagged this allocation.
    pub(crate) oneway_spam_detected: bool,
    #[allow(dead_code)]
    pub(crate) debug_id: usize,
}
impl Allocation {
    /// Wrap an already-reserved range of `process`'s buffer in an `Allocation`.
    ///
    /// The returned value frees the range on drop unless [`Allocation::keep_alive`] is called.
    pub(crate) fn new(
        process: Arc<Process>,
        debug_id: usize,
        offset: usize,
        size: usize,
        ptr: usize,
        oneway_spam_detected: bool,
    ) -> Self {
        Self {
            process,
            offset,
            size,
            ptr,
            debug_id,
            oneway_spam_detected,
            allocation_info: None,
            free_on_drop: true,
        }
    }

    /// Check that `offset..offset+size` lies entirely within this allocation.
    ///
    /// Returns `EFAULT` on overflow or if the end of the range exceeds `self.size`.
    fn size_check(&self, offset: usize, size: usize) -> Result {
        let overflow_fail = offset.checked_add(size).is_none();
        // If the addition wrapped, `overflow_fail` is already true, so using `wrapping_add`
        // here cannot let an out-of-bounds range slip through.
        let cmp_size_fail = offset.wrapping_add(size) > self.size;
        if overflow_fail || cmp_size_fail {
            return Err(EFAULT);
        }
        Ok(())
    }

    /// Copy `size` bytes from the userspace `reader` into this allocation at `offset`.
    pub(crate) fn copy_into(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        self.size_check(offset, size)?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe {
            self.process
                .pages
                .copy_from_user_slice(reader, self.offset + offset, size)
        }
    }

    /// Read a `T` from this allocation at `offset`, bounds-checked.
    pub(crate) fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        self.size_check(offset, size_of::<T>())?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.read(self.offset + offset) }
    }

    /// Write `obj` into this allocation at `offset`, bounds-checked.
    pub(crate) fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        self.size_check(offset, size_of_val::<T>(obj))?;
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.write(self.offset + offset, obj) }
    }

    /// Zero the entire allocation.
    pub(crate) fn fill_zero(&self) -> Result {
        // SAFETY: While this object exists, the range allocator will keep the range allocated, and
        // in turn, the pages will be marked as in use.
        unsafe { self.process.pages.fill_zero(self.offset, self.size) }
    }

    /// Consume this `Allocation` without freeing the underlying buffer.
    ///
    /// Ownership of the buffer (and its cleanup metadata) is handed back to the process, which
    /// will free it later via `buffer_make_freeable`.
    pub(crate) fn keep_alive(mut self) {
        self.process
            .buffer_make_freeable(self.offset, self.allocation_info.take());
        self.free_on_drop = false;
    }

    /// Replace the cleanup metadata wholesale.
    pub(crate) fn set_info(&mut self, info: AllocationInfo) {
        self.allocation_info = Some(info);
    }

    /// Get the cleanup metadata, creating a default-initialized instance on first use.
    pub(crate) fn get_or_init_info(&mut self) -> &mut AllocationInfo {
        self.allocation_info.get_or_insert_with(Default::default)
    }

    /// Record where the object-descriptor offsets live inside this buffer.
    pub(crate) fn set_info_offsets(&mut self, offsets: Range<usize>) {
        self.get_or_init_info().offsets = Some(offsets);
    }

    /// Record the node whose `pending_oneway_finished` must run when this buffer is freed.
    pub(crate) fn set_info_oneway_node(&mut self, oneway_node: DArc<Node>) {
        self.get_or_init_info().oneway_node = Some(oneway_node);
    }

    /// Request that the buffer contents be zeroed when freed.
    pub(crate) fn set_info_clear_on_drop(&mut self) {
        self.get_or_init_info().clear_on_free = true;
    }

    /// Record the target node of the transaction using this buffer.
    pub(crate) fn set_info_target_node(&mut self, target_node: NodeRef) {
        self.get_or_init_info().target_node = Some(target_node);
    }

    /// Reserve enough space to push at least `num_fds` fds.
    pub(crate) fn info_add_fd_reserve(&mut self, num_fds: usize) -> Result {
        self.get_or_init_info()
            .file_list
            .files_to_translate
            .reserve(num_fds, GFP_KERNEL)?;
        Ok(())
    }

    /// Queue `file` for translation into an fd in the recipient process.
    ///
    /// `buffer_offset` is where the resulting fd value will be written into the buffer, and
    /// `close_on_free` marks whether that fd must be closed when the buffer is freed.
    pub(crate) fn info_add_fd(
        &mut self,
        file: ARef<File>,
        buffer_offset: usize,
        close_on_free: bool,
    ) -> Result {
        self.get_or_init_info().file_list.files_to_translate.push(
            FileEntry {
                file,
                buffer_offset,
                close_on_free,
            },
            GFP_KERNEL,
        )?;
        Ok(())
    }

    /// Store the list of fds that must be closed when this buffer is freed.
    pub(crate) fn set_info_close_on_free(&mut self, cof: FdsCloseOnFree) {
        self.get_or_init_info().file_list.close_on_free = cof.0;
    }

    /// Reserve an fd in the current process for every queued file and write the fd values into
    /// the buffer.
    ///
    /// The reservations are not committed here; the caller decides whether to install them (see
    /// [`TranslatedFds::commit`]).
    pub(crate) fn translate_fds(&mut self) -> Result<TranslatedFds> {
        let file_list = match self.allocation_info.as_mut() {
            Some(info) => &mut info.file_list,
            None => return Ok(TranslatedFds::new()),
        };

        let files = core::mem::take(&mut file_list.files_to_translate);

        let num_close_on_free = files.iter().filter(|entry| entry.close_on_free).count();
        let mut close_on_free = Vec::with_capacity(num_close_on_free, GFP_KERNEL)?;

        let mut reservations = Vec::with_capacity(files.len(), GFP_KERNEL)?;
        for file_info in files {
            let res = FileDescriptorReservation::get_unused_fd_flags(bindings::O_CLOEXEC)?;
            let fd = res.reserved_fd();
            self.write::<u32>(file_info.buffer_offset, &fd)?;
            crate::trace::trace_transaction_fd_recv(self.debug_id, fd, file_info.buffer_offset);

            reservations.push(
                Reservation {
                    res,
                    file: file_info.file,
                },
                GFP_KERNEL,
            )?;
            if file_info.close_on_free {
                close_on_free.push(fd, GFP_KERNEL)?;
            }
        }

        Ok(TranslatedFds {
            reservations,
            close_on_free: FdsCloseOnFree(close_on_free),
        })
    }

    /// Should the looper return to userspace when freeing this allocation?
    pub(crate) fn looper_need_return_on_free(&self) -> bool {
        // Closing fds involves pushing task_work for execution when we return to userspace. Hence,
        // we should return to userspace asap if we are closing fds.
        match self.allocation_info {
            Some(ref info) => !info.file_list.close_on_free.is_empty(),
            None => false,
        }
    }
}
impl Drop for Allocation {
    /// Run all deferred cleanup recorded in `AllocationInfo`, then release the buffer.
    ///
    /// Cleanup order: notify the oneway node, release refcounts held by embedded binder
    /// objects, close any close-on-free fds, optionally zero the buffer, and finally free
    /// the raw buffer range.
    fn drop(&mut self) {
        if !self.free_on_drop {
            return;
        }

        if let Some(mut info) = self.allocation_info.take() {
            if let Some(oneway_node) = info.oneway_node.as_ref() {
                oneway_node.pending_oneway_finished();
            }

            info.target_node = None;

            if let Some(offsets) = info.offsets.clone() {
                // Limit the view to the start of the offsets array; the object descriptors
                // referenced by the offsets are read through `cleanup_object`.
                let view = AllocationView::new(self, offsets.start);
                for i in offsets.step_by(size_of::<usize>()) {
                    if view.cleanup_object(i).is_err() {
                        pr_warn!("Error cleaning up object at offset {}\n", i)
                    }
                }
            }

            for &fd in &info.file_list.close_on_free {
                let closer = match DeferredFdCloser::new(GFP_KERNEL) {
                    Ok(closer) => closer,
                    Err(kernel::alloc::AllocError) => {
                        // Ignore allocation failures.
                        break;
                    }
                };

                // Here, we ignore errors. The operation can fail if the fd is not valid, or if the
                // method is called from a kthread. However, this is always called from a syscall,
                // so the latter case cannot happen, and we don't care about the first case.
                let _ = closer.close_fd(fd);
            }

            if info.clear_on_free {
                if let Err(e) = self.fill_zero() {
                    pr_warn!("Failed to clear data on free: {:?}", e);
                }
            }
        }

        self.process.buffer_raw_free(self.ptr);
    }
}
/// A wrapper around `Allocation` that is being created.
///
/// If the allocation is destroyed while wrapped in this wrapper, then the allocation will be
/// considered to be part of a failed transaction. Successful transactions avoid that by calling
/// `success`, which skips the destructor.
#[repr(transparent)]
pub(crate) struct NewAllocation(pub(crate) Allocation);
impl NewAllocation {
    /// Mark the transaction as successful, unwrapping the inner `Allocation` without running
    /// this wrapper's `Drop` (which would emit a failed-buffer-release trace event).
    pub(crate) fn success(self) -> Allocation {
        // This skips the destructor.
        //
        // SAFETY: This type is `#[repr(transparent)]`, so the layout matches.
        unsafe { core::mem::transmute(self) }
    }
}
// Allow a `NewAllocation` to be used wherever an `&Allocation` is expected.
impl core::ops::Deref for NewAllocation {
    type Target = Allocation;
    fn deref(&self) -> &Allocation {
        &self.0
    }
}

impl core::ops::DerefMut for NewAllocation {
    fn deref_mut(&mut self) -> &mut Allocation {
        &mut self.0
    }
}

impl Drop for NewAllocation {
    /// Emit the failed-buffer-release trace event. Runs only when the transaction failed;
    /// successful transactions bypass this via [`NewAllocation::success`].
    fn drop(&mut self) {
        crate::trace::trace_transaction_failed_buffer_release(self.debug_id);
    }
}
/// A view into the beginning of an allocation.
///
/// All attempts to read or write outside of the view will fail. To intentionally access outside of
/// this view, use the `alloc` field of this struct directly.
pub(crate) struct AllocationView<'a> {
    pub(crate) alloc: &'a mut Allocation,
    /// Exclusive upper bound (in bytes) for reads and writes through this view.
    limit: usize,
}
impl<'a> AllocationView<'a> {
    /// Create a view of `alloc` restricted to the first `limit` bytes.
    pub(crate) fn new(alloc: &'a mut Allocation, limit: usize) -> Self {
        AllocationView { alloc, limit }
    }

    /// Read a `T` at `offset`, failing with `EINVAL` if it would cross the view's limit.
    pub(crate) fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        if offset.checked_add(size_of::<T>()).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.read(offset)
    }

    /// Write `obj` at `offset`, failing with `EINVAL` if it would cross the view's limit.
    pub(crate) fn write<T: AsBytes>(&self, offset: usize, obj: &T) -> Result {
        if offset.checked_add(size_of::<T>()).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.write(offset, obj)
    }

    /// Copy `size` bytes from userspace at `offset`, respecting the view's limit.
    pub(crate) fn copy_into(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        if offset.checked_add(size).ok_or(EINVAL)? > self.limit {
            return Err(EINVAL);
        }
        self.alloc.copy_into(reader, offset, size)
    }

    /// Translate a binder object being sent to `self.alloc.process` and write the translated
    /// object into the buffer at `offset`.
    ///
    /// If the recipient owns the node, the object becomes a (weak-)binder object; otherwise a
    /// (weak-)handle object referencing a handle in the recipient's table. In both cases the
    /// appropriate refcount is taken so the buffer's later cleanup can release it.
    pub(crate) fn transfer_binder_object(
        &self,
        offset: usize,
        obj: &uapi::flat_binder_object,
        strong: bool,
        node_ref: NodeRef,
    ) -> Result {
        let mut newobj = FlatBinderObject::default();
        let node = node_ref.node.clone();
        if Arc::ptr_eq(&node_ref.node.owner, &self.alloc.process) {
            // The receiving process is the owner of the node, so send it a binder object (instead
            // of a handle).
            let (ptr, cookie) = node.get_id();
            newobj.hdr.type_ = if strong {
                BINDER_TYPE_BINDER
            } else {
                BINDER_TYPE_WEAK_BINDER
            };
            newobj.flags = obj.flags;
            newobj.__bindgen_anon_1.binder = ptr as _;
            newobj.cookie = cookie as _;
            self.write(offset, &newobj)?;
            // Increment the user ref count on the node. It will be decremented as part of the
            // destruction of the buffer, when we see a binder or weak-binder object.
            node.update_refcount(true, 1, strong);
        } else {
            // The receiving process is different from the owner, so we need to insert a handle to
            // the binder object.
            let handle = self
                .alloc
                .process
                .as_arc_borrow()
                .insert_or_update_handle(node_ref, false)?;
            newobj.hdr.type_ = if strong {
                BINDER_TYPE_HANDLE
            } else {
                BINDER_TYPE_WEAK_HANDLE
            };
            newobj.flags = obj.flags;
            newobj.__bindgen_anon_1.handle = handle;
            if self.write(offset, &newobj).is_err() {
                // Decrement ref count on the handle we just created.
                let _ = self
                    .alloc
                    .process
                    .as_arc_borrow()
                    .update_ref(handle, false, strong);
                return Err(EINVAL);
            }
        }

        crate::trace::trace_transaction_node_send(self.alloc.debug_id, &node, obj, &newobj);
        Ok(())
    }

    /// Release the refcount held by the object descriptor whose offset is stored at
    /// `index_offset` in the buffer. Called while freeing the buffer.
    fn cleanup_object(&self, index_offset: usize) -> Result {
        // `index_offset` points at an entry in the offsets array; the entry is the offset of
        // the object descriptor itself.
        let offset = self.alloc.read(index_offset)?;
        let header = self.read::<BinderObjectHeader>(offset)?;
        match header.type_ {
            BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => {
                let obj = self.read::<FlatBinderObject>(offset)?;
                let strong = header.type_ == BINDER_TYPE_BINDER;
                // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is
                // populated.
                let ptr = unsafe { obj.__bindgen_anon_1.binder };
                let cookie = obj.cookie;
                self.alloc.process.update_node(ptr, cookie, strong);
                Ok(())
            }
            BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => {
                let obj = self.read::<FlatBinderObject>(offset)?;
                let strong = header.type_ == BINDER_TYPE_HANDLE;
                // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is
                // populated.
                let handle = unsafe { obj.__bindgen_anon_1.handle };
                self.alloc
                    .process
                    .as_arc_borrow()
                    .update_ref(handle, false, strong)
            }
            // Other object types do not hold refcounts that need releasing here.
            _ => Ok(()),
        }
    }
}
/// A binder object as it is serialized.
///
/// # Invariants
///
/// All bytes must be initialized, and the value of `self.hdr.type_` must be one of the allowed
/// types.
#[repr(C)]
pub(crate) union BinderObject {
    /// Common header; valid for every variant since all object types start with it.
    hdr: uapi::binder_object_header,
    fbo: uapi::flat_binder_object,
    fdo: uapi::binder_fd_object,
    bbo: uapi::binder_buffer_object,
    fdao: uapi::binder_fd_array_object,
}
/// A view into a `BinderObject` that can be used in a match statement.
pub(crate) enum BinderObjectRef<'a> {
    /// `BINDER_TYPE_BINDER` / `BINDER_TYPE_WEAK_BINDER`.
    Binder(&'a mut uapi::flat_binder_object),
    /// `BINDER_TYPE_HANDLE` / `BINDER_TYPE_WEAK_HANDLE`.
    Handle(&'a mut uapi::flat_binder_object),
    /// `BINDER_TYPE_FD`.
    Fd(&'a mut uapi::binder_fd_object),
    /// `BINDER_TYPE_PTR`.
    Ptr(&'a mut uapi::binder_buffer_object),
    /// `BINDER_TYPE_FDA`.
    Fda(&'a mut uapi::binder_fd_array_object),
}
impl BinderObject {
    /// Read a serialized binder object from userspace, advancing `reader` by exactly the size
    /// of the object that was read.
    pub(crate) fn read_from(reader: &mut UserSliceReader) -> Result<BinderObject> {
        let object = Self::read_from_inner(|slice| {
            let read_len = usize::min(slice.len(), reader.len());
            reader.clone_reader().read_slice(&mut slice[..read_len])?;
            Ok(())
        })?;

        // If we used a object type smaller than the largest object size, then we've read more
        // bytes than we needed to. However, we used `.clone_reader()` to avoid advancing the
        // original reader. Now, we call `skip` so that the caller's reader is advanced by the
        // right amount.
        //
        // The `skip` call fails if the reader doesn't have `size` bytes available. This could
        // happen if the type header corresponds to an object type that is larger than the rest of
        // the reader.
        //
        // Any extra bytes beyond the size of the object are inaccessible after this call, so
        // reading them again from the `reader` later does not result in TOCTOU bugs.
        reader.skip(object.size())?;

        Ok(object)
    }

    /// Use the provided reader closure to construct a `BinderObject`.
    ///
    /// The closure should write the bytes for the object into the provided slice.
    pub(crate) fn read_from_inner<R>(reader: R) -> Result<BinderObject>
    where
        R: FnOnce(&mut [u8; size_of::<BinderObject>()]) -> Result<()>,
    {
        let mut obj = MaybeUninit::<BinderObject>::zeroed();

        // SAFETY: The lengths of `BinderObject` and `[u8; size_of::<BinderObject>()]` are equal,
        // and the byte array has an alignment requirement of one, so the pointer cast is okay.
        // Additionally, `obj` was initialized to zeros, so the byte array will not be
        // uninitialized.
        (reader)(unsafe { &mut *obj.as_mut_ptr().cast() })?;

        // SAFETY: The entire object is initialized, so accessing this field is safe.
        let type_ = unsafe { obj.assume_init_ref().hdr.type_ };
        if Self::type_to_size(type_).is_none() {
            // The value of `obj.hdr_type_` was invalid.
            return Err(EINVAL);
        }

        // SAFETY: All bytes are initialized (since we zeroed them at the start) and we checked
        // that `self.hdr.type_` is one of the allowed types, so the type invariants are satisfied.
        unsafe { Ok(obj.assume_init()) }
    }

    /// Borrow this object as the enum variant matching its header type, for use in `match`.
    pub(crate) fn as_ref(&mut self) -> BinderObjectRef<'_> {
        use BinderObjectRef::*;
        // SAFETY: The constructor ensures that all bytes of `self` are initialized, and all
        // variants of this union accept all initialized bit patterns.
        unsafe {
            match self.hdr.type_ {
                BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => Binder(&mut self.fbo),
                BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => Handle(&mut self.fbo),
                BINDER_TYPE_FD => Fd(&mut self.fdo),
                BINDER_TYPE_PTR => Ptr(&mut self.bbo),
                BINDER_TYPE_FDA => Fda(&mut self.fdao),
                // SAFETY: By the type invariant, the value of `self.hdr.type_` cannot have any
                // other value than the ones checked above.
                _ => core::hint::unreachable_unchecked(),
            }
        }
    }

    /// Serialized size in bytes of this object, determined by its header type.
    pub(crate) fn size(&self) -> usize {
        // SAFETY: The entire object is initialized, so accessing this field is safe.
        let type_ = unsafe { self.hdr.type_ };

        // SAFETY: The type invariants guarantee that the type field is correct.
        unsafe { Self::type_to_size(type_).unwrap_unchecked() }
    }

    /// Map an object type constant to the size of its C struct, or `None` for invalid types.
    fn type_to_size(type_: u32) -> Option<usize> {
        match type_ {
            BINDER_TYPE_WEAK_BINDER => Some(size_of::<uapi::flat_binder_object>()),
            BINDER_TYPE_BINDER => Some(size_of::<uapi::flat_binder_object>()),
            BINDER_TYPE_WEAK_HANDLE => Some(size_of::<uapi::flat_binder_object>()),
            BINDER_TYPE_HANDLE => Some(size_of::<uapi::flat_binder_object>()),
            BINDER_TYPE_FD => Some(size_of::<uapi::binder_fd_object>()),
            BINDER_TYPE_PTR => Some(size_of::<uapi::binder_buffer_object>()),
            BINDER_TYPE_FDA => Some(size_of::<uapi::binder_fd_array_object>()),
            _ => None,
        }
    }
}
/// Bookkeeping for files carried by a transaction buffer.
#[derive(Default)]
struct FileList {
    /// Files that still need an fd reserved in the recipient (see `translate_fds`).
    files_to_translate: Vec<FileEntry>,
    /// Fds to close when the buffer is freed (see `set_info_close_on_free`).
    close_on_free: Vec<u32>,
}
/// A single file embedded in a transaction, pending fd translation.
struct FileEntry {
    /// The file for which a descriptor will be created in the recipient process.
    file: ARef<File>,
    /// The offset in the buffer where the file descriptor is stored.
    buffer_offset: usize,
    /// Whether this fd should be closed when the allocation is freed.
    close_on_free: bool,
}
/// The result of `Allocation::translate_fds`: reserved-but-not-yet-installed descriptors.
pub(crate) struct TranslatedFds {
    /// Reserved fds paired with the files to install into them on `commit`.
    reservations: Vec<Reservation>,
    /// If commit is called, then these fds should be closed. (If commit is not called, then they
    /// shouldn't be closed.)
    close_on_free: FdsCloseOnFree,
}
/// One reserved fd and the file that will be installed into it.
struct Reservation {
    res: FileDescriptorReservation,
    file: ARef<File>,
}
impl TranslatedFds {
pub(crate) fn new() -> Self {
Self {
reservations: Vec::new(),
close_on_free: FdsCloseOnFree(Vec::new()),
}
}
pub(crate) fn commit(self) -> FdsCloseOnFree {
for entry in self.reservations {
entry.res.fd_install(entry.file);
}
self.close_on_free
}
}
/// Wrapper around the list of fds that should be closed when the corresponding allocation is
/// freed (stored into `FileList::close_on_free` via `set_info_close_on_free`).
pub(crate) struct FdsCloseOnFree(Vec<u32>);

View File

@@ -0,0 +1,180 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
error::Error,
list::{List, ListArc, ListLinks},
prelude::*,
security,
str::{CStr, CString},
sync::{Arc, Mutex},
task::Kuid,
};
use crate::{error::BinderError, node::NodeRef, process::Process};
// Global registry of all binder contexts, protected by a mutex.
kernel::sync::global_lock! {
    // SAFETY: We call `init` in the module initializer, so it's initialized before first use.
    pub(crate) unsafe(uninit) static CONTEXTS: Mutex<ContextList> = ContextList {
        list: List::new(),
    };
}
/// The list of all binder contexts; stored behind the global `CONTEXTS` mutex.
pub(crate) struct ContextList {
    list: List<Context>,
}
/// Take a snapshot of every currently registered binder context.
///
/// Fails only on allocation failure while building the result vector.
pub(crate) fn get_all_contexts() -> Result<Vec<Arc<Context>>> {
    let guard = CONTEXTS.lock();

    // Size the vector up front so the pushes below cannot reallocate.
    let num_contexts = guard.list.iter().count();
    let mut snapshot = Vec::with_capacity(num_contexts, GFP_KERNEL)?;
    for ctx in &guard.list {
        snapshot.push(Arc::from(ctx), GFP_KERNEL)?;
    }
    Ok(snapshot)
}
/// This struct keeps track of the processes using this context, and which process is the context
/// manager.
struct Manager {
    /// The context manager's node, if one has been registered.
    node: Option<NodeRef>,
    /// The euid of the task that set the context manager; subsequent attempts must match it.
    uid: Option<Kuid>,
    /// Every process registered with this context.
    all_procs: List<Process>,
}
/// There is one context per binder file (/dev/binder, /dev/hwbinder, etc)
#[pin_data]
pub(crate) struct Context {
    /// Processes and context-manager state, guarded by a mutex.
    #[pin]
    manager: Mutex<Manager>,
    /// Name of the device file this context belongs to.
    pub(crate) name: CString,
    /// Intrusive links for membership in the global `CONTEXTS` list.
    #[pin]
    links: ListLinks,
}
// Make `Context` usable as an element of an intrusive `List` (list id 0), linked through the
// `links` field and without tracking whether it is currently on a list.
kernel::list::impl_has_list_links! {
    impl HasListLinks<0> for Context { self.links }
}
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for Context { untracked; }
}
kernel::list::impl_list_item! {
    impl ListItem<0> for Context {
        using ListLinks;
    }
}
impl Context {
    /// Create a new context with the given device name and register it in the global
    /// `CONTEXTS` list.
    pub(crate) fn new(name: &CStr) -> Result<Arc<Self>> {
        let name = CString::try_from(name)?;
        let list_ctx = ListArc::pin_init::<Error>(
            try_pin_init!(Context {
                name,
                links <- ListLinks::new(),
                manager <- kernel::new_mutex!(Manager {
                    all_procs: List::new(),
                    node: None,
                    uid: None,
                }, "Context::manager"),
            }),
            GFP_KERNEL,
        )?;

        let ctx = list_ctx.clone_arc();
        CONTEXTS.lock().list.push_back(list_ctx);

        Ok(ctx)
    }

    /// Called when the file for this context is unlinked.
    ///
    /// No-op if called twice.
    pub(crate) fn deregister(&self) {
        // SAFETY: We never add the context to any other linked list than this one, so it is either
        // in this list, or not in any list.
        unsafe { CONTEXTS.lock().list.remove(self) };
    }

    /// Add `proc` to this context's process list. Logs and bails out if the process was created
    /// for a different context.
    pub(crate) fn register_process(self: &Arc<Self>, proc: ListArc<Process>) {
        if !Arc::ptr_eq(self, &proc.ctx) {
            pr_err!("Context::register_process called on the wrong context.");
            return;
        }
        self.manager.lock().all_procs.push_back(proc);
    }

    /// Remove `proc` from this context's process list. Logs and bails out if the process belongs
    /// to a different context.
    pub(crate) fn deregister_process(self: &Arc<Self>, proc: &Process) {
        if !Arc::ptr_eq(self, &proc.ctx) {
            pr_err!("Context::deregister_process called on the wrong context.");
            return;
        }
        // SAFETY: We just checked that this is the right list.
        unsafe { self.manager.lock().all_procs.remove(proc) };
    }

    /// Register `node_ref` as the context manager (BINDER_SET_CONTEXT_MGR).
    ///
    /// Fails with `EBUSY` if a manager is already set, with a security-module error if the
    /// owner's credentials are rejected, and with `EPERM` if a previous manager was set by a
    /// task with a different euid.
    pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result {
        let mut manager = self.manager.lock();
        if manager.node.is_some() {
            pr_warn!("BINDER_SET_CONTEXT_MGR already set");
            return Err(EBUSY);
        }
        security::binder_set_context_mgr(&node_ref.node.owner.cred)?;

        // If the context manager has been set before, ensure that we use the same euid.
        let caller_uid = Kuid::current_euid();
        if let Some(ref uid) = manager.uid {
            if *uid != caller_uid {
                return Err(EPERM);
            }
        }

        manager.node = Some(node_ref);
        manager.uid = Some(caller_uid);
        Ok(())
    }

    /// Clear the context manager node (the recorded euid is kept).
    pub(crate) fn unset_manager_node(&self) {
        let node_ref = self.manager.lock().node.take();
        // Drop outside any use of the manager state.
        drop(node_ref);
    }

    /// Get a reference to the context manager's node, or a dead-binder error if none is set.
    pub(crate) fn get_manager_node(&self, strong: bool) -> Result<NodeRef, BinderError> {
        self.manager
            .lock()
            .node
            .as_ref()
            .ok_or_else(BinderError::new_dead)?
            .clone(strong)
            .map_err(BinderError::from)
    }

    /// Run `func` on every process registered with this context, holding the manager lock.
    pub(crate) fn for_each_proc<F>(&self, mut func: F)
    where
        F: FnMut(&Process),
    {
        let lock = self.manager.lock();
        for proc in &lock.all_procs {
            func(&proc);
        }
    }

    /// Take a snapshot of all processes registered with this context.
    pub(crate) fn get_all_procs(&self) -> Result<Vec<Arc<Process>>> {
        let lock = self.manager.lock();
        let count = lock.all_procs.iter().count();

        let mut procs = Vec::with_capacity(count, GFP_KERNEL)?;
        for proc in &lock.all_procs {
            procs.push(Arc::from(proc), GFP_KERNEL)?;
        }
        Ok(procs)
    }

    /// Take a snapshot of the registered processes whose task pid equals `pid`.
    pub(crate) fn get_procs_with_pid(&self, pid: i32) -> Result<Vec<Arc<Process>>> {
        let mut procs = self.get_all_procs()?;
        procs.retain(|proc| proc.task.pid() == pid);
        Ok(procs)
    }
}

View File

@@ -0,0 +1,202 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Logic for closing files in a deferred manner.
//!
//! This file could make sense to have in `kernel::fs`, but it was rejected for being too
//! Binder-specific.
use core::mem::MaybeUninit;
use kernel::{
alloc::{AllocError, Flags},
bindings,
prelude::*,
};
/// Helper used for closing file descriptors in a way that is safe even if the file is currently
/// held using `fdget`.
///
/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to
/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`.
pub(crate) struct DeferredFdCloser {
    /// Heap allocation handed over to the task-work machinery in `close_fd`.
    inner: Box<DeferredFdCloserInner>,
}
/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with
/// moving it across threads.
unsafe impl Send for DeferredFdCloser {}
// SAFETY: Same argument as for `Send`: no thread-affine state is held before `close_fd` runs.
unsafe impl Sync for DeferredFdCloser {}
/// # Invariants
///
/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to
/// that file.
#[repr(C)]
struct DeferredFdCloserInner {
    /// Task-work header. Must be the first field (`repr(C)`) so a pointer to the struct can be
    /// cast to a pointer to the `callback_head` and back.
    twork: MaybeUninit<bindings::callback_head>,
    /// File whose last refcount is dropped by `do_close_fd`; null until `close_fd` sets it.
    file: *mut bindings::file,
}
impl DeferredFdCloser {
    /// Create a new [`DeferredFdCloser`].
    pub(crate) fn new(flags: Flags) -> Result<Self, AllocError> {
        Ok(Self {
            // INVARIANT: The `file` pointer is null, so the type invariant does not apply.
            inner: Box::new(
                DeferredFdCloserInner {
                    twork: MaybeUninit::uninit(),
                    file: core::ptr::null_mut(),
                },
                flags,
            )?,
        })
    }

    /// Schedule a task work that closes the file descriptor when this task returns to userspace.
    ///
    /// Fails if this is called from a context where we cannot run work when returning to
    /// userspace. (E.g., from a kthread.)
    pub(crate) fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> {
        use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME;

        // In this method, we schedule the task work before closing the file. This is because
        // scheduling a task work is fallible, and we need to know whether it will fail before we
        // attempt to close the file.

        // Task works are not available on kthreads.
        let current = kernel::current!();

        // Check if this is a kthread.
        // SAFETY: Reading `flags` from a task is always okay.
        if unsafe { ((*current.as_ptr()).flags & bindings::PF_KTHREAD) != 0 } {
            return Err(DeferredFdCloseError::TaskWorkUnavailable);
        }

        // Transfer ownership of the box's allocation to a raw pointer. This disables the
        // destructor, so we must manually convert it back to a Box to drop it.
        //
        // Until we convert it back to a `Box`, there are no aliasing requirements on this
        // pointer.
        let inner = Box::into_raw(self.inner);

        // The `callback_head` field is first in the struct, so this cast correctly gives us a
        // pointer to the field.
        let callback_head = inner.cast::<bindings::callback_head>();
        // SAFETY: This pointer offset operation does not go out-of-bounds.
        let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) };

        let current = current.as_ptr();

        // SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so
        // it is okay for us to perform unsynchronized writes to its `callback_head` field.
        unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) };

        // SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current
        // task. If this operation is successful, then this transfers exclusive ownership of the
        // `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or
        // invalidate the field during that time.
        //
        // When the C side calls `do_close_fd`, the safety requirements of that method are
        // satisfied because when a task work is executed, the callback is given ownership of the
        // pointer.
        //
        // The file pointer is currently null. If it is changed to be non-null before `do_close_fd`
        // is called, then that change happens due to the write at the end of this function, and
        // that write has a safety comment that explains why the refcount can be dropped when
        // `do_close_fd` runs.
        let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) };

        if res != 0 {
            // SAFETY: Scheduling the task work failed, so we still have ownership of the box, so
            // we may destroy it.
            unsafe { drop(Box::from_raw(inner)) };

            return Err(DeferredFdCloseError::TaskWorkUnavailable);
        }

        // This removes the fd from the fd table in `current`. The file is not fully closed until
        // `filp_close` is called. We are given ownership of one refcount to the file.
        //
        // SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the
        // pointer is non-null), then we call `filp_close` on the returned pointer as required by
        // `file_close_fd`.
        let file = unsafe { bindings::file_close_fd(fd) };
        if file.is_null() {
            // We don't clean up the task work since that might be expensive if the task work queue
            // is long. Just let it execute and let it clean up for itself.
            return Err(DeferredFdCloseError::BadFd);
        }

        // Acquire a second refcount to the file.
        //
        // SAFETY: The `file` pointer points at a file with a non-zero refcount.
        unsafe { bindings::get_file(file) };

        // This method closes the fd, consuming one of our two refcounts. There could be active
        // light refcounts created from that fd, so we must ensure that the file has a positive
        // refcount for the duration of those active light refcounts. We do that by holding on to
        // the second refcount until the current task returns to userspace.
        //
        // SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close
        // it in is correct, since we just got the `fd` from `file_close_fd` which also uses
        // `current->files`.
        //
        // Note: fl_owner_t is currently a void pointer.
        unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) };

        // We update the file pointer that the task work is supposed to fput. This transfers
        // ownership of our last refcount.
        //
        // INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to
        // non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we
        // still own a refcount to the file, so we can pass ownership of that refcount to the
        // `DeferredFdCloserInner`.
        //
        // When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is
        // the case because all light refcounts that are associated with the fd we closed
        // previously must be dropped when `do_close_fd`, since light refcounts must be dropped
        // before returning to userspace.
        //
        // SAFETY: Task works are executed on the current thread right before we return to
        // userspace, so this write is guaranteed to happen before `do_close_fd` is called, which
        // means that a race is not possible here.
        unsafe { *file_field = file };

        Ok(())
    }

    /// Task-work callback: drops the final file refcount (if any) and frees the allocation.
    ///
    /// # Safety
    ///
    /// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in
    /// a `Box`, and the caller must pass exclusive ownership of that `Box`. Furthermore, if the
    /// file pointer is non-null, then it must be okay to release the refcount by calling `fput`.
    unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) {
        // SAFETY: The caller just passed us ownership of this box.
        let inner = unsafe { Box::from_raw(inner.cast::<DeferredFdCloserInner>()) };
        if !inner.file.is_null() {
            // SAFETY: By the type invariants, we own a refcount to this file, and the caller
            // guarantees that dropping the refcount now is okay.
            unsafe { bindings::fput(inner.file) };
        }
        // The allocation is freed when `inner` goes out of scope.
    }
}
/// Represents a failure to close an fd in a deferred manner.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum DeferredFdCloseError {
    /// Closing the fd failed because we were unable to schedule a task work.
    TaskWorkUnavailable,
    /// Closing the fd failed because the fd does not exist.
    BadFd,
}
impl From<DeferredFdCloseError> for Error {
fn from(err: DeferredFdCloseError) -> Error {
match err {
DeferredFdCloseError::TaskWorkUnavailable => ESRCH,
DeferredFdCloseError::BadFd => EBADF,
}
}
}

View File

@@ -0,0 +1,182 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::mem::MaybeUninit;
use core::ops::{Deref, DerefMut};
use kernel::{
types::{AsBytes, FromBytes},
uapi::{self, *},
};
/// Re-export a list of C constants as crate-visible `u32` constants, stripping the long
/// bindgen-generated enum prefix from each name.
macro_rules! pub_no_prefix {
    ($prefix:ident, $($newname:ident),+ $(,)?) => {
        $(pub(crate) const $newname: u32 = kernel::macros::concat_idents!($prefix, $newname);)+
    };
}
// Return commands sent from the driver to userspace (BR_*).
pub_no_prefix!(
    binder_driver_return_protocol_,
    BR_TRANSACTION,
    BR_TRANSACTION_SEC_CTX,
    BR_REPLY,
    BR_DEAD_REPLY,
    BR_FAILED_REPLY,
    BR_FROZEN_REPLY,
    BR_NOOP,
    BR_SPAWN_LOOPER,
    BR_TRANSACTION_COMPLETE,
    BR_TRANSACTION_PENDING_FROZEN,
    BR_ONEWAY_SPAM_SUSPECT,
    BR_OK,
    BR_ERROR,
    BR_INCREFS,
    BR_ACQUIRE,
    BR_RELEASE,
    BR_DECREFS,
    BR_DEAD_BINDER,
    BR_CLEAR_DEATH_NOTIFICATION_DONE,
);

// Commands sent from userspace to the driver (BC_*).
pub_no_prefix!(
    binder_driver_command_protocol_,
    BC_TRANSACTION,
    BC_TRANSACTION_SG,
    BC_REPLY,
    BC_REPLY_SG,
    BC_FREE_BUFFER,
    BC_ENTER_LOOPER,
    BC_EXIT_LOOPER,
    BC_REGISTER_LOOPER,
    BC_INCREFS,
    BC_ACQUIRE,
    BC_RELEASE,
    BC_DECREFS,
    BC_INCREFS_DONE,
    BC_ACQUIRE_DONE,
    BC_REQUEST_DEATH_NOTIFICATION,
    BC_CLEAR_DEATH_NOTIFICATION,
    BC_DEAD_BINDER_DONE,
);

// Bit shift for the scheduling-policy field of `flat_binder_object::flags`.
pub_no_prefix!(
    flat_binder_object_shifts_,
    FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT
);

// Flag bits of `flat_binder_object::flags`.
pub_no_prefix!(
    flat_binder_object_flags_,
    FLAT_BINDER_FLAG_ACCEPTS_FDS,
    FLAT_BINDER_FLAG_INHERIT_RT,
    FLAT_BINDER_FLAG_PRIORITY_MASK,
    FLAT_BINDER_FLAG_SCHED_POLICY_MASK,
    FLAT_BINDER_FLAG_TXN_SECURITY_CTX
);

// Transaction flag bits (TF_*).
pub_no_prefix!(
    transaction_flags_,
    TF_ONE_WAY,
    TF_ACCEPT_FDS,
    TF_CLEAR_BUF,
    TF_UPDATE_TXN
);

// Object type tags used in serialized binder objects; re-exported directly from uapi.
pub(crate) use uapi::{
    BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_FDA, BINDER_TYPE_HANDLE, BINDER_TYPE_PTR,
    BINDER_TYPE_WEAK_BINDER, BINDER_TYPE_WEAK_HANDLE,
};
/// Declare a `#[repr(transparent)]` wrapper `$newname` around the C type `$wrapped` that
/// implements `FromBytes`/`AsBytes`, derefs to the inner type, and zero-initializes (including
/// padding) via `Default`.
macro_rules! decl_wrapper {
    ($newname:ident, $wrapped:ty) => {
        // Define a wrapper around the C type. Use `MaybeUninit` to enforce that the value of
        // padding bytes must be preserved.
        #[derive(Copy, Clone)]
        #[repr(transparent)]
        pub(crate) struct $newname(MaybeUninit<$wrapped>);

        // SAFETY: This macro is only used with types where this is ok.
        unsafe impl FromBytes for $newname {}
        // SAFETY: This macro is only used with types where this is ok.
        unsafe impl AsBytes for $newname {}

        impl Deref for $newname {
            type Target = $wrapped;
            fn deref(&self) -> &Self::Target {
                // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still
                // always be valid.
                unsafe { self.0.assume_init_ref() }
            }
        }

        impl DerefMut for $newname {
            fn deref_mut(&mut self) -> &mut Self::Target {
                // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still
                // always be valid.
                unsafe { self.0.assume_init_mut() }
            }
        }

        impl Default for $newname {
            fn default() -> Self {
                // Create a new value of this type where all bytes (including padding) are zeroed.
                Self(MaybeUninit::zeroed())
            }
        }
    };
}
// Wrappers for every UAPI struct the driver reads from or writes to userspace.
decl_wrapper!(BinderNodeDebugInfo, uapi::binder_node_debug_info);
decl_wrapper!(BinderNodeInfoForRef, uapi::binder_node_info_for_ref);
decl_wrapper!(FlatBinderObject, uapi::flat_binder_object);
decl_wrapper!(BinderFdObject, uapi::binder_fd_object);
decl_wrapper!(BinderFdArrayObject, uapi::binder_fd_array_object);
decl_wrapper!(BinderObjectHeader, uapi::binder_object_header);
decl_wrapper!(BinderBufferObject, uapi::binder_buffer_object);
decl_wrapper!(BinderTransactionData, uapi::binder_transaction_data);
decl_wrapper!(
    BinderTransactionDataSecctx,
    uapi::binder_transaction_data_secctx
);
decl_wrapper!(BinderTransactionDataSg, uapi::binder_transaction_data_sg);
decl_wrapper!(BinderWriteRead, uapi::binder_write_read);
decl_wrapper!(BinderVersion, uapi::binder_version);
decl_wrapper!(BinderFrozenStatusInfo, uapi::binder_frozen_status_info);
decl_wrapper!(BinderFreezeInfo, uapi::binder_freeze_info);
decl_wrapper!(ExtendedError, uapi::binder_extended_error);
impl BinderVersion {
    /// Returns the protocol version that this driver reports to userspace.
    pub(crate) fn current() -> Self {
        let raw = uapi::binder_version {
            protocol_version: BINDER_CURRENT_PROTOCOL_VERSION as _,
        };
        Self(MaybeUninit::new(raw))
    }
}
impl BinderTransactionData {
    /// Combines this transaction data with a scatter-gather buffer size into the
    /// `*_SG` variant of the struct.
    pub(crate) fn with_buffers_size(self, buffers_size: u64) -> BinderTransactionDataSg {
        let raw = uapi::binder_transaction_data_sg {
            transaction_data: *self,
            buffers_size,
        };
        BinderTransactionDataSg(MaybeUninit::new(raw))
    }
}
impl BinderTransactionDataSecctx {
    /// View the inner data as wrapped in `BinderTransactionData`.
    pub(crate) fn tr_data(&mut self) -> &mut BinderTransactionData {
        // SAFETY: `BinderTransactionData` is `#[repr(transparent)]` over
        // `MaybeUninit<uapi::binder_transaction_data>`, so the two types have identical
        // layout and the field may be reborrowed at the wrapper type.
        unsafe {
            &mut *(&mut self.transaction_data as *mut uapi::binder_transaction_data
                as *mut BinderTransactionData)
        }
    }
}
impl ExtendedError {
    /// Builds an extended-error record from its raw fields.
    pub(crate) fn new(id: u32, command: u32, param: i32) -> Self {
        let raw = uapi::binder_extended_error { id, command, param };
        Self(MaybeUninit::new(raw))
    }
}

View File

@@ -0,0 +1,99 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::prelude::*;
use crate::defs::*;
/// Shorthand for results whose failure is reported to userspace as a [`BinderError`].
pub(crate) type BinderResult<T = ()> = core::result::Result<T, BinderError>;
/// An error that will be returned to userspace via the `BINDER_WRITE_READ` ioctl rather than via
/// errno.
pub(crate) struct BinderError {
    /// The `BR_*` return code delivered to userspace for this error.
    pub(crate) reply: u32,
    /// The errno behind this error, if any; see `as_errno` and the `From<Error>` impl.
    source: Option<Error>,
}
impl BinderError {
    /// Builds an error that carries a bare reply code with no underlying errno.
    fn with_reply(reply: u32) -> Self {
        Self {
            reply,
            source: None,
        }
    }

    pub(crate) fn new_dead() -> Self {
        Self::with_reply(BR_DEAD_REPLY)
    }

    pub(crate) fn new_frozen() -> Self {
        Self::with_reply(BR_FROZEN_REPLY)
    }

    pub(crate) fn new_frozen_oneway() -> Self {
        Self::with_reply(BR_TRANSACTION_PENDING_FROZEN)
    }

    pub(crate) fn is_dead(&self) -> bool {
        self.reply == BR_DEAD_REPLY
    }

    /// Returns the errno for this error, defaulting to `EINVAL` when no source was recorded.
    pub(crate) fn as_errno(&self) -> core::ffi::c_int {
        match self.source {
            Some(source) => source.to_errno(),
            None => EINVAL.to_errno(),
        }
    }

    /// Errors that carry a real errno are unexpected and worth a kernel log warning.
    pub(crate) fn should_pr_warn(&self) -> bool {
        self.source.is_some()
    }
}
/// Convert an errno into a `BinderError` and store the errno used to construct it. The errno
/// should be stored as the thread's extended error when given to userspace.
impl From<Error> for BinderError {
    fn from(source: Error) -> Self {
        Self {
            // Userspace sees a generic failure; the precise errno is preserved in `source`.
            reply: BR_FAILED_REPLY,
            source: Some(source),
        }
    }
}
impl From<kernel::fs::file::BadFdError> for BinderError {
fn from(source: kernel::fs::file::BadFdError) -> Self {
BinderError::from(Error::from(source))
}
}
impl From<kernel::alloc::AllocError> for BinderError {
fn from(_: kernel::alloc::AllocError) -> Self {
Self {
reply: BR_FAILED_REPLY,
source: Some(ENOMEM),
}
}
}
impl core::fmt::Debug for BinderError {
    /// Formats well-known reply codes by name; unknown codes fall back to the raw value.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match (self.reply, self.source.as_ref()) {
            (BR_FAILED_REPLY, Some(source)) => f
                .debug_struct("BR_FAILED_REPLY")
                .field("source", source)
                .finish(),
            (BR_FAILED_REPLY, None) => f.pad("BR_FAILED_REPLY"),
            (BR_DEAD_REPLY, _) => f.pad("BR_DEAD_REPLY"),
            (BR_FROZEN_REPLY, _) => f.pad("BR_FROZEN_REPLY"),
            (BR_TRANSACTION_PENDING_FROZEN, _) => f.pad("BR_TRANSACTION_PENDING_FROZEN"),
            (BR_TRANSACTION_COMPLETE, _) => f.pad("BR_TRANSACTION_COMPLETE"),
            (reply, _) => f.debug_struct("BinderError").field("reply", &reply).finish(),
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{list::ListArc, prelude::*, seq_file::SeqFile, seq_print, sync::UniqueArc};
use crate::{node::Node, thread::Thread, BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead};
use core::mem::MaybeUninit;
/// Preallocated storage for a [`NodeWrapper`] work item.
///
/// The allocation happens up front in `new` (the only fallible step), so turning it into a
/// queued work item in `init` cannot fail.
pub(crate) struct CritIncrWrapper {
    // Uninitialized until `init` pin-initializes it with the target node.
    inner: UniqueArc<MaybeUninit<DTRWrap<NodeWrapper>>>,
}
impl CritIncrWrapper {
    /// Preallocates the storage; this is the only step that can fail.
    pub(crate) fn new() -> Result<Self> {
        Ok(CritIncrWrapper {
            inner: UniqueArc::new_uninit(GFP_KERNEL)?,
        })
    }

    /// Initializes the preallocated storage into a work item for `node`.
    pub(super) fn init(self, node: DArc<Node>) -> DLArc<dyn DeliverToRead> {
        match self.inner.pin_init_with(DTRWrap::new(NodeWrapper { node })) {
            Ok(initialized) => ListArc::from(initialized) as _,
            // The initializer is infallible; matching on the uninhabited error type proves
            // this branch is unreachable without a panic path.
            Err(err) => match err {},
        }
    }
}
/// Work item that runs `Node::do_work_locked` for the wrapped node when delivered.
struct NodeWrapper {
    node: DArc<Node>,
}
// `untracked`: the wrapper itself does not track list membership; presumably the
// `has_pushed_wrapper` flag asserted in `do_work` prevents double-queueing — confirm at
// the push site.
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for NodeWrapper {
        untracked;
    }
}
impl DeliverToRead for NodeWrapper {
    fn do_work(
        self: DArc<Self>,
        _thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool> {
        let node = &self.node;
        // The node's inner state is protected by the owning process's inner lock.
        let mut owner_inner = node.owner.inner.lock();
        let inner = node.inner.access_mut(&mut owner_inner);
        let ds = &mut inner.delivery_state;

        // This work item is only queued with both flags set; clear them before delivering.
        assert!(ds.has_pushed_wrapper);
        assert!(ds.has_strong_zero2one);
        ds.has_pushed_wrapper = false;
        ds.has_strong_zero2one = false;

        node.do_work_locked(writer, owner_inner)
    }

    fn cancel(self: DArc<Self>) {}
    fn on_thread_selected(&self, _thread: &Thread) {}

    // Never requests a synchronous wakeup.
    fn should_sync_wakeup(&self) -> bool {
        false
    }

    #[inline(never)]
    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
        seq_print!(
            m,
            "{}node work {}: u{:016x} c{:016x}\n",
            prefix,
            self.node.debug_id,
            self.node.ptr,
            self.node.cookie,
        );
        Ok(())
    }
}

View File

@@ -0,0 +1,780 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! This module has utilities for managing a page range where unused pages may be reclaimed by a
//! vma shrinker.
// To avoid deadlocks, locks are taken in the order:
//
// 1. mmap lock
// 2. spinlock
// 3. lru spinlock
//
// The shrinker will use trylock methods because it locks them in a different order.
use core::{
alloc::Layout,
ffi::{c_ulong, c_void},
marker::PhantomPinned,
mem::{size_of, size_of_val, MaybeUninit},
ptr,
};
use kernel::{
bindings,
error::Result,
mm::{virt, Mm, MmWithUser},
new_mutex, new_spinlock,
page::{Page, PAGE_SHIFT, PAGE_SIZE},
prelude::*,
str::CStr,
sync::{Mutex, SpinLock},
task::Pid,
types::ARef,
types::{FromBytes, Opaque},
uaccess::UserSliceReader,
};
/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
#[repr(C)]
pub(crate) struct Shrinker {
    /// Pointer to the C shrinker allocated by `shrinker_alloc`; null until `register` succeeds.
    inner: Opaque<*mut bindings::shrinker>,
    /// LRU list holding the pages that are currently available for reclaim.
    list_lru: Opaque<bindings::list_lru>,
}
// SAFETY: NOTE(review) — relies on the C shrinker/list_lru APIs performing their own
// internal locking for all cross-thread access; confirm against the `list_lru` docs.
unsafe impl Send for Shrinker {}
// SAFETY: As above; shared access only goes through the internally-locked C APIs.
unsafe impl Sync for Shrinker {}
impl Shrinker {
/// Create a new shrinker.
///
/// # Safety
///
/// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
/// been called exactly once, and it must not have returned an error.
pub(crate) const unsafe fn new() -> Self {
Self {
inner: Opaque::uninit(),
list_lru: Opaque::uninit(),
}
}
/// Register this shrinker with the kernel.
pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
// SAFETY: These fields are not yet used, so it's okay to zero them.
unsafe {
self.inner.get().write(ptr::null_mut());
self.list_lru.get().write_bytes(0, 1);
}
// SAFETY: The field is not yet used, so we can initialize it.
let ret = unsafe {
bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut(), ptr::null_mut())
};
if ret != 0 {
return Err(Error::from_errno(ret));
}
// SAFETY: The `name` points at a valid c string.
let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
if shrinker.is_null() {
// SAFETY: We initialized it, so its okay to destroy it.
unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
return Err(Error::from_errno(ret));
}
// SAFETY: We're about to register the shrinker, and these are the fields we need to
// initialize. (All other fields are already zeroed.)
unsafe {
ptr::addr_of_mut!((*shrinker).count_objects).write(Some(rust_shrink_count));
ptr::addr_of_mut!((*shrinker).scan_objects).write(Some(rust_shrink_scan));
}
// SAFETY: The new shrinker has been fully initialized, so we can register it.
unsafe { bindings::shrinker_register(shrinker) };
// SAFETY: This initializes the pointer to the shrinker so that we can use it.
unsafe { self.inner.get().write(shrinker) };
Ok(())
}
}
/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans of whether the pages are usable. The
/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
    /// Shrinker object registered with the kernel.
    shrinker: &'static Shrinker,
    /// Pid using this page range. Only used as debugging information.
    pid: Pid,
    /// The mm for the relevant process.
    mm: ARef<Mm>,
    /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`.
    ///
    /// Also trylocked by the shrinker callback (via `stable_trylock_mm`) while it unmaps
    /// a reclaimed page.
    #[pin]
    mm_lock: Mutex<()>,
    /// Spinlock protecting changes to pages.
    #[pin]
    lock: SpinLock<Inner>,
    /// Must not move, since page info has pointers back.
    #[pin]
    _pin: PhantomPinned,
}
/// Mutable state of a `ShrinkablePageRange`, protected by its `lock` spinlock.
struct Inner {
    /// Array of pages.
    ///
    /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
    /// ownership. To deal with that, we manage it using raw pointers.
    pages: *mut PageInfo,
    /// Length of the `pages` array. Zero until `register_with_vma` succeeds.
    size: usize,
    /// The address of the vma to insert the pages into.
    vma_addr: usize,
}
// SAFETY: NOTE(review) — the raw `pages` pointer makes this type non-auto-Send/Sync; all
// access to it is guarded by the `lock` spinlock (plus the lru lock in the shrinker path),
// which is presumably what makes cross-thread use sound — confirm.
unsafe impl Send for ShrinkablePageRange {}
// SAFETY: As above.
unsafe impl Sync for ShrinkablePageRange {}
/// Guard for `mm_lock` with a `'static` lifetime, held by the shrinker callback across the
/// page unmap. See `stable_trylock_mm` for why extending the lifetime is sound.
type StableMmGuard =
    kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;
/// An array element that describes the current state of a page.
///
/// There are three states:
///
/// * Free. The page is None. The `lru` element is not queued.
/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
/// * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
    /// Lru list element; linked into the shrinker's lru while the page is available.
    /// Must remain the first field so a `list_head` pointer can be cast back to `PageInfo`
    /// (see `rust_shrink_free_page`).
    lru: bindings::list_head,
    /// The page, or `None` in the free state.
    page: Option<Page>,
    /// Back-pointer to the owning range; the reason `ShrinkablePageRange` is `!Unpin`.
    range: *const ShrinkablePageRange,
}
impl PageInfo {
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok.
    unsafe fn has_page(me: *const PageInfo) -> bool {
        // SAFETY: This pointer offset is in bounds.
        let page = unsafe { ptr::addr_of!((*me).page) };
        // SAFETY: The caller guarantees that the field may be read.
        unsafe { (*page).is_some() }
    }

    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
    unsafe fn set_page(me: *mut PageInfo, page: Page) {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };

        // SAFETY: The pointer is valid for writing, so also valid for reading.
        if unsafe { (*ptr).is_some() } {
            // Defensive: the precondition was violated; drop the old page instead of
            // leaking it before overwriting.
            pr_err!("set_page called when there is already a page");
            // SAFETY: We will initialize the page again below.
            unsafe { ptr::drop_in_place(ptr) };
        }

        // SAFETY: The pointer is valid for writing.
        unsafe { ptr::write(ptr, Some(page)) };
    }

    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok for the duration of 'a.
    unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of!((*me).page) };
        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).as_ref() }
    }

    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok for the duration of 'a.
    unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };
        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).take() }
    }

    /// Add this page to the lru list, if not already in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_add(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_add_obj(shrinker.list_lru.get(), lru_ptr) };
    }

    /// Remove this page from the lru list, if it is in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_del(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_del_obj(shrinker.list_lru.get(), lru_ptr) };
    }
}
impl ShrinkablePageRange {
    /// Create a new `ShrinkablePageRange` using the given shrinker.
    ///
    /// Fails with `ESRCH` if the current task has no mm (e.g. a kernel thread).
    pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
        try_pin_init!(Self {
            shrinker,
            pid: kernel::current!().pid(),
            mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
            mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
            lock <- new_spinlock!(Inner {
                pages: ptr::null_mut(),
                size: 0,
                vma_addr: 0,
            }, "ShrinkablePageRange"),
            _pin: PhantomPinned,
        })
    }

    /// Try to take `mm_lock`, returning a guard with a `'static` lifetime.
    ///
    /// Used by the shrinker callback, which must hold the lock across the unmap even though
    /// it has no lifetime tying it to `self`.
    pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
        // SAFETY: This extends the duration of the reference. Since this call happens before
        // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
        // until the returned guard is dropped. This ensures that the guard is valid until dropped.
        let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };
        mm_lock.try_lock()
    }

    /// Register a vma with this page range. Returns the size of the region.
    pub(crate) fn register_with_vma(&self, vma: &virt::VmAreaNew) -> Result<usize> {
        // Clamp the mapping to SZ_4M bytes.
        let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
        let num_pages = num_bytes >> PAGE_SHIFT;

        if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
            pr_debug!("Failed to register with vma: invalid vma->vm_mm");
            return Err(EINVAL);
        }
        if num_pages == 0 {
            pr_debug!("Failed to register with vma: size zero");
            return Err(EINVAL);
        }

        let layout = Layout::array::<PageInfo>(num_pages).map_err(|_| ENOMEM)?;
        // SAFETY: The layout has non-zero size.
        let pages = unsafe { alloc::alloc::alloc(layout) as *mut PageInfo };
        if pages.is_null() {
            return Err(ENOMEM);
        }

        // SAFETY: This just initializes the pages array. Each entry starts in the free state:
        // no page, and an lru element linked to itself (i.e. not queued).
        unsafe {
            let self_ptr = self as *const ShrinkablePageRange;
            for i in 0..num_pages {
                let info = pages.add(i);
                ptr::addr_of_mut!((*info).range).write(self_ptr);
                ptr::addr_of_mut!((*info).page).write(None);
                let lru = ptr::addr_of_mut!((*info).lru);
                ptr::addr_of_mut!((*lru).next).write(lru);
                ptr::addr_of_mut!((*lru).prev).write(lru);
            }
        }

        let mut inner = self.lock.lock();
        if inner.size > 0 {
            pr_debug!("Failed to register with vma: already registered");
            drop(inner);
            // SAFETY: The `pages` array was allocated with the same layout.
            unsafe { alloc::alloc::dealloc(pages.cast(), layout) };
            return Err(EBUSY);
        }

        inner.pages = pages;
        inner.size = num_pages;
        inner.vma_addr = vma.start();

        Ok(num_pages)
    }

    /// Make sure that the given pages are allocated and mapped.
    ///
    /// Must not be called from an atomic context.
    pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
        crate::trace::trace_update_page_range(self.pid, true, start, end);
        if start >= end {
            return Ok(());
        }
        let mut inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in start..end {
            // SAFETY: This pointer offset is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
            if unsafe { PageInfo::has_page(page_info) } {
                crate::trace::trace_alloc_lru_start(self.pid, i);
                // Since we're going to use the page, we should remove it from the lru list so that
                // the shrinker will not free it.
                //
                // SAFETY: The pointer is valid, and this is the right shrinker.
                //
                // The shrinker can't free the page between the check and this call to
                // `list_lru_del` because we hold the lock.
                unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };
                crate::trace::trace_alloc_lru_end(self.pid, i);
            } else {
                // We have to allocate a new page. Use the slow path.
                // The spinlock is dropped here (allocation may sleep) and reacquired below.
                drop(inner);
                crate::trace::trace_alloc_page_start(self.pid, i);
                match self.use_page_slow(i) {
                    Ok(()) => {}
                    Err(err) => {
                        pr_warn!("Error in use_page_slow: {:?}", err);
                        return Err(err);
                    }
                }
                crate::trace::trace_alloc_page_end(self.pid, i);
                inner = self.lock.lock();
            }
        }
        Ok(())
    }

    /// Mark the given page as in use, slow path.
    ///
    /// Must not be called from an atomic context.
    ///
    /// `i` must be in bounds of the pages array; the only caller (`use_range`) has already
    /// checked it against `inner.size`.
    #[cold]
    fn use_page_slow(&self, i: usize) -> Result<()> {
        let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;

        let mm_mutex = self.mm_lock.lock();
        let inner = self.lock.lock();

        // SAFETY: This pointer offset is in bounds.
        let page_info = unsafe { inner.pages.add(i) };

        // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
        if unsafe { PageInfo::has_page(page_info) } {
            // The page was already there, or someone else added the page while we didn't hold the
            // spinlock.
            //
            // SAFETY: The pointer is valid, and this is the right shrinker.
            //
            // The shrinker can't free the page between the check and this call to
            // `list_lru_del` because we hold the lock.
            unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };
            return Ok(());
        }

        let vma_addr = inner.vma_addr;
        // Release the spinlock while we insert the page into the vma.
        drop(inner);

        // No overflow since we stay in bounds of the vma.
        let user_page_addr = vma_addr + (i << PAGE_SHIFT);

        // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
        // a remote process. If the call to `mmput` races with the process shutting down, then the
        // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
        // happen until it returns to userspace. However, the caller might instead go to sleep and
        // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
        // middle of a shutdown process that wont complete until the `mm` is dropped. This can
        // amount to a deadlock.
        //
        // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
        // workqueue.
        MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?)
            .mmap_read_lock()
            .vma_lookup(vma_addr)
            .ok_or(ESRCH)?
            .as_mixedmap_vma()
            .ok_or(ESRCH)?
            .vm_insert_page(user_page_addr, &new_page)
            .inspect_err(|err| {
                pr_warn!(
                    "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}",
                    user_page_addr,
                    vma_addr,
                    i,
                    err
                )
            })?;

        let inner = self.lock.lock();

        // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
        // can be written to since we hold the lock.
        //
        // We released and reacquired the spinlock since we checked that the page is null, but we
        // always hold the mm_lock mutex when setting the page to a non-null value, so it's not
        // possible for someone else to have changed it since our check.
        unsafe { PageInfo::set_page(page_info, new_page) };

        drop(inner);
        drop(mm_mutex);

        Ok(())
    }

    /// If the given page is in use, then mark it as available so that the shrinker can free it.
    ///
    /// May be called from an atomic context.
    pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
        crate::trace::trace_update_page_range(self.pid, false, start, end);
        if start >= end {
            return;
        }
        let inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in (start..end).rev() {
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: Okay for reading since we have the lock.
            if unsafe { PageInfo::has_page(page_info) } {
                crate::trace::trace_free_lru_start(self.pid, i);
                // SAFETY: The pointer is valid, and it's the right shrinker.
                unsafe { PageInfo::list_lru_add(page_info, self.shrinker) };
                crate::trace::trace_free_lru_end(self.pid, i);
            }
        }
    }

    /// Helper for reading or writing to a range of bytes that may overlap with several pages.
    ///
    /// Calls `cb(page, offset_in_page, length)` once per touched page, in order.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
    where
        T: FnMut(&Page, usize, usize) -> Result,
    {
        if size == 0 {
            return Ok(());
        }

        // SAFETY: The caller promises that the pages touched by this call are in use. It's only
        // possible for a page to be in use if we have already been registered with a vma, and we
        // only change the `pages` and `size` fields during registration with a vma, so there is no
        // race when we read them here without taking the lock.
        let (pages, num_pages) = {
            let inner = self.lock.lock();
            (inner.pages, inner.size)
        };
        let num_bytes = num_pages << PAGE_SHIFT;

        // Check that the request is within the buffer.
        if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
            return Err(EFAULT);
        }

        let mut page_index = offset >> PAGE_SHIFT;
        offset &= PAGE_SIZE - 1;
        while size > 0 {
            let available = usize::min(size, PAGE_SIZE - offset);
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { pages.add(page_index) };
            // SAFETY: The caller guarantees that this page is in the "in use" state for the
            // duration of this call to `iterate`, so nobody will change the page.
            let page = unsafe { PageInfo::get_page(page_info) };
            if page.is_none() {
                pr_warn!("Page is null!");
            }
            let page = page.ok_or(EFAULT)?;
            cb(page, offset, available)?;
            size -= available;
            page_index += 1;
            // Only the first page is entered mid-page; subsequent pages start at 0.
            offset = 0;
        }
        Ok(())
    }

    /// Copy from userspace into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn copy_from_user_slice(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
        unsafe {
            self.iterate(offset, size, |page, offset, to_copy| {
                page.copy_from_user_slice_raw(reader, offset, to_copy)
            })
        }
    }

    /// Copy from this page range into kernel space.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        let mut out = MaybeUninit::<T>::uninit();
        let mut out_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `read`.
        unsafe {
            self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
                // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid.
                page.read_raw(obj_ptr, offset, to_copy)?;
                out_offset += to_copy;
                Ok(())
            })?;
        }
        // SAFETY: We just initialised the data (every byte of `out` was written exactly once
        // by the loop above, since T: FromBytes tolerates any byte pattern).
        Ok(unsafe { out.assume_init() })
    }

    /// Copy from kernel space into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        let mut obj_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `write`.
        unsafe {
            self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
                // SAFETY: We have a reference to the object, so the pointer is valid.
                page.write_raw(obj_ptr, offset, to_copy)?;
                obj_offset += to_copy;
                Ok(())
            })
        }
    }

    /// Write zeroes to the given range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
        unsafe {
            self.iterate(offset, size, |page, offset, len| {
                page.fill_zero_raw(offset, len)
            })
        }
    }
}
#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
    fn drop(self: Pin<&mut Self>) {
        let (pages, size) = {
            let lock = self.lock.lock();
            (lock.pages, lock.size)
        };

        // `size == 0` means `register_with_vma` never succeeded, so nothing was allocated.
        if size == 0 {
            return;
        }

        // This is the destructor, so unlike the other methods, we only need to worry about races
        // with the shrinker here.
        for i in 0..size {
            // SAFETY: The pointer is valid and it's the right shrinker.
            unsafe { PageInfo::list_lru_del(pages.add(i), self.shrinker) };
            // SAFETY: If the shrinker was going to free this page, then it would have taken it
            // from the PageInfo before releasing the lru lock. Thus, the call to `list_lru_del`
            // will either remove it before the shrinker can access it, or the shrinker will
            // already have taken the page at this point.
            unsafe { drop(PageInfo::take_page(pages.add(i))) };
        }

        // Wait for users of the mutex to go away. This call is necessary for the safety of
        // `stable_trylock_mm`.
        drop(self.mm_lock.lock());

        // SAFETY: This computation did not overflow when allocating the pages array, so it will
        // not overflow this time.
        let layout = unsafe { Layout::array::<PageInfo>(size).unwrap_unchecked() };

        // SAFETY: The `pages` array was allocated with the same layout.
        unsafe { alloc::alloc::dealloc(pages.cast(), layout) };
    }
}
/// The shrinker's `count_objects` callback: reports how many pages are reclaimable.
///
/// # Safety
///
/// Must only be invoked by the shrinker machinery for a shrinker set up via
/// [`Shrinker::register`].
#[no_mangle]
unsafe extern "C" fn rust_shrink_count(
    shrink: *mut bindings::shrinker,
    _sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: This method is only used with the `Shrinker` type, and the cast is valid since
    // `shrinker` is the first field of a #[repr(C)] struct.
    //
    // NOTE(review): `Shrinker::inner` holds a *pointer* to the heap object returned by
    // `shrinker_alloc`, not an embedded `bindings::shrinker`, and `register` does not store a
    // back-pointer (e.g. in `private_data`). It is not obvious how `shrink` maps back to the
    // `Shrinker` — confirm this cast against the registration path before relying on it.
    let shrinker = unsafe { &*shrink.cast::<Shrinker>() };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe { bindings::list_lru_count(shrinker.list_lru.get()) }
}
/// The shrinker's `scan_objects` callback: walks the lru and frees up to `nr_to_scan` pages
/// via `rust_shrink_free_page` (called through a C trampoline for CFI correctness).
///
/// # Safety
///
/// Must only be invoked by the shrinker machinery for a shrinker set up via
/// [`Shrinker::register`].
#[no_mangle]
unsafe extern "C" fn rust_shrink_scan(
    shrink: *mut bindings::shrinker,
    sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: This method is only used with the `Shrinker` type, and the cast is valid since
    // `shrinker` is the first field of a #[repr(C)] struct.
    //
    // NOTE(review): same concern as in `rust_shrink_count` — `Shrinker::inner` is a pointer to
    // the `shrinker_alloc` allocation, so verify how `shrink` maps back to the `Shrinker`.
    let shrinker = unsafe { &*shrink.cast::<Shrinker>() };
    // SAFETY: Caller guarantees that it is safe to read this field.
    let nr_to_scan = unsafe { (*sc).nr_to_scan };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe {
        extern "C" {
            fn rust_shrink_free_page_wrap(
                item: *mut bindings::list_head,
                list: *mut bindings::list_lru_one,
                lock: *mut bindings::spinlock_t,
                cb_arg: *mut core::ffi::c_void,
            ) -> bindings::lru_status;
        }
        bindings::list_lru_walk(
            shrinker.list_lru.get(),
            Some(rust_shrink_free_page_wrap),
            ptr::null_mut(),
            nr_to_scan,
        )
    }
}
/// Returned when an entry cannot be reclaimed right now (a trylock failed); the walk skips it.
const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP;
/// Returned when the entry was removed. The C value is `LRU_REMOVED_RETRY` because
/// `rust_shrink_free_page` drops and retakes the lru lock while unmapping the page.
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY;
/// Reclaims a single page: removes it from the lru, takes it out of its `PageInfo`, unmaps it
/// from the owning vma, and frees it.
///
/// Called with the lru lock held; the lock is dropped around the unmap and retaken before
/// returning (hence `LRU_REMOVED_ENTRY` maps to the C `RETRY` status).
///
/// # Safety
///
/// `item` must be the `lru` field of a `PageInfo`, and the lru lock must be held.
#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
    item: *mut bindings::list_head,
    lru: *mut bindings::list_lru_one,
    lru_lock: *mut bindings::spinlock_t,
    _cb_arg: *mut c_void,
) -> bindings::lru_status {
    // Fields that should survive after unlocking the lru lock.
    let pid;
    let page;
    let page_index;
    let mm;
    let mmap_read;
    let mm_mutex;
    let vma_addr;

    {
        // SAFETY: The `list_head` field is first in `PageInfo`.
        let info = item as *mut PageInfo;
        // SAFETY: The item is still linked into the lru (we hold the lru lock), and `PageInfo`
        // entries are only freed after being unlinked in `PinnedDrop::drop`, so `info` and its
        // `range` back-pointer are still valid here.
        let range = unsafe { &*((*info).range) };

        mm = match range.mm.mmget_not_zero() {
            Some(mm) => MmWithUser::into_mmput_async(mm),
            None => return LRU_SKIP,
        };

        mm_mutex = match range.stable_trylock_mm() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        mmap_read = match mm.mmap_read_trylock() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        // We can't lock it normally here, since we hold the lru lock.
        let inner = match range.lock.try_lock() {
            Some(inner) => inner,
            None => return LRU_SKIP,
        };

        // SAFETY: The item is in this lru list, so it's okay to remove it.
        unsafe { bindings::list_lru_isolate(lru, item) };

        // SAFETY: Both pointers are in bounds of the same allocation.
        page_index = unsafe { info.offset_from(inner.pages) } as usize;

        pid = range.pid;
        crate::trace::trace_unmap_kernel_start(pid, page_index);

        // SAFETY: We hold the spinlock, so we can take the page.
        //
        // This sets the page pointer to zero before we unmap it from the vma. However, we call
        // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to
        // insert a new page until after our call to `zap_page_range`.
        page = unsafe { PageInfo::take_page(info) };
        vma_addr = inner.vma_addr;
        crate::trace::trace_unmap_kernel_end(pid, page_index);

        // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
        // they can be freed at any point after we unlock `lru_lock`. This is with the exception of
        // `mm_mutex` which is kept alive by holding the lock.
    }

    // SAFETY: The lru lock is locked when this method is called.
    unsafe { bindings::spin_unlock(lru_lock) };

    if let Some(vma) = mmap_read.vma_lookup(vma_addr) {
        let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
        crate::trace::trace_unmap_user_start(pid, page_index);
        vma.zap_page_range_single(user_page_addr, PAGE_SIZE);
        crate::trace::trace_unmap_user_end(pid, page_index);
    }

    drop(mmap_read);
    drop(mm_mutex);
    drop(mm);
    drop(page);

    // SAFETY: We just unlocked the lru lock, but it should be locked when we return.
    unsafe { bindings::spin_lock(lru_lock) };

    LRU_REMOVED_ENTRY
}

View File

@@ -0,0 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* C helper for page_range.rs to work around a CFI violation.
*
* Bindgen currently pretends that `enum lru_status` is the same as an integer.
* This assumption is fine ABI-wise, but once you add CFI to the mix, it
* triggers a CFI violation because `enum lru_status` gets a different CFI tag.
*
* This file contains a workaround until bindgen can be fixed.
*
* Copyright (C) 2024 Google LLC.
*/
#include <linux/list_lru.h>
#include <linux/spinlock.h>
/* Rust callback implemented in page_range.rs; bindgen gives it an integer return type. */
unsigned int rust_shrink_free_page(struct list_head *item,
				   struct list_lru_one *list, spinlock_t *lock,
				   void *cb_arg);

/*
 * CFI-safe trampoline: gives the lru walk a callback whose type really returns
 * `enum lru_status`, then forwards to the Rust implementation.
 */
enum lru_status
rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list,
			   spinlock_t *lock, void *cb_arg)
{
	return rust_shrink_free_page(item, list, lock, cb_arg);
}

View File

@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! This module defines the types and methods relevant to priority inheritance.
use kernel::bindings;
/// A scheduler policy identifier (e.g. [`SCHED_NORMAL`], [`SCHED_FIFO`]).
pub(crate) type Policy = core::ffi::c_uint;
/// A kernel-internal scheduler priority value.
pub(crate) type Priority = core::ffi::c_int;
/// A userspace nice value, used with the fair scheduling policies.
pub(crate) type Nice = core::ffi::c_int;
// Scheduler constants re-exported from bindgen with the local type aliases applied.
pub(crate) const SCHED_NORMAL: Policy = bindings::SCHED_NORMAL;
pub(crate) const SCHED_FIFO: Policy = bindings::SCHED_FIFO;
pub(crate) const MIN_NICE: Nice = bindings::MIN_NICE as _;
pub(crate) const MAX_NICE: Nice = bindings::MAX_NICE as _;
pub(crate) const DEFAULT_PRIO: Priority = bindings::DEFAULT_PRIO as _;
pub(crate) const MAX_RT_PRIO: Priority = bindings::MAX_RT_PRIO as _;
/// Scheduler policy and priority.
///
/// The binder driver supports inheriting the following scheduler policies:
/// * SCHED_NORMAL
/// * SCHED_BATCH
/// * SCHED_FIFO
/// * SCHED_RR
#[derive(Copy, Clone, Default)]
pub(crate) struct BinderPriority {
    /// One of the supported `SCHED_*` policies.
    pub(crate) sched_policy: Policy,
    /// Priority value, interpreted according to `sched_policy`.
    pub(crate) prio: Priority,
}
/// State of a priority change on a thread.
// NOTE(review): variant semantics inferred from the names — confirm against the
// code that drives the priority-inheritance state machine.
#[derive(Copy, Clone, Eq, PartialEq)]
pub(crate) enum PriorityState {
    Set,
    Pending,
    Abort,
}
/// Returns the priority a binder node should default to for `task`.
///
/// If the task runs under a policy binder cannot inherit, fall back to
/// `SCHED_NORMAL` at the default priority.
pub(crate) fn get_default_prio_from_task(task: &kernel::task::Task) -> BinderPriority {
    let policy = task.policy();
    if !is_supported_policy(policy) {
        return BinderPriority {
            sched_policy: SCHED_NORMAL,
            prio: DEFAULT_PRIO,
        };
    }
    BinderPriority {
        sched_policy: policy,
        prio: task.normal_prio(),
    }
}
/// Returns whether `policy` is one of the real-time policies (`SCHED_FIFO` / `SCHED_RR`).
pub(crate) fn is_rt_policy(policy: Policy) -> bool {
    matches!(policy, bindings::SCHED_FIFO | bindings::SCHED_RR)
}
/// Returns whether `policy` is one of the fair policies (`SCHED_NORMAL` / `SCHED_BATCH`).
pub(crate) fn is_fair_policy(policy: Policy) -> bool {
    matches!(policy, bindings::SCHED_NORMAL | bindings::SCHED_BATCH)
}
/// Returns whether binder supports inheriting `policy` (fair or real-time).
pub(crate) fn is_supported_policy(policy: Policy) -> bool {
    is_rt_policy(policy) || is_fair_policy(policy)
}
/// Converts a kernel priority to the userspace representation for `policy`.
///
/// Fair policies map to a nice value relative to `DEFAULT_PRIO`; all other
/// policies use the real-time mapping counting down from `MAX_RT_PRIO - 1`.
pub(crate) fn to_userspace_prio(policy: Policy, prio: Priority) -> Nice {
    if !is_fair_policy(policy) {
        return MAX_RT_PRIO - 1 - prio;
    }
    prio - DEFAULT_PRIO
}
/// Converts a userspace priority back to the kernel representation for `policy`.
///
/// Inverse of [`to_userspace_prio`] for both the fair and real-time mappings.
pub(crate) fn to_kernel_prio(policy: Policy, prio: Nice) -> Priority {
    if !is_fair_policy(policy) {
        return MAX_RT_PRIO - 1 - prio;
    }
    prio + DEFAULT_PRIO
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,282 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
page::{PAGE_MASK, PAGE_SIZE},
prelude::*,
seq_file::SeqFile,
seq_print,
task::Pid,
};
use crate::range_alloc::{DescriptorState, FreedRange, Range};
/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(super) struct ArrayRangeAllocator<T> {
    /// This stores all ranges that are allocated. Unlike the tree based allocator, we do *not*
    /// store the free ranges.
    ///
    /// Sorted by offset.
    pub(super) ranges: Vec<Range<T>>,
    /// Total size of the mmap in bytes.
    size: usize,
    /// Bytes still available for oneway transactions (starts at `size / 2`).
    free_oneway_space: usize,
}
/// Where `find_empty_range` decided a new range should go.
struct FindEmptyRes {
    /// Which index in `ranges` should we insert the new range at?
    ///
    /// Inserting the new range at this index keeps `ranges` sorted.
    insert_at_idx: usize,
    /// Which offset should we insert the new range at?
    insert_at_offset: usize,
}
impl<T> ArrayRangeAllocator<T> {
    /// Creates an allocator for an mmap of `size` bytes, taking ownership of the
    /// preallocated (empty) ranges vector in `alloc`.
    pub(crate) fn new(size: usize, alloc: EmptyArrayAlloc<T>) -> Self {
        Self {
            ranges: alloc.ranges,
            size,
            // Oneway transactions may use at most half of the mmap.
            free_oneway_space: size / 2,
        }
    }
    /// Returns how many bytes are still available for oneway transactions.
    pub(crate) fn free_oneway_space(&self) -> usize {
        self.free_oneway_space
    }
    /// Returns the number of reserved/allocated buffers.
    pub(crate) fn count_buffers(&self) -> usize {
        self.ranges.len()
    }
    /// Returns the total size of the mmap being managed.
    pub(crate) fn total_size(&self) -> usize {
        self.size
    }
    /// Returns true when no further range can be stored; the caller must then switch
    /// to the tree-based allocator.
    pub(crate) fn is_full(&self) -> bool {
        self.ranges.len() == self.ranges.capacity()
    }
    /// Writes a human-readable description of every range to `m` (for debugfs).
    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        for range in &self.ranges {
            seq_print!(
                m,
                " buffer {}: {} size {} pid {} oneway {}",
                // NOTE(review): prints a literal 0 where the tree allocator has no
                // counterpart field — confirm this placeholder is intended.
                0,
                range.offset,
                range.size,
                range.state.pid(),
                range.state.is_oneway(),
            );
            if let DescriptorState::Reserved(_) = range.state {
                seq_print!(m, " reserved\n");
            } else {
                seq_print!(m, " allocated\n");
            }
        }
        Ok(())
    }
    /// Find somewhere to put a new range.
    ///
    /// Unlike the tree implementation, we do not bother to find the smallest gap. The idea is that
    /// fragmentation isn't a big issue when we don't have many ranges.
    ///
    /// Returns the index that the new range should have in `self.ranges` after insertion.
    fn find_empty_range(&self, size: usize) -> Option<FindEmptyRes> {
        let after_last_range = self.ranges.last().map(Range::endpoint).unwrap_or(0);
        if size <= self.total_size() - after_last_range {
            // We can put the range at the end, so just do that.
            Some(FindEmptyRes {
                insert_at_idx: self.ranges.len(),
                insert_at_offset: after_last_range,
            })
        } else {
            // Otherwise scan for the first gap between existing ranges that fits.
            let mut end_of_prev = 0;
            for (i, range) in self.ranges.iter().enumerate() {
                // Does it fit before the i'th range?
                if size <= range.offset - end_of_prev {
                    return Some(FindEmptyRes {
                        insert_at_idx: i,
                        insert_at_offset: end_of_prev,
                    });
                }
                end_of_prev = range.endpoint();
            }
            None
        }
    }
    /// Reserves a new buffer of `size` bytes, returning its offset within the mmap.
    ///
    /// Fails with `ENOSPC` if no large-enough gap exists or the oneway budget would be
    /// exceeded. The caller ensures the array is not full before calling (see
    /// `RangeAllocator::reserve_new`), so `insert_within_capacity` cannot panic.
    pub(crate) fn reserve_new(
        &mut self,
        debug_id: usize,
        size: usize,
        is_oneway: bool,
        pid: Pid,
    ) -> Result<usize> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };
        let FindEmptyRes {
            insert_at_idx,
            insert_at_offset,
        } = self.find_empty_range(size).ok_or(ENOSPC)?;
        self.free_oneway_space = new_oneway_space;
        let new_range = Range {
            offset: insert_at_offset,
            size,
            state: DescriptorState::new(is_oneway, debug_id, pid),
        };
        // Insert the value at the given index to keep the array sorted.
        insert_within_capacity(&mut self.ranges, insert_at_idx, new_range);
        Ok(insert_at_offset)
    }
    /// Removes the reservation at `offset`, returning the pages that thereby became
    /// completely unused. Fails with `EPERM` if the range is allocated (in use).
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        // This could use a binary search, but linear scans are usually faster for small arrays.
        let i = self
            .ranges
            .iter()
            .position(|range| range.offset == offset)
            .ok_or(EINVAL)?;
        let range = &self.ranges[i];
        if let DescriptorState::Allocated(_) = range.state {
            return Err(EPERM);
        }
        let size = range.size;
        let offset = range.offset;
        if range.state.is_oneway() {
            self.free_oneway_space += size;
        }
        // This computes the range of pages that are no longer used by *any* allocated range. The
        // caller will mark them as unused, which means that they can be freed if the system comes
        // under memory pressure.
        let mut freed_range = FreedRange::interior_pages(offset, size);
        if offset % PAGE_SIZE != 0 {
            // The range starts mid-page; the partial start page is also free if the
            // previous range (if any) ends on or before that page's start.
            if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) {
                freed_range.start_page_idx -= 1;
            }
        }
        if range.endpoint() % PAGE_SIZE != 0 {
            // Likewise for a partial end page: free it if the next range (if any)
            // starts on a later page.
            let page_after = (range.endpoint() & PAGE_MASK) + PAGE_SIZE;
            if i + 1 == self.ranges.len() || page_after <= self.ranges[i + 1].offset {
                freed_range.end_page_idx += 1;
            }
        }
        self.ranges.remove(i);
        Ok(freed_range)
    }
    /// Transitions the reservation at `offset` to the allocated state, attaching `data`.
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        // This could use a binary search, but linear scans are usually faster for small arrays.
        let range = self
            .ranges
            .iter_mut()
            .find(|range| range.offset == offset)
            .ok_or(ENOENT)?;
        let DescriptorState::Reserved(reservation) = &range.state else {
            return Err(ENOENT);
        };
        range.state = DescriptorState::Allocated(reservation.clone().allocate(data));
        Ok(())
    }
    /// Transitions the allocation at `offset` back to the reserved state.
    ///
    /// Returns the range's size, its debug id, and the data that was attached to it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        // This could use a binary search, but linear scans are usually faster for small arrays.
        let range = self
            .ranges
            .iter_mut()
            .find(|range| range.offset == offset)
            .ok_or(ENOENT)?;
        let DescriptorState::Allocated(allocation) = &mut range.state else {
            return Err(ENOENT);
        };
        let data = allocation.take();
        let debug_id = allocation.reservation.debug_id;
        range.state = DescriptorState::Reserved(allocation.reservation.clone());
        Ok((range.size, debug_id, data))
    }
    /// Calls `callback` with (offset, size, debug_id, data) for every allocated range,
    /// taking the data out of each. Used during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        for range in self.ranges.iter_mut() {
            if let DescriptorState::Allocated(allocation) = &mut range.state {
                callback(
                    range.offset,
                    range.size,
                    allocation.reservation.debug_id,
                    allocation.data.take(),
                );
            }
        }
    }
}
/// A preallocated, empty `ranges` vector for an `ArrayRangeAllocator`.
///
/// Created up front (see `ReserveNewNeedAlloc::make_alloc`) so that
/// `RangeAllocator::reserve_new` can construct the array allocator without allocating.
pub(crate) struct EmptyArrayAlloc<T> {
    ranges: Vec<Range<T>>,
}
impl<T> EmptyArrayAlloc<T> {
    /// Allocates storage for up to `capacity` ranges.
    pub(crate) fn try_new(capacity: usize) -> Result<Self> {
        Ok(Self {
            ranges: Vec::with_capacity(capacity, GFP_KERNEL)?,
        })
    }
}
/// Copied from `rust/alloc/vec/mod.rs` with allocation codepath removed.
///
/// TODO: Either add this to the standard library (like [`push_within_capacity`]) or move it to the
/// kernel crate once [the updated allocation APIs][alloc] are available.
///
/// [`push_within_capacity`]: https://github.com/rust-lang/rust/issues/100486
/// [alloc]: https://lore.kernel.org/r/20240328013603.206764-1-wedsonaf@gmail.com
///
/// # Panics
///
/// Panics if the vector is already at capacity, or if `index > vec.len()`.
fn insert_within_capacity<T>(vec: &mut Vec<T>, index: usize, element: T) {
    let len = vec.len();
    if len == vec.capacity() {
        panic!("no capacity to insert");
    }
    // SAFETY: The check above guarantees `len < capacity`, so there is room for one
    // more element past the end. `set_len(len + 1)` is only reached after the slot at
    // `index` has been written and the elements in `index..len` shifted up by one, so
    // all `len + 1` elements are initialized.
    unsafe {
        // The spot to put the new value
        {
            let p = vec.as_mut_ptr().add(index);
            if index < len {
                // Shift everything over to make space. (Duplicating the
                // `index`th element into two consecutive places.)
                core::ptr::copy(p, p.add(1), len - index);
            } else if index == len {
                // No elements need shifting.
            } else {
                panic!("insertion index (is {index}) should be <= len (is {len})");
            }
            // Write it in, overwriting the first copy of the `index`th
            // element.
            core::ptr::write(p, element);
        }
        vec.set_len(len + 1);
    }
}

View File

@@ -0,0 +1,326 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{page::PAGE_SIZE, prelude::*, seq_file::SeqFile, task::Pid};
mod tree;
use self::tree::{FromArrayAllocs, ReserveNewTreeAlloc, TreeRangeAllocator};
mod array;
use self::array::{ArrayRangeAllocator, EmptyArrayAlloc};
/// The lifecycle state of a buffer range, shared by the array and tree allocators.
///
/// A range is first `Reserved`, then `Allocated` once the kernel commits data to it
/// (and may go back to `Reserved` via `reserve_existing`).
enum DescriptorState<T> {
    Reserved(Reservation),
    Allocated(Allocation<T>),
}
impl<T> DescriptorState<T> {
fn new(is_oneway: bool, debug_id: usize, pid: Pid) -> Self {
DescriptorState::Reserved(Reservation {
debug_id,
is_oneway,
pid,
})
}
fn pid(&self) -> Pid {
match self {
DescriptorState::Reserved(inner) => inner.pid,
DescriptorState::Allocated(inner) => inner.reservation.pid,
}
}
fn is_oneway(&self) -> bool {
match self {
DescriptorState::Reserved(inner) => inner.is_oneway,
DescriptorState::Allocated(inner) => inner.reservation.is_oneway,
}
}
}
/// Metadata recorded when a buffer range is reserved; kept for the range's lifetime.
#[derive(Clone)]
struct Reservation {
    debug_id: usize,
    is_oneway: bool,
    pid: Pid,
}
impl Reservation {
    /// Promotes this reservation into an [`Allocation`] carrying `data`.
    fn allocate<T>(self, data: Option<T>) -> Allocation<T> {
        Allocation {
            reservation: self,
            data,
        }
    }
}
/// An allocated buffer range: the original reservation metadata plus optional payload.
struct Allocation<T> {
    reservation: Reservation,
    data: Option<T>,
}
impl<T> Allocation<T> {
fn deallocate(self) -> (Reservation, Option<T>) {
(self.reservation, self.data)
}
fn debug_id(&self) -> usize {
self.reservation.debug_id
}
fn take(&mut self) -> Option<T> {
self.data.take()
}
}
/// The array implementation must switch to the tree if it wants to go beyond this number of
/// ranges.
const TREE_THRESHOLD: usize = 8;
/// Represents a range of pages that have just become completely free.
#[derive(Copy, Clone)]
pub(crate) struct FreedRange {
    pub(crate) start_page_idx: usize,
    pub(crate) end_page_idx: usize,
}
impl FreedRange {
    /// Returns the pages that lie entirely inside `[offset, offset + size)`.
    ///
    /// Partial pages at either end are *not* included; callers widen the range when the
    /// neighboring descriptors show those pages are unused too.
    fn interior_pages(offset: usize, size: usize) -> FreedRange {
        FreedRange {
            // Divide round up
            start_page_idx: (offset + (PAGE_SIZE - 1)) / PAGE_SIZE,
            // Divide round down
            end_page_idx: (offset + size) / PAGE_SIZE,
        }
    }
}
/// A contiguous region of the mmap tracked by the array allocator.
struct Range<T> {
    offset: usize,
    size: usize,
    state: DescriptorState<T>,
}
impl<T> Range<T> {
    /// Offset of the first byte after this range.
    fn endpoint(&self) -> usize {
        self.offset + self.size
    }
}
/// Tracks buffer allocations in a process' mmap, delegating to one of three
/// representations depending on how many buffers exist.
pub(crate) struct RangeAllocator<T> {
    inner: Impl<T>,
}
/// The representation currently in use by a [`RangeAllocator`].
enum Impl<T> {
    /// No buffers allocated; the payload is the total mmap size.
    Empty(usize),
    /// Up to `TREE_THRESHOLD` buffers, stored in a sorted array.
    Array(ArrayRangeAllocator<T>),
    /// Arbitrarily many buffers, stored in red-black trees.
    Tree(TreeRangeAllocator<T>),
}
impl<T> RangeAllocator<T> {
    /// Creates an allocator for an mmap of `size` bytes with nothing allocated yet.
    pub(crate) fn new(size: usize) -> Self {
        Self {
            inner: Impl::Empty(size),
        }
    }
    /// Returns how many bytes are still available to oneway transactions.
    pub(crate) fn free_oneway_space(&self) -> usize {
        match &self.inner {
            // Oneway transactions may use at most half of the mmap.
            Impl::Empty(size) => size / 2,
            Impl::Array(array) => array.free_oneway_space(),
            Impl::Tree(tree) => tree.free_oneway_space(),
        }
    }
    /// Returns the number of reserved or allocated buffers.
    pub(crate) fn count_buffers(&self) -> usize {
        match &self.inner {
            Impl::Empty(_size) => 0,
            Impl::Array(array) => array.count_buffers(),
            Impl::Tree(tree) => tree.count_buffers(),
        }
    }
    /// Writes a description of every buffer to `m` (for debugfs).
    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        match &self.inner {
            Impl::Empty(_size) => Ok(()),
            Impl::Array(array) => array.debug_print(m),
            Impl::Tree(tree) => tree.debug_print(m),
        }
    }
    /// Try to reserve a new buffer, using the provided allocation if necessary.
    ///
    /// If a required allocation is missing from `args`, returns `ReserveNew::NeedAlloc`
    /// so the caller can allocate (via `ReserveNewNeedAlloc::make_alloc`) and retry.
    pub(crate) fn reserve_new(&mut self, mut args: ReserveNewArgs<T>) -> Result<ReserveNew<T>> {
        match &mut self.inner {
            // First buffer: upgrade from `Empty` to the array-based allocator.
            Impl::Empty(size) => {
                let empty_array = match args.empty_array_alloc.take() {
                    Some(empty_array) => ArrayRangeAllocator::new(*size, empty_array),
                    None => {
                        return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                            args,
                            need_empty_array_alloc: true,
                            need_new_tree_alloc: false,
                            need_tree_alloc: false,
                        }))
                    }
                };
                self.inner = Impl::Array(empty_array);
                // Retry now that the array allocator exists.
                self.reserve_new(args)
            }
            // The array is full: migrate every range into the tree-based allocator.
            Impl::Array(array) if array.is_full() => {
                let allocs = match args.new_tree_alloc {
                    Some(ref mut allocs) => allocs,
                    None => {
                        return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                            args,
                            need_empty_array_alloc: false,
                            need_new_tree_alloc: true,
                            // The tree will also need nodes for the new reservation itself.
                            need_tree_alloc: true,
                        }))
                    }
                };
                let new_tree =
                    TreeRangeAllocator::from_array(array.total_size(), &mut array.ranges, allocs);
                self.inner = Impl::Tree(new_tree);
                // Retry against the tree allocator.
                self.reserve_new(args)
            }
            Impl::Array(array) => {
                let offset =
                    array.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid)?;
                Ok(ReserveNew::Success(ReserveNewSuccess {
                    offset,
                    // Oneway spam detection only happens in the tree implementation.
                    oneway_spam_detected: false,
                    _empty_array_alloc: args.empty_array_alloc,
                    _new_tree_alloc: args.new_tree_alloc,
                    _tree_alloc: args.tree_alloc,
                }))
            }
            Impl::Tree(tree) => {
                let alloc = match args.tree_alloc {
                    Some(alloc) => alloc,
                    None => {
                        return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc {
                            args,
                            need_empty_array_alloc: false,
                            need_new_tree_alloc: false,
                            need_tree_alloc: true,
                        }));
                    }
                };
                let (offset, oneway_spam_detected) =
                    tree.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid, alloc)?;
                Ok(ReserveNew::Success(ReserveNewSuccess {
                    offset,
                    oneway_spam_detected,
                    _empty_array_alloc: args.empty_array_alloc,
                    _new_tree_alloc: args.new_tree_alloc,
                    // The tree allocation was consumed by `tree.reserve_new`.
                    _tree_alloc: None,
                }))
            }
        }
    }
    /// Deletes the allocations at `offset`.
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        match &mut self.inner {
            Impl::Empty(_size) => Err(EINVAL),
            Impl::Array(array) => array.reservation_abort(offset),
            Impl::Tree(tree) => {
                let freed_range = tree.reservation_abort(offset)?;
                // Downgrade back to `Empty` once the last range is gone.
                if tree.is_empty() {
                    self.inner = Impl::Empty(tree.total_size());
                }
                Ok(freed_range)
            }
        }
    }
    /// Called when an allocation is no longer in use by the kernel.
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        match &mut self.inner {
            Impl::Empty(_size) => Err(EINVAL),
            Impl::Array(array) => array.reservation_commit(offset, data),
            Impl::Tree(tree) => tree.reservation_commit(offset, data),
        }
    }
    /// Called when the kernel starts using an allocation.
    ///
    /// Returns the size of the existing entry and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        match &mut self.inner {
            Impl::Empty(_size) => Err(EINVAL),
            Impl::Array(array) => array.reserve_existing(offset),
            Impl::Tree(tree) => tree.reserve_existing(offset),
        }
    }
    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        match &mut self.inner {
            Impl::Empty(_size) => {}
            Impl::Array(array) => array.take_for_each(callback),
            Impl::Tree(tree) => tree.take_for_each(callback),
        }
    }
}
/// The arguments for `reserve_new`.
#[derive(Default)]
pub(crate) struct ReserveNewArgs<T> {
    /// Requested buffer size in bytes.
    pub(crate) size: usize,
    /// Whether the buffer is for a oneway transaction.
    pub(crate) is_oneway: bool,
    /// Debug id to record in the new reservation.
    pub(crate) debug_id: usize,
    /// Pid to record in the new reservation.
    pub(crate) pid: Pid,
    /// Preallocation for upgrading `Empty` to `Array`.
    pub(crate) empty_array_alloc: Option<EmptyArrayAlloc<T>>,
    /// Preallocation for migrating `Array` to `Tree`.
    pub(crate) new_tree_alloc: Option<FromArrayAllocs<T>>,
    /// Preallocated tree nodes for inserting the new reservation into the tree.
    pub(crate) tree_alloc: Option<ReserveNewTreeAlloc<T>>,
}
/// The return type of `reserve_new`.
pub(crate) enum ReserveNew<T> {
    /// The reservation was made.
    Success(ReserveNewSuccess<T>),
    /// The caller must allocate memory and retry.
    NeedAlloc(ReserveNewNeedAlloc<T>),
}
/// Returned by `reserve_new` when the reservation was successful.
pub(crate) struct ReserveNewSuccess<T> {
    /// Offset of the new buffer within the mmap.
    pub(crate) offset: usize,
    /// Whether this reservation tripped the oneway spam heuristic.
    pub(crate) oneway_spam_detected: bool,
    // If the user supplied an allocation that we did not end up using, then we return it here.
    // The caller will kfree it outside of the lock.
    _empty_array_alloc: Option<EmptyArrayAlloc<T>>,
    _new_tree_alloc: Option<FromArrayAllocs<T>>,
    _tree_alloc: Option<ReserveNewTreeAlloc<T>>,
}
/// Returned by `reserve_new` to request the caller to make an allocation before calling the method
/// again.
pub(crate) struct ReserveNewNeedAlloc<T> {
    /// The original arguments, handed back so the caller can retry.
    args: ReserveNewArgs<T>,
    // Which of the three preallocations `make_alloc` must perform.
    need_empty_array_alloc: bool,
    need_new_tree_alloc: bool,
    need_tree_alloc: bool,
}
impl<T> ReserveNewNeedAlloc<T> {
/// Make the necessary allocations for another call to `reserve_new`.
pub(crate) fn make_alloc(mut self) -> Result<ReserveNewArgs<T>> {
if self.need_empty_array_alloc && self.args.empty_array_alloc.is_none() {
self.args.empty_array_alloc = Some(EmptyArrayAlloc::try_new(TREE_THRESHOLD)?);
}
if self.need_new_tree_alloc && self.args.new_tree_alloc.is_none() {
self.args.new_tree_alloc = Some(FromArrayAllocs::try_new(TREE_THRESHOLD)?);
}
if self.need_tree_alloc && self.args.tree_alloc.is_none() {
self.args.tree_alloc = Some(ReserveNewTreeAlloc::try_new()?);
}
Ok(self.args)
}
}

View File

@@ -0,0 +1,500 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use kernel::{
page::PAGE_SIZE,
prelude::*,
rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
seq_file::SeqFile,
seq_print,
task::Pid,
};
use crate::range_alloc::{DescriptorState, FreedRange, Range};
/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(super) struct TreeRangeAllocator<T> {
    /// This collection contains descriptors for *both* ranges containing an allocation, *and* free
    /// ranges between allocations. The free ranges get merged, so there are never two free ranges
    /// next to each other.
    tree: RBTree<usize, Descriptor<T>>,
    /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size,
    /// letting us look up the smallest range whose size is at least some lower bound.
    free_tree: RBTree<FreeKey, ()>,
    /// Total size of the mmap in bytes.
    size: usize,
    /// Bytes still available for oneway transactions (starts at `size / 2`).
    free_oneway_space: usize,
}
impl<T> TreeRangeAllocator<T> {
    /// Builds a tree allocator from the ranges of a full array allocator, draining them.
    ///
    /// `alloc` must hold enough node reservations for every range plus the free gaps
    /// between them; `FromArrayAllocs::try_new` guarantees this, so the `unwrap` calls
    /// below cannot fail.
    pub(crate) fn from_array(
        size: usize,
        ranges: &mut Vec<Range<T>>,
        alloc: &mut FromArrayAllocs<T>,
    ) -> Self {
        let mut tree = TreeRangeAllocator {
            tree: RBTree::new(),
            free_tree: RBTree::new(),
            size,
            free_oneway_space: size / 2,
        };
        let mut free_offset = 0;
        for range in ranges.drain(..) {
            // Insert a free descriptor for the gap before this range, if any.
            let free_size = range.offset - free_offset;
            if free_size > 0 {
                let free_node = alloc.free_tree.pop().unwrap();
                tree.free_tree
                    .insert(free_node.into_node((free_size, free_offset), ()));
                let tree_node = alloc.tree.pop().unwrap();
                tree.tree.insert(
                    tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)),
                );
            }
            free_offset = range.endpoint();
            // Re-account the oneway space already consumed by this range.
            if range.state.is_oneway() {
                tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size);
            }
            let free_res = alloc.free_tree.pop().unwrap();
            let tree_node = alloc.tree.pop().unwrap();
            let mut desc = Descriptor::new(range.offset, range.size);
            desc.state = Some((range.state, free_res));
            tree.tree.insert(tree_node.into_node(range.offset, desc));
        }
        // After the last range, we may need a free range.
        if free_offset < size {
            let free_size = size - free_offset;
            let free_node = alloc.free_tree.pop().unwrap();
            tree.free_tree
                .insert(free_node.into_node((free_size, free_offset), ()));
            let tree_node = alloc.tree.pop().unwrap();
            tree.tree
                .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)));
        }
        tree
    }
    /// Returns true when no range is reserved or allocated (only free space remains).
    pub(crate) fn is_empty(&self) -> bool {
        let mut tree_iter = self.tree.values();
        // There's always at least one range, because index zero is either the start of a free or
        // allocated range.
        let first_value = tree_iter.next().unwrap();
        if tree_iter.next().is_some() {
            // There are never two free ranges next to each other, so if there is more than one
            // descriptor, then at least one of them must hold an allocated range.
            return false;
        }
        // There is only one descriptor. Return true if it is for a free range.
        first_value.state.is_none()
    }
    /// Returns the total size of the mmap being managed.
    pub(crate) fn total_size(&self) -> usize {
        self.size
    }
    /// Returns how many bytes are still available to oneway transactions.
    pub(crate) fn free_oneway_space(&self) -> usize {
        self.free_oneway_space
    }
    /// Returns the number of reserved or allocated ranges (free ranges are not counted).
    pub(crate) fn count_buffers(&self) -> usize {
        self.tree
            .values()
            .filter(|desc| desc.state.is_some())
            .count()
    }
    /// Writes a description of every non-free range to `m` (for debugfs).
    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => &state.0,
                None => continue,
            };
            seq_print!(
                m,
                "  buffer: {} size {} pid {}",
                desc.offset,
                desc.size,
                state.pid(),
            );
            if state.is_oneway() {
                seq_print!(m, " oneway");
            }
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }
    /// Finds the smallest free range that can hold `size` bytes (best-fit).
    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        // Free ranges are keyed by (size, offset), so the lower bound of (size, 0) is
        // the smallest free range with room for `size` bytes.
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), _) = free_cursor.current();
        self.tree.get_mut(offset)
    }
    /// Try to reserve a new buffer, using the provided allocation if necessary.
    pub(crate) fn reserve_new(
        &mut self,
        debug_id: usize,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewTreeAlloc<T>,
    ) -> Result<(usize, bool)> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };
        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
        let oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);
        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;
                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);
                desc.state = Some((
                    DescriptorState::new(is_oneway, debug_id, pid),
                    desc_node_res,
                ));
                desc.size = size;
                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        // The chosen range is no longer free (or no longer of this size).
        self.free_tree.remove(&(found_size, found_off));
        // If the free range was larger than requested, insert the remainder as a new
        // free range right after the reservation.
        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }
        Ok((found_off, oneway_spam_detected))
    }
    /// Removes the reservation at `offset`, merging the freed space with any adjacent
    /// free ranges, and returns the pages that became completely unused.
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;
        let (_, desc) = cursor.current_mut();
        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }
        // Only reservations may be aborted; allocated ranges are in use by the kernel.
        let (reservation, free_node_res) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => {
                (None, Ok((reservation, free_node_res)))
            }
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;
        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };
        self.free_oneway_space += free_oneway_space_add;
        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };
        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };
        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }
        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };
        // Register the merged free range, reusing the reservation stored in the
        // descriptor when it was created.
        self.free_tree
            .insert(free_node_res.into_node((size, offset), ()));
        Ok(freed_range)
    }
    /// Transitions the reservation at `offset` to the allocated state, attaching `data`.
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reservation_commit - offset: {}",
                offset
            );
            ENOENT
        })?;
        desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => (
                Some((
                    DescriptorState::Allocated(reservation.allocate(data)),
                    free_node_res,
                )),
                Ok(()),
            ),
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reservation_commit - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })
    }
    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;
        let (debug_id, data) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Allocated(allocation), free_node_res)) => {
                let (reservation, data) = allocation.deallocate();
                let debug_id = reservation.debug_id;
                (
                    Some((DescriptorState::Reserved(reservation), free_node_res)),
                    Ok((debug_id, data)),
                )
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;
        Ok((desc.size, debug_id, data))
    }
    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state {
                callback(
                    desc.offset,
                    desc.size,
                    allocation.debug_id(),
                    allocation.take(),
                );
            }
        }
    }
    /// Find the amount and size of buffers allocated by the current caller.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act.  This is more efficient
    /// than keeping a map per pid.
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some((state, _)) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }
        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}
/// A descriptor's state together with the free-tree node reservation that will be used
/// when the range is freed and reinserted into the free tree (see `reservation_abort`).
type TreeDescriptorState<T> = (DescriptorState<T>, FreeNodeRes);
/// A node of the main tree: one contiguous range of the mmap, free when `state` is
/// `None`, reserved/allocated when it is `Some`.
struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<TreeDescriptorState<T>>,
}
impl<T> Descriptor<T> {
fn new(offset: usize, size: usize) -> Self {
Self {
size,
offset,
state: None,
}
}
fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
where
F: FnOnce(Option<TreeDescriptorState<T>>) -> (Option<TreeDescriptorState<T>>, Result<Data>),
{
let (new_state, result) = f(self.state.take());
self.state = new_state;
result
}
}
// (Descriptor.size, Descriptor.offset)
type FreeKey = (usize, usize);
// A reservation for one node of the free tree.
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;
/// An allocation for use by `reserve_new`.
///
/// Holds the three node reservations a single tree reservation may need: a main-tree
/// node and a free-tree node for the leftover free space, plus a free-tree reservation
/// stored inside the new descriptor for when it is eventually freed.
pub(crate) struct ReserveNewTreeAlloc<T> {
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    free_tree_node_res: FreeNodeRes,
    desc_node_res: FreeNodeRes,
}
impl<T> ReserveNewTreeAlloc<T> {
    /// Allocates the three node reservations needed by `reserve_new`.
    pub(crate) fn try_new() -> Result<Self> {
        Ok(Self {
            tree_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
            free_tree_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
            desc_node_res: RBTreeNodeReservation::new(GFP_KERNEL)?,
        })
    }
    /// Turns the reservations into nodes keyed for `desc`, returning the main-tree
    /// node, the matching free-tree node, and the spare free-tree reservation.
    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let (offset, size) = (desc.offset, desc.size);
        let Self {
            tree_node_res,
            free_tree_node_res,
            desc_node_res,
        } = self;
        (
            tree_node_res.into_node(offset, desc),
            free_tree_node_res.into_node((size, offset), ()),
            desc_node_res,
        )
    }
}
/// An allocation for creating a tree from an `ArrayRangeAllocator`.
pub(crate) struct FromArrayAllocs<T> {
    /// Main-tree node reservations, one per descriptor in the worst case.
    tree: Vec<RBTreeNodeReservation<usize, Descriptor<T>>>,
    /// Free-tree node reservations matching `tree`.
    free_tree: Vec<RBTreeNodeReservation<FreeKey, ()>>,
}
impl<T> FromArrayAllocs<T> {
    /// Preallocates enough node reservations to convert an array allocator holding up
    /// to `len` ranges into a tree allocator.
    pub(crate) fn try_new(len: usize) -> Result<Self> {
        // Worst case: one free range between each pair of allocated ranges, plus one
        // free range at each end of the mmap.
        let num_descriptors = 2 * len + 1;
        let mut tree = Vec::with_capacity(num_descriptors, GFP_KERNEL)?;
        let mut free_tree = Vec::with_capacity(num_descriptors, GFP_KERNEL)?;
        for _ in 0..num_descriptors {
            tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
            free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }
        Ok(Self { tree, free_tree })
    }
}

View File

@@ -0,0 +1,618 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Binder -- the Android IPC mechanism.
#![recursion_limit = "256"]
use kernel::{
bindings::{self, seq_file},
fs::File,
list::{HasListLinks, ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc},
prelude::*,
seq_file::SeqFile,
seq_print,
sync::poll::PollTable,
sync::Arc,
task::Pid,
types::{AsBytes, ForeignOwnable},
uaccess::UserSliceWriter,
};
use crate::{context::Context, page_range::Shrinker, process::Process, thread::Thread};
use core::{
ptr::NonNull,
sync::atomic::{AtomicBool, AtomicUsize, Ordering},
};
mod allocation;
mod context;
mod deferred_close;
mod defs;
mod error;
mod node;
mod page_range;
mod prio;
mod process;
mod range_alloc;
mod stats;
mod thread;
mod trace;
mod transaction;
#[allow(warnings)] // generated bindgen code
mod binderfs {
    use kernel::bindings::{dentry, inode};
    extern "C" {
        // Registers the binderfs filesystem; implemented on the C side.
        pub fn init_rust_binderfs() -> core::ffi::c_int;
    }
    extern "C" {
        // NOTE(review): semantics inferred from the name (creates the per-pid proc
        // file under `nodp`) — confirm against the C implementation.
        pub fn rust_binderfs_create_proc_file(
            nodp: *mut inode,
            pid: core::ffi::c_int,
        ) -> *mut dentry;
    }
    extern "C" {
        // Removes a file previously created by `rust_binderfs_create_proc_file`.
        pub fn rust_binderfs_remove_file(dentry: *mut dentry);
    }
    // Opaque context pointer exchanged with the C binderfs code.
    pub type rust_binder_context = *mut core::ffi::c_void;
    /// Mirror of the C `struct binder_device` layout used by binderfs.
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct binder_device {
        pub minor: core::ffi::c_int,
        pub ctx: rust_binder_context,
    }
    impl Default for binder_device {
        fn default() -> Self {
            let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
            // SAFETY: All-zero bytes are a valid `binder_device`: it is a `repr(C)`
            // struct of an integer and a raw pointer, both of which permit the
            // all-zero bit pattern.
            unsafe {
                ::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
                s.assume_init()
            }
        }
    }
}
// Kernel module registration; these fields populate modinfo and hook up
// `BinderModule::init` as the module initializer.
module! {
    type: BinderModule,
    name: "rust_binder",
    author: "Wedson Almeida Filho, Alice Ryhl",
    description: "Android Binder",
    license: "GPL",
}
/// Hands out driver-wide unique debug identifiers.
///
/// Ids start at zero and increase by one per call; the relaxed ordering is
/// sufficient because uniqueness is the only requirement.
fn next_debug_id() -> usize {
    static COUNTER: AtomicUsize = AtomicUsize::new(0);

    COUNTER.fetch_add(1, Ordering::Relaxed)
}
/// Provides a single place to write Binder return values via the
/// supplied `UserSliceWriter`.
pub(crate) struct BinderReturnWriter<'a> {
    writer: UserSliceWriter,
    thread: &'a Thread,
}
impl<'a> BinderReturnWriter<'a> {
    /// Wraps `writer` so that all return codes for `thread` are counted and traced.
    fn new(writer: UserSliceWriter, thread: &'a Thread) -> Self {
        BinderReturnWriter { writer, thread }
    }
    /// Write a return code back to user space.
    /// Should be a `BR_` constant from [`defs`] e.g. [`defs::BR_TRANSACTION_COMPLETE`].
    fn write_code(&mut self, code: u32) -> Result {
        // Record the code in the tracepoint and in both the global and the
        // per-process statistics before handing it to user space.
        crate::trace::trace_return(code);
        stats::GLOBAL_STATS.inc_br(code);
        self.thread.process.stats.inc_br(code);
        self.writer.write(&code)
    }
    /// Write something *other than* a return code to user space.
    fn write_payload<T: AsBytes>(&mut self, payload: &T) -> Result {
        self.writer.write(payload)
    }
    // NOTE(review): assumed to be the number of bytes remaining in the
    // user-space buffer -- confirm against `UserSliceWriter::len`.
    fn len(&self) -> usize {
        self.writer.len()
    }
}
/// Specifies how a type should be delivered to the read part of a BINDER_WRITE_READ ioctl.
///
/// When a value is pushed to the todo list for a process or thread, it is stored as a trait object
/// with the type `Arc<dyn DeliverToRead>`. Trait objects are a Rust feature that lets you
/// implement dynamic dispatch over many different types. This lets us store many different types
/// in the todo list.
trait DeliverToRead: ListArcSafe + Send + Sync {
    /// Performs work. Returns true if remaining work items in the queue should be processed
    /// immediately, or false if it should return to caller before processing additional work
    /// items.
    fn do_work(
        self: DArc<Self>,
        thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool>;
    /// Cancels the given work item. This is called instead of [`DeliverToRead::do_work`] when work
    /// won't be delivered.
    fn cancel(self: DArc<Self>);
    /// Called when a work item is delivered directly to a specific thread, rather than to the
    /// process work list.
    fn on_thread_selected(&self, _thread: &thread::Thread);
    /// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this
    /// work item?
    ///
    /// Generally only set to true for non-oneway transactions.
    fn should_sync_wakeup(&self) -> bool;
    /// Writes a human-readable description of this work item to `m` for the
    /// binderfs state files; `prefix` and `transaction_prefix` are the
    /// indentation strings chosen by the caller.
    fn debug_print(&self, m: &SeqFile, prefix: &str, transaction_prefix: &str) -> Result<()>;
}
/// Wrapper around a `DeliverToRead` with linked list links.
#[pin_data]
struct DTRWrap<T: ?Sized> {
    // Intrusive links used to enqueue this item on a todo list.
    #[pin]
    links: ListLinksSelfPtr<DTRWrap<dyn DeliverToRead>>,
    // The actual work item.
    #[pin]
    wrapped: T,
}
kernel::list::impl_has_list_links_self_ptr! {
    impl HasSelfPtr<DTRWrap<dyn DeliverToRead>> for DTRWrap<dyn DeliverToRead> { self.links }
}
kernel::list::impl_list_arc_safe! {
    impl{T: ListArcSafe + ?Sized} ListArcSafe<0> for DTRWrap<T> {
        tracked_by wrapped: T;
    }
}
kernel::list::impl_list_item! {
    impl ListItem<0> for DTRWrap<dyn DeliverToRead> {
        using ListLinksSelfPtr;
    }
}
impl<T: ?Sized> core::ops::Deref for DTRWrap<T> {
    type Target = T;
    fn deref(&self) -> &T {
        &self.wrapped
    }
}
/// Ref-counted work item (shared reference).
type DArc<T> = kernel::sync::Arc<DTRWrap<T>>;
/// Like [`DArc`], but additionally tracks the single reference that may sit on a list.
type DLArc<T> = kernel::list::ListArc<DTRWrap<T>>;
impl<T: ListArcSafe> DTRWrap<T> {
    /// Returns a pinned initializer wrapping `val` with fresh (unlinked) list links.
    fn new(val: impl PinInit<T>) -> impl PinInit<Self> {
        pin_init!(Self {
            links <- ListLinksSelfPtr::new(),
            wrapped <- val,
        })
    }
    /// Allocates a new list-enabled work item from an already constructed value.
    ///
    /// Maps any initialization failure to `AllocError` so callers can treat it as ENOMEM.
    fn arc_try_new(val: T) -> Result<DLArc<T>, kernel::alloc::AllocError> {
        ListArc::pin_init(
            try_pin_init!(Self {
                links <- ListLinksSelfPtr::new(),
                wrapped: val,
            }),
            GFP_KERNEL,
        )
        .map_err(|_| kernel::alloc::AllocError)
    }
    /// Allocates a new list-enabled work item using a pinned initializer for the payload.
    fn arc_pin_init(init: impl PinInit<T>) -> Result<DLArc<T>, kernel::error::Error> {
        ListArc::pin_init(
            try_pin_init!(Self {
                links <- ListLinksSelfPtr::new(),
                wrapped <- init,
            }),
            GFP_KERNEL,
        )
    }
}
/// A work item that delivers a single bare `BR_*` code to user space.
struct DeliverCode {
    // The `BR_*` code to deliver.
    code: u32,
    // When set, `do_work` consumes the item without writing anything.
    skip: AtomicBool,
}
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for DeliverCode { untracked; }
}
impl DeliverCode {
    fn new(code: u32) -> Self {
        Self {
            code,
            skip: AtomicBool::new(false),
        }
    }
    /// Disable this DeliverCode and make it do nothing.
    ///
    /// This is used instead of removing it from the work list, since `LinkedList::remove` is
    /// unsafe, whereas this method is not.
    fn skip(&self) {
        self.skip.store(true, Ordering::Relaxed);
    }
}
impl DeliverToRead for DeliverCode {
    fn do_work(
        self: DArc<Self>,
        _thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool> {
        // A skipped item is consumed without writing anything to user space.
        if !self.skip.load(Ordering::Relaxed) {
            writer.write_code(self.code)?;
        }
        Ok(true)
    }
    // Nothing to clean up; the code is just dropped.
    fn cancel(self: DArc<Self>) {}
    fn on_thread_selected(&self, _thread: &Thread) {}
    fn should_sync_wakeup(&self) -> bool {
        false
    }
    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
        seq_print!(m, "{}", prefix);
        if self.skip.load(Ordering::Relaxed) {
            seq_print!(m, "(skipped) ");
        }
        if self.code == defs::BR_TRANSACTION_COMPLETE {
            seq_print!(m, "transaction complete\n");
        } else {
            seq_print!(m, "transaction error: {}\n", self.code);
        }
        Ok(())
    }
}
/// Rounds `value` up to the next multiple of the pointer width
/// (`size_of::<usize>()`); already-aligned values are returned unchanged.
const fn ptr_align(value: usize) -> usize {
    let align = core::mem::size_of::<usize>();
    (value + align - 1) & !(align - 1)
}
// SAFETY: We call register in `init`.
static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() };

/// The Rust Binder driver module.
struct BinderModule {}

impl kernel::Module for BinderModule {
    fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
        // SAFETY: The module initializer never runs twice, so we only call this once.
        unsafe { crate::context::CONTEXTS.init() };
        // SAFETY: This just accesses global booleans.
        unsafe {
            extern "C" {
                static mut binder_use_rust: i32;
                fn unload_binder() -> i32;
            }
            // The C Binder driver was selected at boot: register nothing and
            // stay loaded as an inert module.
            if binder_use_rust == 0 {
                return Ok(Self {});
            }
            // Ask the C driver to get out of the way; if it refuses, also stay
            // inert rather than fighting over the devices.
            if unload_binder() != 0 {
                pr_err!("Failed to unload C Binder.");
                return Ok(Self {});
            }
        }
        BINDER_SHRINKER.register(kernel::c_str!("android-binder"))?;
        // SAFETY: The module is being loaded, so we can initialize binderfs.
        unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? };
        Ok(Self {})
    }
}
/// Makes the inner type Sync.
#[repr(transparent)]
pub struct AssertSync<T>(T);
// SAFETY: Used only to insert `file_operations` into a global, which is safe.
unsafe impl<T> Sync for AssertSync<T> {}

/// File operations that rust_binderfs.c can use.
#[no_mangle]
#[used]
pub static rust_binder_fops: AssertSync<kernel::bindings::file_operations> = {
    // SAFETY: All zeroes is safe for the `file_operations` type.
    let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() };
    let ops = kernel::bindings::file_operations {
        owner: THIS_MODULE.as_ptr(),
        poll: Some(rust_binder_poll),
        unlocked_ioctl: Some(rust_binder_unlocked_ioctl),
        compat_ioctl: Some(rust_binder_compat_ioctl),
        mmap: Some(rust_binder_mmap),
        open: Some(rust_binder_open),
        release: Some(rust_binder_release),
        flush: Some(rust_binder_flush),
        // All remaining callbacks stay NULL (default VFS behavior).
        ..zeroed_ops
    };
    AssertSync(ops)
};
/// Creates a new binder context on behalf of binderfs.
///
/// Returns an owned, foreign pointer to the context, or NULL on failure.
///
/// # Safety
///
/// `name` must point to a valid NUL-terminated C string.
#[no_mangle]
unsafe extern "C" fn rust_binder_new_context(
    name: *const core::ffi::c_char,
) -> *mut core::ffi::c_void {
    // SAFETY: The caller will always provide a valid c string here.
    let name = unsafe { kernel::str::CStr::from_char_ptr(name) };
    match Context::new(name) {
        Ok(ctx) => Arc::into_foreign(ctx).cast_mut(),
        Err(_err) => core::ptr::null_mut(),
    }
}

/// Deregisters and drops a context created by [`rust_binder_new_context`].
///
/// A NULL `device` is ignored (binder-control has no context).
///
/// # Safety
///
/// `device` must be NULL or a pointer returned by `rust_binder_new_context`
/// that has not been passed to this function before.
#[no_mangle]
unsafe extern "C" fn rust_binder_remove_context(device: *mut core::ffi::c_void) {
    if !device.is_null() {
        // SAFETY: The caller ensures that the `device` pointer came from a previous call to
        // `rust_binder_new_device`.
        let ctx = unsafe { Arc::<Context>::from_foreign(device) };
        ctx.deregister();
        drop(ctx);
    }
}
/// Implements the `open` file operation for binder device nodes: creates a
/// [`Process`], stores it in `private_data`, and creates the per-pid log file.
unsafe extern "C" fn rust_binder_open(
    inode: *mut bindings::inode,
    file_ptr: *mut bindings::file,
) -> core::ffi::c_int {
    // SAFETY: The `rust_binderfs.c` file ensures that `i_private` is set to a
    // `struct binder_device`.
    let device = unsafe { (*inode).i_private } as *const binderfs::binder_device;
    assert!(!device.is_null());
    // SAFETY: The `rust_binderfs.c` file ensures that `device->ctx` holds a binder context when
    // using the rust binder fops.
    let ctx = unsafe { Arc::<Context>::borrow((*device).ctx) };
    // SAFETY: The caller provides a valid file pointer to a new `struct file`.
    let file = unsafe { File::from_raw_file(file_ptr) };
    let process = match Process::open(ctx, file) {
        Ok(process) => process,
        Err(err) => return err.to_errno(),
    };
    // Keep the per-pid log file alive for as long as the process exists.
    // SAFETY: This is an `inode` for a newly created binder file.
    match unsafe { BinderfsProcFile::new(inode, process.task.pid()) } {
        Ok(Some(file)) => process.inner.lock().binderfs_file = Some(file),
        Ok(None) => { /* pid already exists */ }
        Err(err) => return err.to_errno(),
    }
    // SAFETY: This file is associated with Rust binder, so we own the `private_data` field.
    unsafe { (*file_ptr).private_data = process.into_foreign().cast_mut() };
    0
}
/// Implements the `release` file operation: takes back ownership of the
/// [`Process`] stored in `private_data` and tears it down.
unsafe extern "C" fn rust_binder_release(
    _inode: *mut bindings::inode,
    file: *mut bindings::file,
) -> core::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let process = unsafe { Arc::<Process>::from_foreign((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    let file = unsafe { File::from_raw_file(file) };
    Process::release(process, file);
    0
}
/// Implements the `compat_ioctl` file operation (32-bit user space on a
/// 64-bit kernel); forwards to [`Process::compat_ioctl`].
unsafe extern "C" fn rust_binder_compat_ioctl(
    file: *mut bindings::file,
    cmd: core::ffi::c_uint,
    arg: core::ffi::c_ulong,
) -> core::ffi::c_long {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::compat_ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) {
        Ok(()) => 0,
        Err(err) => err.to_errno().into(),
    }
}

/// Implements the `unlocked_ioctl` file operation; forwards to [`Process::ioctl`].
unsafe extern "C" fn rust_binder_unlocked_ioctl(
    file: *mut bindings::file,
    cmd: core::ffi::c_uint,
    arg: core::ffi::c_ulong,
) -> core::ffi::c_long {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) {
        Ok(()) => 0,
        Err(err) => err.to_errno().into(),
    }
}
/// Implements the `mmap` file operation; forwards to [`Process::mmap`].
unsafe extern "C" fn rust_binder_mmap(
    file: *mut bindings::file,
    vma: *mut bindings::vm_area_struct,
) -> core::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the vma is valid.
    let area = unsafe { kernel::mm::virt::VmAreaNew::from_raw(vma) };
    // SAFETY: The caller ensures that the file is valid.
    match Process::mmap(f, unsafe { File::from_raw_file(file) }, area) {
        Ok(()) => 0,
        Err(err) => err.to_errno(),
    }
}

/// Implements the `poll` file operation; forwards to [`Process::poll`].
/// Errors are collapsed into `POLLERR` since poll cannot return an errno.
unsafe extern "C" fn rust_binder_poll(
    file: *mut bindings::file,
    wait: *mut bindings::poll_table_struct,
) -> bindings::__poll_t {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    // SAFETY: The caller ensures that the file is valid.
    let fileref = unsafe { File::from_raw_file(file) };
    // SAFETY: The caller ensures that the `PollTable` is valid.
    match Process::poll(f, fileref, unsafe { PollTable::from_ptr(wait) }) {
        Ok(v) => v,
        Err(_) => bindings::POLLERR,
    }
}
/// Implements the `flush` file operation; forwards to [`Process::flush`].
unsafe extern "C" fn rust_binder_flush(
    file: *mut bindings::file,
    _id: bindings::fl_owner_t,
) -> core::ffi::c_int {
    // SAFETY: We previously set `private_data` in `rust_binder_open`.
    let f = unsafe { Arc::<Process>::borrow((*file).private_data) };
    match Process::flush(f) {
        Ok(()) => 0,
        Err(err) => err.to_errno(),
    }
}
/// seq_file `show` callback for the binderfs `stats` log file.
/// Failures are reported into the seq_file itself; the callback returns 0.
#[no_mangle]
unsafe extern "C" fn rust_binder_stats_show(
    ptr: *mut seq_file,
    _: *mut core::ffi::c_void,
) -> core::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_stats_show_impl(m) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}

/// seq_file `show` callback for the binderfs `state` log file.
#[no_mangle]
unsafe extern "C" fn rust_binder_state_show(
    ptr: *mut seq_file,
    _: *mut core::ffi::c_void,
) -> core::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_state_show_impl(m) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}

/// seq_file `show` callback for the per-pid `binder_logs/<pid>` file.
#[no_mangle]
unsafe extern "C" fn rust_binder_proc_show(
    ptr: *mut seq_file,
    _: *mut core::ffi::c_void,
) -> core::ffi::c_int {
    // The pid was stashed in the seq_file's private pointer by rust_binderfs.c.
    // SAFETY: Accessing the private field of `seq_file` is okay.
    let pid = (unsafe { (*ptr).private }) as usize as Pid;
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_proc_show_impl(m, pid) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}

/// seq_file `show` callback for the binderfs `transactions` log file.
#[no_mangle]
unsafe extern "C" fn rust_binder_transactions_show(
    ptr: *mut seq_file,
    _: *mut core::ffi::c_void,
) -> core::ffi::c_int {
    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
    // this method is called.
    let m = unsafe { SeqFile::from_raw(ptr) };
    if let Err(err) = rust_binder_transactions_show_impl(m) {
        seq_print!(m, "failed to generate state: {:?}\n", err);
    }
    0
}
/// Prints every process of every context, without the verbose per-node state
/// (the `false` argument to `debug_print`).
fn rust_binder_transactions_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder transactions:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print(m, &ctx, false)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the global BC/BR statistics followed by the per-process statistics
/// of every process in every context.
fn rust_binder_stats_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder stats:\n");
    stats::GLOBAL_STATS.debug_print("", m);
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print_stats(m, &ctx)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the full state of every process in every context, including
/// per-node detail (the `true` argument to `debug_print`).
fn rust_binder_state_show_impl(m: &SeqFile) -> Result<()> {
    seq_print!(m, "binder state:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_all_procs()? {
            proc.debug_print(m, &ctx, true)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Prints the full state of the processes matching `pid`, across all contexts.
fn rust_binder_proc_show_impl(m: &SeqFile, pid: Pid) -> Result<()> {
    seq_print!(m, "binder proc state:\n");
    for ctx in context::get_all_contexts()? {
        for proc in ctx.get_procs_with_pid(pid)? {
            proc.debug_print(m, &ctx, true)?;
            seq_print!(m, "\n");
        }
    }
    Ok(())
}
/// Owns the per-pid log file (`binder_logs/<pid>`) created in binderfs;
/// dropping this removes the file.
struct BinderfsProcFile(NonNull<bindings::dentry>);
// SAFETY: This type only holds a dentry pointer, and the file may be removed
// (dropped) from any thread, not just the one that created it.
unsafe impl Send for BinderfsProcFile {}
impl BinderfsProcFile {
    /// # Safety
    ///
    /// Takes an inode from a newly created binder file.
    unsafe fn new(nodp: *mut bindings::inode, pid: i32) -> Result<Option<Self>> {
        // SAFETY: The caller passes an `inode` for a newly created binder file.
        let dentry = unsafe { binderfs::rust_binderfs_create_proc_file(nodp, pid) };
        match kernel::error::from_err_ptr(dentry) {
            // NULL (no binder_logs directory on this mount) maps to `None`.
            Ok(dentry) => Ok(NonNull::new(dentry).map(Self)),
            // A log file for this pid already exists; not an error.
            Err(err) if err == EEXIST => Ok(None),
            Err(err) => Err(err),
        }
    }
}
impl Drop for BinderfsProcFile {
    fn drop(&mut self) {
        // SAFETY: This is a dentry obtained from `rust_binderfs_create_proc_file`
        // that has not been removed yet, so we may remove it now.
        unsafe { binderfs::rust_binderfs_remove_file(self.0.as_ptr()) };
    }
}

View File

@@ -0,0 +1,53 @@
// SPDX-License-Identifier: GPL-2.0-only
/* rust_binder_events.c
*
* Rust Binder tracepoints.
*
* Copyright 2024 Google LLC
*/
/*
 * Human-readable names for the BC_* binder commands.
 * NOTE(review): presumably indexed by the command number, so the order must
 * match the BC_* definitions in uapi/linux/android/binder.h -- confirm when
 * adding entries.
 */
const char * const binder_command_strings[] = {
        "BC_TRANSACTION",
        "BC_REPLY",
        "BC_ACQUIRE_RESULT",
        "BC_FREE_BUFFER",
        "BC_INCREFS",
        "BC_ACQUIRE",
        "BC_RELEASE",
        "BC_DECREFS",
        "BC_INCREFS_DONE",
        "BC_ACQUIRE_DONE",
        "BC_ATTEMPT_ACQUIRE",
        "BC_REGISTER_LOOPER",
        "BC_ENTER_LOOPER",
        "BC_EXIT_LOOPER",
        "BC_REQUEST_DEATH_NOTIFICATION",
        "BC_CLEAR_DEATH_NOTIFICATION",
        "BC_DEAD_BINDER_DONE",
        "BC_TRANSACTION_SG",
        "BC_REPLY_SG",
};
/*
 * Human-readable names for the BR_* binder return codes.
 * NOTE(review): presumably indexed by the return code number, so the order
 * must match the BR_* definitions in uapi/linux/android/binder.h -- confirm
 * when adding entries.
 */
const char * const binder_return_strings[] = {
        "BR_ERROR",
        "BR_OK",
        "BR_TRANSACTION",
        "BR_REPLY",
        "BR_ACQUIRE_RESULT",
        "BR_DEAD_REPLY",
        "BR_TRANSACTION_COMPLETE",
        "BR_INCREFS",
        "BR_ACQUIRE",
        "BR_RELEASE",
        "BR_DECREFS",
        "BR_ATTEMPT_ACQUIRE",
        "BR_NOOP",
        "BR_SPAWN_LOOPER",
        "BR_FINISHED",
        "BR_DEAD_BINDER",
        "BR_CLEAR_DEATH_NOTIFICATION_DONE",
        "BR_FAILED_REPLY",
        "BR_FROZEN_REPLY",
        "BR_ONEWAY_SPAM_SUSPECT",
        "BR_TRANSACTION_PENDING_FROZEN"
};

View File

@@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0-only
/* rust_binder_events.c
*
* Rust Binder vendorhooks.
*
* Copyright 2024 Google LLC
*/
#include <linux/rust_binder.h>
#define CREATE_TRACE_POINTS
#define CREATE_RUST_TRACE_POINTS
#include <trace/hooks/vendor_hooks.h>
#include <linux/tracepoint.h>
#include <trace/hooks/rust_binder.h>
/*
 * Export tracepoints that act as a bare tracehook (i.e.: have no trace event
 * associated with them) to allow external vendor-hook modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rust_binder_set_priority);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rust_binder_restore_priority);

View File

@@ -0,0 +1,87 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* rust_binder_internal.h
*
* This file contains internal data structures used by Rust Binder. Mostly,
* these are type definitions used only by binderfs or things that Rust Binder
* define and export to binderfs.
*
* It does not include things exported by binderfs to Rust Binder since this
* file is not included as input to bindgen.
*
* Copyright (C) 2024 Google LLC.
*/
#ifndef _LINUX_RUST_BINDER_INTERNAL_H
#define _LINUX_RUST_BINDER_INTERNAL_H
#define RUST_BINDERFS_SUPER_MAGIC 0x6c6f6f71
#include <linux/seq_file.h>
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
/*
 * The internal data types in the Rust Binder driver are opaque to C, so we use
 * void pointer typedefs for these types.
 */
typedef void *rust_binder_context;

/**
 * struct binder_device - information about a binder device node
 * @minor: the minor number used by this device
 * @ctx: the Rust Context used by this device, or null for binder-control
 *
 * This is used as the private data for files directly in binderfs, but not
 * files in the binder_logs subdirectory. This struct owns a refcount on `ctx`
 * and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is
 * null.
 */
struct binder_device {
        int minor;
        rust_binder_context ctx;
};

/* seq_file show callbacks implemented on the Rust side (lib.rs). */
int rust_binder_stats_show(struct seq_file *m, void *unused);
int rust_binder_state_show(struct seq_file *m, void *unused);
int rust_binder_transactions_show(struct seq_file *m, void *unused);
/* For rust_binder_proc_show, the target pid is smuggled through @pid. */
int rust_binder_proc_show(struct seq_file *m, void *pid);

/* File operations for binder device nodes, defined in Rust. */
extern const struct file_operations rust_binder_fops;

/*
 * Create a new Rust binder context named @name; returns NULL on failure.
 * Contexts are destroyed with rust_binder_remove_context(), which ignores a
 * NULL argument.
 */
rust_binder_context rust_binder_new_context(char *name);
void rust_binder_remove_context(rust_binder_context device);

/**
 * binderfs_mount_opts - mount options for binderfs
 * @max: maximum number of allocatable binderfs binder devices
 * @stats_mode: enable binder stats in binderfs.
 */
struct binderfs_mount_opts {
        int max;
        int stats_mode;
};

/**
 * binderfs_info - information about a binderfs mount
 * @ipc_ns: The ipc namespace the binderfs mount belongs to.
 * @control_dentry: This records the dentry of this binderfs mount
 * binder-control device.
 * @root_uid: uid that needs to be used when a new binder device is
 * created.
 * @root_gid: gid that needs to be used when a new binder device is
 * created.
 * @mount_opts: The mount options in use.
 * @device_count: The current number of allocated binder devices.
 * @proc_log_dir: Pointer to the directory dentry containing process-specific
 * logs.
 */
struct binderfs_info {
        struct ipc_namespace *ipc_ns;
        struct dentry *control_dentry;
        kuid_t root_uid;
        kgid_t root_gid;
        struct binderfs_mount_opts mount_opts;
        int device_count;
        struct dentry *proc_log_dir;
};

#endif /* _LINUX_RUST_BINDER_INTERNAL_H */

View File

@@ -0,0 +1,849 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler_types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/ipc_namespace.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/namei.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock_types.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/user_namespace.h>
#include <linux/xarray.h>
#include <uapi/asm-generic/errno-base.h>
#include <uapi/linux/android/binder.h>
#include <uapi/linux/android/binderfs.h>
#include "rust_binder_internal.h"
#define FIRST_INODE 1
#define SECOND_INODE 2
#define INODE_OFFSET 3
#define BINDERFS_MAX_MINOR (1U << MINORBITS)
/* Ensure that the initial ipc namespace always has devices available. */
#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4)

/* Generate *_fops wrappers for the Rust-implemented seq_file show callbacks. */
DEFINE_SHOW_ATTRIBUTE(rust_binder_stats);
DEFINE_SHOW_ATTRIBUTE(rust_binder_state);
DEFINE_SHOW_ATTRIBUTE(rust_binder_transactions);
DEFINE_SHOW_ATTRIBUTE(rust_binder_proc);

/* Device names to create at init, read-only module parameter "rust_devices". */
char *rust_binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
module_param_named(rust_devices, rust_binder_devices_param, charp, 0444);

static dev_t binderfs_dev;
/* Protects binderfs_minors and the per-mount device_count. */
static DEFINE_MUTEX(binderfs_minors_mutex);
static DEFINE_IDA(binderfs_minors);

enum binderfs_param {
        Opt_max,
        Opt_stats_mode,
};

enum binderfs_stats_mode {
        binderfs_stats_mode_unset,
        binderfs_stats_mode_global,
};

/* Feature flags exposed read-only under the "features" directory. */
struct binder_features {
        bool oneway_spam_detection;
        bool extended_error;
};

static const struct constant_table binderfs_param_stats[] = {
        { "global", binderfs_stats_mode_global },
        {}
};

static const struct fs_parameter_spec binderfs_fs_parameters[] = {
        fsparam_u32("max", Opt_max),
        fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats),
        {}
};

static struct binder_features binder_features = {
        .oneway_spam_detection = true,
        .extended_error = true,
};

/* Returns the binderfs mount info stored in the super block. */
static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb)
{
        return sb->s_fs_info;
}
/**
 * is_rust_binderfs_device - check whether an inode belongs to a Rust binderfs
 *                           mount
 * @inode: the inode to check
 *
 * Return: true iff the inode's super block carries the Rust binderfs magic.
 */
bool is_rust_binderfs_device(const struct inode *inode)
{
        return inode->i_sb->s_magic == RUST_BINDERFS_SUPER_MAGIC;
}
/**
 * binderfs_binder_device_create - allocate inode from super block of a
 * binderfs mount
 * @ref_inode: inode from which the super block will be taken
 * @userp: buffer to copy information about new device for userspace to
 * @req: struct binderfs_device as copied from userspace
 *
 * This function allocates a new binder_device and reserves a new minor
 * number for it.
 * Minor numbers are limited and tracked globally in binderfs_minors. The
 * function will stash a struct binder_device for the specific binder
 * device in i_private of the inode.
 * It will go on to allocate a new inode from the super block of the
 * filesystem mount, stash a struct binder_device in its i_private field
 * and attach a dentry to that inode.
 *
 * Return: 0 on success, negative errno on failure
 */
static int binderfs_binder_device_create(struct inode *ref_inode,
                                         struct binderfs_device __user *userp,
                                         struct binderfs_device *req)
{
        int minor, ret;
        struct dentry *dentry, *root;
        struct binder_device *device = NULL;
        rust_binder_context ctx = NULL;
        struct inode *inode = NULL;
        struct super_block *sb = ref_inode->i_sb;
        struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
        bool use_reserve = (info->ipc_ns == &init_ipc_ns);
#else
        bool use_reserve = true;
#endif

        /* Reserve new minor number for the new device. */
        mutex_lock(&binderfs_minors_mutex);
        if (++info->device_count <= info->mount_opts.max)
                minor = ida_alloc_max(&binderfs_minors,
                                      use_reserve ? BINDERFS_MAX_MINOR :
                                                    BINDERFS_MAX_MINOR_CAPPED,
                                      GFP_KERNEL);
        else
                minor = -ENOSPC;
        if (minor < 0) {
                --info->device_count;
                mutex_unlock(&binderfs_minors_mutex);
                return minor;
        }
        mutex_unlock(&binderfs_minors_mutex);

        ret = -ENOMEM;
        device = kzalloc(sizeof(*device), GFP_KERNEL);
        if (!device)
                goto err;

        req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */
        /* Create the Rust-side context that backs this device node. */
        ctx = rust_binder_new_context(req->name);
        if (!ctx)
                goto err;

        inode = new_inode(sb);
        if (!inode)
                goto err;

        inode->i_ino = minor + INODE_OFFSET;
        simple_inode_init_ts(inode);
        init_special_inode(inode, S_IFCHR | 0600,
                           MKDEV(MAJOR(binderfs_dev), minor));
        inode->i_fop = &rust_binder_fops;
        inode->i_uid = info->root_uid;
        inode->i_gid = info->root_gid;

        /* Tell userspace which device node it got. */
        req->major = MAJOR(binderfs_dev);
        req->minor = minor;
        device->ctx = ctx;
        device->minor = minor;

        if (userp && copy_to_user(userp, req, sizeof(*req))) {
                ret = -EFAULT;
                goto err;
        }

        root = sb->s_root;
        inode_lock(d_inode(root));

        /* look it up */
        dentry = lookup_one_len(req->name, root, strlen(req->name));
        if (IS_ERR(dentry)) {
                inode_unlock(d_inode(root));
                ret = PTR_ERR(dentry);
                goto err;
        }

        if (d_really_is_positive(dentry)) {
                /* already exists */
                dput(dentry);
                inode_unlock(d_inode(root));
                ret = -EEXIST;
                goto err;
        }

        /* From here the inode owns the binder_device (freed in evict_inode). */
        inode->i_private = device;
        d_instantiate(dentry, inode);
        fsnotify_create(root->d_inode, dentry);
        inode_unlock(d_inode(root));

        return 0;

err:
        /*
         * All cleanup helpers below tolerate the not-yet-initialized cases:
         * kfree(NULL), rust_binder_remove_context(NULL) and iput(NULL) are
         * no-ops, and `minor` is always reserved by the time we get here.
         */
        kfree(device);
        rust_binder_remove_context(ctx);
        mutex_lock(&binderfs_minors_mutex);
        --info->device_count;
        ida_free(&binderfs_minors, minor);
        mutex_unlock(&binderfs_minors_mutex);
        iput(inode);

        return ret;
}
/**
 * binder_ctl_ioctl - handle binder device node allocation requests
 *
 * The request handler for the binder-control device. All requests operate on
 * the binderfs mount the binder-control device resides in:
 * - BINDER_CTL_ADD
 *   Allocate a new binder device.
 *
 * Return: %0 on success, negative errno on failure.
 */
static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        int ret = -EINVAL; /* returned for any unknown command */
        struct inode *inode = file_inode(file);
        struct binderfs_device __user *device = (struct binderfs_device __user *)arg;
        struct binderfs_device device_req;

        switch (cmd) {
        case BINDER_CTL_ADD:
                ret = copy_from_user(&device_req, device, sizeof(device_req));
                if (ret) {
                        ret = -EFAULT;
                        break;
                }

                ret = binderfs_binder_device_create(inode, device, &device_req);
                break;
        default:
                break;
        }

        return ret;
}
/*
 * Called when a binderfs inode is evicted: releases the reserved minor number
 * and the Rust context owned by the device node, if any.
 */
static void binderfs_evict_inode(struct inode *inode)
{
        struct binder_device *device = inode->i_private;
        struct binderfs_info *info = BINDERFS_SB(inode->i_sb);

        clear_inode(inode);

        /* Only char-device nodes carry a binder_device in i_private. */
        if (!S_ISCHR(inode->i_mode) || !device)
                return;

        mutex_lock(&binderfs_minors_mutex);
        --info->device_count;
        ida_free(&binderfs_minors, device->minor);
        mutex_unlock(&binderfs_minors_mutex);

        /* ctx is null for binder-control, but this function ignores null pointers */
        rust_binder_remove_context(device->ctx);

        kfree(device);
}
/*
 * Parse a single binderfs mount parameter ("max" or "stats") into the
 * fs_context's private binderfs_mount_opts.
 */
static int binderfs_fs_context_parse_param(struct fs_context *fc,
                                           struct fs_parameter *param)
{
        int opt;
        struct binderfs_mount_opts *ctx = fc->fs_private;
        struct fs_parse_result result;

        opt = fs_parse(fc, binderfs_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_max:
                if (result.uint_32 > BINDERFS_MAX_MINOR)
                        return invalfc(fc, "Bad value for '%s'", param->key);
                ctx->max = result.uint_32;
                break;
        case Opt_stats_mode:
                /* Enabling stats exposes driver internals; restrict it. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                ctx->stats_mode = result.uint_32;
                break;
        default:
                return invalfc(fc, "Unsupported parameter '%s'", param->key);
        }

        return 0;
}
/*
 * Handle remount: "max" may change, but the stats mode is fixed at mount time.
 * NOTE(review): the stats_mode assignment below is a no-op after the equality
 * check above; kept as-is to match the C binderfs implementation.
 */
static int binderfs_fs_context_reconfigure(struct fs_context *fc)
{
        struct binderfs_mount_opts *ctx = fc->fs_private;
        struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb);

        if (info->mount_opts.stats_mode != ctx->stats_mode)
                return invalfc(fc, "Binderfs stats mode cannot be changed during a remount");

        info->mount_opts.stats_mode = ctx->stats_mode;
        info->mount_opts.max = ctx->max;
        return 0;
}
/* Emit the active mount options for /proc/mounts et al. */
static int binderfs_show_options(struct seq_file *seq, struct dentry *root)
{
        struct binderfs_info *info = BINDERFS_SB(root->d_sb);

        if (info->mount_opts.max <= BINDERFS_MAX_MINOR)
                seq_printf(seq, ",max=%d", info->mount_opts.max);

        switch (info->mount_opts.stats_mode) {
        case binderfs_stats_mode_unset:
                /* default: print nothing */
                break;
        case binderfs_stats_mode_global:
                seq_printf(seq, ",stats=global");
                break;
        }

        return 0;
}
static const struct super_operations binderfs_super_ops = {
        .evict_inode    = binderfs_evict_inode,
        .show_options   = binderfs_show_options,
        .statfs         = simple_statfs,
};

/* The binder-control node must never be renamed or unlinked. */
static inline bool is_binderfs_control_device(const struct dentry *dentry)
{
        struct binderfs_info *info = dentry->d_sb->s_fs_info;

        return info->control_dentry == dentry;
}

/* Allow renaming device nodes, but protect binder-control. */
static int binderfs_rename(struct mnt_idmap *idmap,
                           struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags)
{
        if (is_binderfs_control_device(old_dentry) ||
            is_binderfs_control_device(new_dentry))
                return -EPERM;

        return simple_rename(idmap, old_dir, old_dentry, new_dir,
                             new_dentry, flags);
}

/* Allow unlinking device nodes, but protect binder-control. */
static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
{
        if (is_binderfs_control_device(dentry))
                return -EPERM;

        return simple_unlink(dir, dentry);
}

static const struct file_operations binder_ctl_fops = {
        .owner          = THIS_MODULE,
        .open           = nonseekable_open,
        .unlocked_ioctl = binder_ctl_ioctl,
        .compat_ioctl   = binder_ctl_ioctl,
        .llseek         = noop_llseek,
};
/**
 * binderfs_binder_ctl_create - create a new binder-control device
 * @sb: super block of the binderfs mount
 *
 * This function creates a new binder-control device node in the binderfs mount
 * referred to by @sb.
 *
 * Return: 0 on success, negative errno on failure
 */
static int binderfs_binder_ctl_create(struct super_block *sb)
{
        int minor, ret;
        struct dentry *dentry;
        struct binder_device *device;
        struct inode *inode = NULL;
        struct dentry *root = sb->s_root;
        struct binderfs_info *info = sb->s_fs_info;
#if defined(CONFIG_IPC_NS)
        bool use_reserve = (info->ipc_ns == &init_ipc_ns);
#else
        bool use_reserve = true;
#endif

        device = kzalloc(sizeof(*device), GFP_KERNEL);
        if (!device)
                return -ENOMEM;

        /* If we have already created a binder-control node, return. */
        if (info->control_dentry) {
                ret = 0;
                goto out;
        }

        ret = -ENOMEM;
        inode = new_inode(sb);
        if (!inode)
                goto out;

        /* Reserve a new minor number for the new device. */
        mutex_lock(&binderfs_minors_mutex);
        minor = ida_alloc_max(&binderfs_minors,
                              use_reserve ? BINDERFS_MAX_MINOR :
                                            BINDERFS_MAX_MINOR_CAPPED,
                              GFP_KERNEL);
        mutex_unlock(&binderfs_minors_mutex);
        if (minor < 0) {
                ret = minor;
                goto out;
        }

        inode->i_ino = SECOND_INODE;
        simple_inode_init_ts(inode);
        init_special_inode(inode, S_IFCHR | 0600,
                           MKDEV(MAJOR(binderfs_dev), minor));
        inode->i_fop = &binder_ctl_fops;
        inode->i_uid = info->root_uid;
        inode->i_gid = info->root_gid;

        /* binder-control has no Rust context (ctx stays NULL). */
        device->minor = minor;
        device->ctx = NULL;

        dentry = d_alloc_name(root, "binder-control");
        if (!dentry)
                goto out;

        inode->i_private = device;
        info->control_dentry = dentry;
        d_add(dentry, inode);

        return 0;

out:
        /*
         * NOTE(review): when d_alloc_name() fails, the reserved minor is not
         * returned to binderfs_minors here; this mirrors the C binderfs
         * implementation -- confirm this one-shot leak is intentional.
         */
        kfree(device);
        iput(inode);

        return ret;
}
static const struct inode_operations binderfs_dir_inode_operations = {
        .lookup = simple_lookup,
        .rename = binderfs_rename,
        .unlink = binderfs_unlink,
};

/* Allocate an inode with a unique ino, the given mode and current timestamps. */
static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
{
        struct inode *ret;

        ret = new_inode(sb);
        if (ret) {
                /* Keep generated inos clear of the device-node ino range. */
                ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
                ret->i_mode = mode;
                simple_inode_init_ts(ret);
        }
        return ret;
}
/*
 * Look up (and implicitly allocate) a negative dentry for @name under @parent.
 * The caller must hold the parent inode lock.
 */
static struct dentry *binderfs_create_dentry(struct dentry *parent,
                                             const char *name)
{
        struct dentry *dentry;

        dentry = lookup_one_len(name, parent, strlen(name));
        if (IS_ERR(dentry))
                return dentry;

        /* Return error if the file/dir already exists. */
        if (d_really_is_positive(dentry)) {
                dput(dentry);
                return ERR_PTR(-EEXIST);
        }

        return dentry;
}
/*
 * Remove a file previously created with rust_binderfs_create_file(); called
 * from Rust when a BinderfsProcFile is dropped. Safe to call even if the
 * dentry has already turned negative.
 */
void rust_binderfs_remove_file(struct dentry *dentry)
{
        struct inode *parent_inode;

        parent_inode = d_inode(dentry->d_parent);
        inode_lock(parent_inode);
        if (simple_positive(dentry)) {
                /* Hold an extra ref across unlink so d_delete sees it. */
                dget(dentry);
                simple_unlink(parent_inode, dentry);
                d_delete(dentry);
                dput(dentry);
        }
        inode_unlock(parent_inode);
}
/*
 * Create a read-only regular file named @name under @parent, with @fops as its
 * file operations and @data stashed in i_private.
 *
 * Return: the new dentry on success, an ERR_PTR on failure.
 */
struct dentry *rust_binderfs_create_file(struct dentry *parent, const char *name,
                                         const struct file_operations *fops,
                                         void *data)
{
        struct dentry *dentry;
        struct inode *new_inode, *parent_inode;
        struct super_block *sb;

        parent_inode = d_inode(parent);
        inode_lock(parent_inode);

        dentry = binderfs_create_dentry(parent, name);
        if (IS_ERR(dentry))
                goto out;

        sb = parent_inode->i_sb;
        new_inode = binderfs_make_inode(sb, S_IFREG | 0444);
        if (!new_inode) {
                dput(dentry);
                dentry = ERR_PTR(-ENOMEM);
                goto out;
        }

        new_inode->i_fop = fops;
        new_inode->i_private = data;
        d_instantiate(dentry, new_inode);
        fsnotify_create(parent_inode, dentry);

out:
        inode_unlock(parent_inode);
        return dentry;
}
/**
 * rust_binderfs_create_proc_file - create a per-process log file
 * @nodp: any inode on the binderfs mount (used to reach s_fs_info)
 * @pid:  pid of the process; used as the file name and stashed in i_private
 *
 * Creates "<pid>" under binder_logs/proc. Returns NULL when the mount has
 * no proc log dir (i.e. global stats were not enabled at mount time).
 */
struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid)
{
	struct binderfs_info *info = nodp->i_sb->s_fs_info;
	struct dentry *dir = info->proc_log_dir;
	/* 20 decimal digits covers any 64-bit value, plus the NUL. */
	char strbuf[20 + 1];
	void *data = (void *)(unsigned long) pid;

	if (!dir)
		return NULL;

	snprintf(strbuf, sizeof(strbuf), "%u", pid);
	return rust_binderfs_create_file(dir, strbuf, &rust_binder_proc_fops, data);
}
/**
 * binderfs_create_dir - create a directory in binderfs
 * @parent: dentry of the directory to create the new directory in
 * @name:   name of the new directory
 *
 * Creates @name under @parent with mode 0755 while holding the parent
 * inode lock. Returns the new dentry, or an ERR_PTR on failure.
 */
static struct dentry *binderfs_create_dir(struct dentry *parent,
					  const char *name)
{
	struct dentry *dentry;
	struct inode *new_inode, *parent_inode;
	struct super_block *sb;

	parent_inode = d_inode(parent);
	inode_lock(parent_inode);
	dentry = binderfs_create_dentry(parent, name);
	if (IS_ERR(dentry))
		goto out;
	sb = parent_inode->i_sb;
	new_inode = binderfs_make_inode(sb, S_IFDIR | 0755);
	if (!new_inode) {
		dput(dentry);
		dentry = ERR_PTR(-ENOMEM);
		goto out;
	}
	new_inode->i_fop = &simple_dir_operations;
	new_inode->i_op = &simple_dir_inode_operations;
	/* New empty dir has two links: "." and the entry in the parent. */
	set_nlink(new_inode, 2);
	d_instantiate(dentry, new_inode);
	/* The new directory's ".." adds a link to the parent. */
	inc_nlink(parent_inode);
	fsnotify_mkdir(parent_inode, dentry);
out:
	inode_unlock(parent_inode);
	return dentry;
}
/*
 * seq_file show callback for the files under "features/": prints the
 * boolean flag stored in m->private as "0" or "1".
 */
static int binder_features_show(struct seq_file *m, void *unused)
{
	bool *feature = m->private;

	seq_printf(m, "%d\n", *feature);
	return 0;
}

/* Generates binder_features_fops around binder_features_show(). */
DEFINE_SHOW_ATTRIBUTE(binder_features);
/*
 * Create the "features" directory in the binderfs root and populate it
 * with one read-only file per advertised binder feature, so userspace can
 * probe what this binder implementation supports.
 */
static int init_binder_features(struct super_block *sb)
{
	struct dentry *dentry, *dir;

	dir = binderfs_create_dir(sb->s_root, "features");
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	dentry = rust_binderfs_create_file(dir, "oneway_spam_detection",
					   &binder_features_fops,
					   &binder_features.oneway_spam_detection);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	dentry = rust_binderfs_create_file(dir, "extended_error",
					   &binder_features_fops,
					   &binder_features.extended_error);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	return 0;
}
/*
 * Create the "binder_logs" debug hierarchy (stats, state, transactions and
 * the per-process "proc" directory) in the binderfs root. Only called when
 * the mount was created with stats=global. The proc directory dentry is
 * remembered in binderfs_info so per-process files can be added later.
 */
static int init_binder_logs(struct super_block *sb)
{
	struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir;
	struct binderfs_info *info;
	int ret = 0;

	binder_logs_root_dir = binderfs_create_dir(sb->s_root,
						   "binder_logs");
	if (IS_ERR(binder_logs_root_dir)) {
		ret = PTR_ERR(binder_logs_root_dir);
		goto out;
	}

	dentry = rust_binderfs_create_file(binder_logs_root_dir, "stats",
					   &rust_binder_stats_fops, NULL);
	if (IS_ERR(dentry)) {
		ret = PTR_ERR(dentry);
		goto out;
	}

	dentry = rust_binderfs_create_file(binder_logs_root_dir, "state",
					   &rust_binder_state_fops, NULL);
	if (IS_ERR(dentry)) {
		ret = PTR_ERR(dentry);
		goto out;
	}

	dentry = rust_binderfs_create_file(binder_logs_root_dir, "transactions",
					   &rust_binder_transactions_fops, NULL);
	if (IS_ERR(dentry)) {
		ret = PTR_ERR(dentry);
		goto out;
	}

	proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc");
	if (IS_ERR(proc_log_dir)) {
		ret = PTR_ERR(proc_log_dir);
		goto out;
	}
	info = sb->s_fs_info;
	info->proc_log_dir = proc_log_dir;
out:
	return ret;
}
/*
 * Fill in a new binderfs superblock: set up the root directory, the
 * binder-control device, one device node per name in the module parameter,
 * the "features" directory, and (optionally) the binder_logs hierarchy.
 *
 * On failure the partially constructed super_block is torn down by the
 * caller via binderfs_kill_super().
 */
static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	int ret;
	struct binderfs_info *info;
	struct binderfs_mount_opts *ctx = fc->fs_private;
	struct inode *inode = NULL;
	struct binderfs_device device_info = {};
	const char *name;
	size_t len;

	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;

	/*
	 * The binderfs filesystem can be mounted by userns root in a
	 * non-initial userns. By default such mounts have the SB_I_NODEV flag
	 * set in s_iflags to prevent security issues where userns root can
	 * just create random device nodes via mknod() since it owns the
	 * filesystem mount. But binderfs does not allow to create any files
	 * including devices nodes. The only way to create binder devices nodes
	 * is through the binder-control device which userns root is explicitly
	 * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both
	 * necessary and safe.
	 */
	sb->s_iflags &= ~SB_I_NODEV;
	sb->s_iflags |= SB_I_NOEXEC;
	sb->s_magic = RUST_BINDERFS_SUPER_MAGIC;
	sb->s_op = &binderfs_super_ops;
	sb->s_time_gran = 1;

	sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL);
	if (!sb->s_fs_info)
		return -ENOMEM;
	info = sb->s_fs_info;

	info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);

	/* Map uid/gid 0 into the mounting userns; fall back to global root. */
	info->root_gid = make_kgid(sb->s_user_ns, 0);
	if (!gid_valid(info->root_gid))
		info->root_gid = GLOBAL_ROOT_GID;
	info->root_uid = make_kuid(sb->s_user_ns, 0);
	if (!uid_valid(info->root_uid))
		info->root_uid = GLOBAL_ROOT_UID;
	info->mount_opts.max = ctx->max;
	info->mount_opts.stats_mode = ctx->stats_mode;

	inode = new_inode(sb);
	if (!inode)
		return -ENOMEM;

	inode->i_ino = FIRST_INODE;
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | 0755;
	simple_inode_init_ts(inode);
	inode->i_op = &binderfs_dir_inode_operations;
	set_nlink(inode, 2);

	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;

	ret = binderfs_binder_ctl_create(sb);
	if (ret)
		return ret;

	/* Create one device node per comma-separated name in the parameter. */
	name = rust_binder_devices_param;
	for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
		strscpy(device_info.name, name, len + 1);
		ret = binderfs_binder_device_create(inode, NULL, &device_info);
		if (ret)
			return ret;
		name += len;
		if (*name == ',')
			name++;
	}

	ret = init_binder_features(sb);
	if (ret)
		return ret;

	if (info->mount_opts.stats_mode == binderfs_stats_mode_global)
		return init_binder_logs(sb);

	return 0;
}
/* Every binderfs mount gets its own (nodev) superblock. */
static int binderfs_fs_context_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, binderfs_fill_super);
}
/* Free the mount options allocated in binderfs_init_fs_context(). */
static void binderfs_fs_context_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}
/* fs_context operations wiring the mount API to the handlers above. */
static const struct fs_context_operations binderfs_fs_context_ops = {
	.free = binderfs_fs_context_free,
	.get_tree = binderfs_fs_context_get_tree,
	.parse_param = binderfs_fs_context_parse_param,
	.reconfigure = binderfs_fs_context_reconfigure,
};
/*
 * Allocate per-mount options with their defaults (max minors, stats mode
 * unset) and attach them and our context operations to the fs_context.
 */
static int binderfs_init_fs_context(struct fs_context *fc)
{
	struct binderfs_mount_opts *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;

	ctx->max = BINDERFS_MAX_MINOR;
	ctx->stats_mode = binderfs_stats_mode_unset;

	fc->fs_private = ctx;
	fc->ops = &binderfs_fs_context_ops;
	return 0;
}
/* Tear down a binderfs superblock and release the per-mount info. */
static void binderfs_kill_super(struct super_block *sb)
{
	struct binderfs_info *info = sb->s_fs_info;

	/*
	 * During inode eviction struct binderfs_info is needed.
	 * So first wipe the super_block then free struct binderfs_info.
	 */
	kill_litter_super(sb);

	if (info && info->ipc_ns)
		put_ipc_ns(info->ipc_ns);

	kfree(info);
}
/*
 * The "binder" filesystem type. FS_USERNS_MOUNT allows userns root to
 * mount it (see the comment in binderfs_fill_super() about SB_I_NODEV).
 */
static struct file_system_type binder_fs_type = {
	.name = "binder",
	.init_fs_context = binderfs_init_fs_context,
	.parameters = binderfs_fs_parameters,
	.kill_sb = binderfs_kill_super,
	.fs_flags = FS_USERNS_MOUNT,
};
/*
 * Module-init-time setup for binderfs: validate the default device names,
 * reserve a char-device major for binder devices, and register the
 * filesystem type. Returns 0 on success or a negative errno.
 */
int init_rust_binderfs(void)
{
	int ret;
	const char *name;
	size_t len;

	/* Verify that the default binderfs device names are valid. */
	name = rust_binder_devices_param;
	for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) {
		if (len > BINDERFS_MAX_NAME)
			return -E2BIG;
		name += len;
		if (*name == ',')
			name++;
	}

	/* Allocate new major number for binderfs. */
	ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
				  "rust_binder");
	if (ret)
		return ret;

	ret = register_filesystem(&binder_fs_type);
	if (ret) {
		/* Undo the chrdev reservation if registration fails. */
		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
		return ret;
	}

	return ret;
}

View File

@@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Keep track of statistics for binder_logs.
use crate::defs::*;
use core::sync::atomic::{AtomicU32, Ordering::Relaxed};
use kernel::{ioctl::_IOC_NR, seq_file::SeqFile, seq_print};
/// Number of distinct `BC_*` command words (`BC_REPLY_SG` is the highest-numbered one).
const BC_COUNT: usize = _IOC_NR(BC_REPLY_SG) as usize + 1;
/// Number of distinct `BR_*` return words (`BR_TRANSACTION_PENDING_FROZEN` is the highest).
const BR_COUNT: usize = _IOC_NR(BR_TRANSACTION_PENDING_FROZEN) as usize + 1;

/// Driver-wide counters backing the `binder_logs/stats` output.
pub(crate) static GLOBAL_STATS: BinderStats = BinderStats::new();

/// Counts how many times each binder command (`BC_*`) and return (`BR_*`) code has been seen.
pub(crate) struct BinderStats {
    // Indexed by `_IOC_NR(BC_*)`.
    bc: [AtomicU32; BC_COUNT],
    // Indexed by `_IOC_NR(BR_*)`.
    br: [AtomicU32; BR_COUNT],
}
impl BinderStats {
    /// Creates a zeroed set of counters. `const` so it can initialize a `static`.
    pub(crate) const fn new() -> Self {
        const ZERO: AtomicU32 = AtomicU32::new(0);
        Self {
            bc: [ZERO; BC_COUNT],
            br: [ZERO; BR_COUNT],
        }
    }

    /// Records one occurrence of the given `BC_*` command word. Unknown
    /// command numbers are silently ignored.
    pub(crate) fn inc_bc(&self, bc: u32) {
        match self.bc.get(_IOC_NR(bc) as usize) {
            Some(counter) => {
                counter.fetch_add(1, Relaxed);
            }
            None => {}
        }
    }

    /// Records one occurrence of the given `BR_*` return word. Unknown
    /// return numbers are silently ignored.
    pub(crate) fn inc_br(&self, br: u32) {
        match self.br.get(_IOC_NR(br) as usize) {
            Some(counter) => {
                counter.fetch_add(1, Relaxed);
            }
            None => {}
        }
    }

    /// Prints every non-zero counter to the seq file, one `<prefix><name>: <count>` line each.
    pub(crate) fn debug_print(&self, prefix: &str, m: &SeqFile) {
        for (code, counter) in self.bc.iter().enumerate() {
            let seen = counter.load(Relaxed);
            if seen > 0 {
                seq_print!(m, "{}{}: {}\n", prefix, command_string(code), seen);
            }
        }
        for (code, counter) in self.br.iter().enumerate() {
            let seen = counter.load(Relaxed);
            if seen > 0 {
                seq_print!(m, "{}{}: {}\n", prefix, return_string(code), seen);
            }
        }
    }
}
/// Access to the C driver's human-readable names for command/return codes,
/// shared with the C binder implementation via extern statics.
mod strings {
    use core::str::from_utf8_unchecked;
    use kernel::str::CStr;

    extern "C" {
        static binder_command_strings: [*const core::ffi::c_char; super::BC_COUNT];
        static binder_return_strings: [*const core::ffi::c_char; super::BR_COUNT];
    }

    /// Returns the name of the `BC_*` command with `_IOC_NR` number `i`.
    /// Panics if `i` is out of range (callers index by enumerating the stats arrays).
    pub(super) fn command_string(i: usize) -> &'static str {
        // SAFETY: Accessing `binder_command_strings` is always safe.
        let c_str_ptr = unsafe { binder_command_strings[i] };
        // SAFETY: The `binder_command_strings` array only contains nul-terminated strings.
        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
        // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars.
        unsafe { from_utf8_unchecked(bytes) }
    }

    /// Returns the name of the `BR_*` return code with `_IOC_NR` number `i`.
    /// Panics if `i` is out of range (callers index by enumerating the stats arrays).
    pub(super) fn return_string(i: usize) -> &'static str {
        // SAFETY: Accessing `binder_return_strings` is always safe.
        let c_str_ptr = unsafe { binder_return_strings[i] };
        // SAFETY: The `binder_return_strings` array only contains nul-terminated strings.
        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
        // SAFETY: The `binder_return_strings` array only contains strings with ascii-chars.
        unsafe { from_utf8_unchecked(bytes) }
    }
}
use strings::{command_string, return_string};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,109 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
#![allow(unused_variables)]
#![allow(unused_imports)]
use crate::{defs::BinderTransactionDataSg, node::Node, thread::Thread, transaction::Transaction};
use kernel::error::Result;
use kernel::task::{Pid, Task};
use kernel::tracepoint::declare_trace;
use kernel::uapi::flat_binder_object;
use core::ffi::{c_int, c_uint, c_ulong};
// The functions below are no-op stubs: their bodies are empty so all tracing
// compiles away in this configuration (hence the file-level
// `allow(unused_variables)`). Callers throughout the driver invoke them at
// the same points the C driver emits its tracepoints.
#[inline]
pub(crate) fn trace_ioctl(cmd: u32, arg: usize) {}
#[inline]
pub(crate) fn trace_ioctl_done(ret: Result) {}
#[inline]
pub(crate) fn trace_read_done(ret: Result) {}
#[inline]
pub(crate) fn trace_write_done(ret: Result) {}
#[inline]
pub(crate) fn trace_set_priority(thread: &Task, desired_prio: c_int, new_prio: c_int) {}
#[inline]
pub(crate) fn vh_set_priority(t: &Transaction, task: &Task) {}
#[inline]
pub(crate) fn vh_restore_priority(task: &Task) {}
#[inline]
pub(crate) fn trace_wait_for_work(proc_work: bool, transaction_stack: bool, thread_todo: bool) {}
#[inline]
pub(crate) fn trace_transaction(reply: bool, t: &Transaction) {}
#[inline]
pub(crate) fn trace_transaction_received(t: &Transaction) {}
#[inline]
pub(crate) fn trace_transaction_thread_selected(t: &Transaction, th: &Thread) {}
#[inline]
pub(crate) fn trace_transaction_node_send(
    t_debug_id: usize,
    n: &Node,
    orig: &flat_binder_object,
    trans: &flat_binder_object,
) {
}
#[inline]
pub(crate) fn trace_transaction_fd_send(t_debug_id: usize, fd: u32, offset: usize) {}
#[inline]
pub(crate) fn trace_transaction_fd_recv(t_debug_id: usize, fd: u32, offset: usize) {}
#[inline]
pub(crate) fn trace_transaction_alloc_buf(debug_id: usize, data: &BinderTransactionDataSg) {
    // Kept as a cast-only body so the stub matches the real tracepoint's use
    // of the raw pointer; it still compiles to nothing.
    let data = data as *const BinderTransactionDataSg;
}
#[inline]
pub(crate) fn trace_transaction_buffer_release(debug_id: usize) {}
#[inline]
pub(crate) fn trace_transaction_failed_buffer_release(debug_id: usize) {}
#[inline]
pub(crate) fn trace_transaction_update_buffer_release(debug_id: usize) {}
#[inline]
pub(crate) fn trace_update_page_range(pid: Pid, allocate: bool, start: usize, end: usize) {}
// Generates one no-op `trace_<name>(pid, page_index)` wrapper per listed
// name, mirroring the C driver's binder_lru_page tracepoint event class.
macro_rules! define_wrapper_lru_page_class {
    ($(fn $name:ident;)*) => {$(
        kernel::macros::paste! {
            #[inline]
            pub(crate) fn [< trace_ $name >](pid: Pid, page_index: usize) {
            }
        }
    )*}
}

// Page-allocation/LRU tracepoint wrappers (all currently no-ops).
define_wrapper_lru_page_class! {
    fn alloc_lru_start;
    fn alloc_lru_end;
    fn free_lru_start;
    fn free_lru_end;
    fn alloc_page_start;
    fn alloc_page_end;
    fn unmap_user_start;
    fn unmap_user_end;
    fn unmap_kernel_start;
    fn unmap_kernel_end;
}
// No-op stubs for the per-word command/return tracepoints.
#[inline]
pub(crate) fn trace_command(cmd: u32) {}
#[inline]
pub(crate) fn trace_return(ret: u32) {}

View File

@@ -0,0 +1,546 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
use core::sync::atomic::{AtomicBool, Ordering};
use kernel::{
prelude::*,
seq_file::SeqFile,
seq_print,
sync::{Arc, SpinLock},
task::Kuid,
time::{ktime_ms_delta, Ktime},
types::ScopeGuard,
};
use crate::{
allocation::{Allocation, TranslatedFds},
defs::*,
error::{BinderError, BinderResult},
node::{Node, NodeRef},
prio::{self, BinderPriority, PriorityState},
process::{Process, ProcessInner},
ptr_align,
thread::{PushWorkRes, Thread},
BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead,
};
/// A binder transaction or reply in flight between two processes.
#[pin_data(PinnedDrop)]
pub(crate) struct Transaction {
    // Globally unique id used in debug output.
    pub(crate) debug_id: usize,
    // The node being called into; `None` for replies.
    target_node: Option<DArc<Node>>,
    // The transaction this one is stacked on top of, if any.
    pub(crate) from_parent: Option<DArc<Transaction>>,
    // The thread that sent this transaction.
    pub(crate) from: Arc<Thread>,
    // The process this transaction is delivered to.
    pub(crate) to: Arc<Process>,
    // The allocation holding the payload in the target process; taken out
    // (set to `None`) once ownership moves elsewhere.
    #[pin]
    allocation: SpinLock<Option<Allocation>>,
    // Whether this transaction is currently counted in `to`'s outstanding txns.
    is_outstanding: AtomicBool,
    // Ensures `on_thread_selected` applies the priority only once.
    set_priority_called: AtomicBool,
    // Priority the receiving thread should run at.
    priority: BinderPriority,
    // The receiving thread's priority before it was changed for this transaction.
    #[pin]
    saved_priority: SpinLock<BinderPriority>,
    code: u32,
    pub(crate) flags: u32,
    data_size: usize,
    offsets_size: usize,
    // Address of the payload inside the target's address space mapping.
    data_address: usize,
    sender_euid: Kuid,
    // Offset of the security context in the buffer, when requested by the node.
    txn_security_ctx_off: Option<usize>,
    pub(crate) oneway_spam_detected: bool,
    // Timestamp used to report elapsed time in debug output.
    start_time: Ktime,
}
// Let `Transaction` be stored in intrusive lists; `untracked` means the list
// arc is not tracked by the type itself.
kernel::list::impl_list_arc_safe! {
    impl ListArcSafe<0> for Transaction { untracked; }
}
impl Transaction {
    /// Creates a new transaction targeting the node in `node_ref`, copying the payload described
    /// by `tr` from the sender into an allocation owned by the target process.
    pub(crate) fn new(
        node_ref: NodeRef,
        from_parent: Option<DArc<Transaction>>,
        from: &Arc<Thread>,
        tr: &BinderTransactionDataSg,
    ) -> BinderResult<DLArc<Self>> {
        let debug_id = super::next_debug_id();
        let trd = &tr.transaction_data;
        let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0;
        let txn_security_ctx = node_ref.node.flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX != 0;
        let mut txn_security_ctx_off = if txn_security_ctx { Some(0) } else { None };
        let to = node_ref.node.owner.clone();
        // Copy the payload into the target. Only warn when the failure is not the expected
        // "target is dead" case.
        let mut alloc = match from.copy_transaction_data(
            to.clone(),
            tr,
            debug_id,
            allow_fds,
            txn_security_ctx_off.as_mut(),
        ) {
            Ok(alloc) => alloc,
            Err(err) => {
                if !err.is_dead() {
                    pr_warn!("Failure in copy_transaction_data: {:?}", err);
                }
                return Err(err);
            }
        };
        let oneway_spam_detected = alloc.oneway_spam_detected;
        if trd.flags & TF_ONE_WAY != 0 {
            if from_parent.is_some() {
                pr_warn!("Oneway transaction should not be in a transaction stack.");
                return Err(EINVAL.into());
            }
            alloc.set_info_oneway_node(node_ref.node.clone());
        }
        if trd.flags & TF_CLEAR_BUF != 0 {
            alloc.set_info_clear_on_drop();
        }
        let target_node = node_ref.node.clone();
        alloc.set_info_target_node(node_ref);
        let data_address = alloc.ptr;
        // Synchronous transactions inherit the sender's scheduling policy/priority when the
        // policy is one binder supports; otherwise fall back to the target's default.
        let priority =
            if (trd.flags & TF_ONE_WAY == 0) && prio::is_supported_policy(from.task.policy()) {
                BinderPriority {
                    sched_policy: from.task.policy(),
                    prio: from.task.normal_prio(),
                }
            } else {
                from.process.default_priority
            };
        Ok(DTRWrap::arc_pin_init(pin_init!(Transaction {
            debug_id,
            target_node: Some(target_node),
            from_parent,
            sender_euid: from.process.cred.euid(),
            from: from.clone(),
            to,
            code: trd.code,
            flags: trd.flags,
            data_size: trd.data_size as _,
            offsets_size: trd.offsets_size as _,
            data_address,
            allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"),
            is_outstanding: AtomicBool::new(false),
            priority,
            saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
            set_priority_called: AtomicBool::new(false),
            txn_security_ctx_off,
            oneway_spam_detected,
            start_time: Ktime::ktime_get(),
        }))?)
    }

    /// Creates a reply transaction destined for process `to`. Replies have no target node and are
    /// never part of a transaction stack.
    pub(crate) fn new_reply(
        from: &Arc<Thread>,
        to: Arc<Process>,
        tr: &BinderTransactionDataSg,
        allow_fds: bool,
    ) -> BinderResult<DLArc<Self>> {
        let debug_id = super::next_debug_id();
        let trd = &tr.transaction_data;
        let mut alloc = match from.copy_transaction_data(to.clone(), tr, debug_id, allow_fds, None)
        {
            Ok(alloc) => alloc,
            Err(err) => {
                pr_warn!("Failure in copy_transaction_data: {:?}", err);
                return Err(err);
            }
        };
        let oneway_spam_detected = alloc.oneway_spam_detected;
        if trd.flags & TF_CLEAR_BUF != 0 {
            alloc.set_info_clear_on_drop();
        }
        Ok(DTRWrap::arc_pin_init(pin_init!(Transaction {
            debug_id,
            target_node: None,
            from_parent: None,
            // NOTE(review): `new()` reads the euid from `process.cred`, but here it comes from
            // the task -- confirm this asymmetry is intentional.
            sender_euid: from.process.task.euid(),
            from: from.clone(),
            to,
            code: trd.code,
            flags: trd.flags,
            data_size: trd.data_size as _,
            offsets_size: trd.offsets_size as _,
            data_address: alloc.ptr,
            allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"),
            is_outstanding: AtomicBool::new(false),
            priority: BinderPriority::default(),
            saved_priority <- kernel::new_spinlock!(BinderPriority::default(), "Transaction::saved_priority"),
            set_priority_called: AtomicBool::new(false),
            txn_security_ctx_off: None,
            oneway_spam_detected,
            start_time: Ktime::ktime_get(),
        }))?)
    }

    /// Prints a one-line summary of this transaction for the debug files.
    #[inline(never)]
    pub(crate) fn debug_print_inner(&self, m: &SeqFile, prefix: &str) {
        seq_print!(
            m,
            "{}{}: from {}:{} to {} code {:x} flags {:x} pri {}:{} elapsed {}ms",
            prefix,
            self.debug_id,
            self.from.process.task.pid(),
            self.from.id,
            self.to.task.pid(),
            self.code,
            self.flags,
            self.priority.sched_policy,
            self.priority.prio,
            ktime_ms_delta(Ktime::ktime_get(), self.start_time),
        );
        if let Some(target_node) = &self.target_node {
            seq_print!(m, " node {}", target_node.debug_id);
        }
        seq_print!(m, " size {}:{}\n", self.data_size, self.offsets_size);
    }

    /// Returns the priority the receiving thread had before this transaction changed it.
    pub(crate) fn saved_priority(&self) -> BinderPriority {
        *self.saved_priority.lock()
    }

    /// Determines if the transaction is stacked on top of the given transaction.
    pub(crate) fn is_stacked_on(&self, onext: &Option<DArc<Self>>) -> bool {
        match (&self.from_parent, onext) {
            (None, None) => true,
            (Some(from_parent), Some(next)) => Arc::ptr_eq(from_parent, next),
            _ => false,
        }
    }

    /// Returns a pointer to the next transaction on the transaction stack, if there is one.
    pub(crate) fn clone_next(&self) -> Option<DArc<Self>> {
        Some(self.from_parent.as_ref()?.clone())
    }

    /// Searches in the transaction stack for a thread that belongs to the target process. This is
    /// useful when finding a target for a new transaction: if the node belongs to a process that
    /// is already part of the transaction stack, we reuse the thread.
    fn find_target_thread(&self) -> Option<Arc<Thread>> {
        let mut it = &self.from_parent;
        while let Some(transaction) = it {
            if Arc::ptr_eq(&transaction.from.process, &self.to) {
                return Some(transaction.from.clone());
            }
            it = &transaction.from_parent;
        }
        None
    }

    /// Searches in the transaction stack for a transaction originating at the given thread.
    pub(crate) fn find_from(&self, thread: &Thread) -> Option<DArc<Transaction>> {
        let mut it = &self.from_parent;
        while let Some(transaction) = it {
            if core::ptr::eq(thread, transaction.from.as_ref()) {
                return Some(transaction.clone());
            }
            it = &transaction.from_parent;
        }
        None
    }

    /// Counts this transaction against the target process's outstanding-transaction total.
    pub(crate) fn set_outstanding(&self, to_process: &mut ProcessInner) {
        // No race because this method is only called once.
        if !self.is_outstanding.load(Ordering::Relaxed) {
            self.is_outstanding.store(true, Ordering::Relaxed);
            to_process.add_outstanding_txn();
        }
    }

    /// Decrement `outstanding_txns` in `to` if it hasn't already been decremented.
    fn drop_outstanding_txn(&self) {
        // No race because this is called at most twice, and one of the calls are in the
        // destructor, which is guaranteed to not race with any other operations on the
        // transaction. It also cannot race with `set_outstanding`, since submission happens
        // before delivery.
        if self.is_outstanding.load(Ordering::Relaxed) {
            self.is_outstanding.store(false, Ordering::Relaxed);
            self.to.drop_outstanding_txn();
        }
    }

    /// Submits the transaction to a work queue. Uses a thread if there is one in the transaction
    /// stack, otherwise uses the destination process.
    ///
    /// Not used for replies.
    pub(crate) fn submit(self: DLArc<Self>) -> BinderResult {
        crate::trace::trace_transaction(false, &self);
        // Defined before `process_inner` so that the destructor runs after releasing the lock.
        let mut _t_outdated;
        let oneway = self.flags & TF_ONE_WAY != 0;
        let process = self.to.clone();
        let mut process_inner = process.inner.lock();
        self.set_outstanding(&mut process_inner);
        if oneway {
            if let Some(target_node) = self.target_node.clone() {
                if process_inner.is_frozen {
                    process_inner.async_recv = true;
                    // TF_UPDATE_TXN lets a newer oneway transaction replace a queued outdated
                    // one while the target is frozen.
                    if self.flags & TF_UPDATE_TXN != 0 {
                        if let Some(t_outdated) =
                            target_node.take_outdated_transaction(&self, &mut process_inner)
                        {
                            crate::trace::trace_transaction_update_buffer_release(
                                t_outdated.debug_id,
                            );
                            // Save the transaction to be dropped after locks are released.
                            _t_outdated = t_outdated;
                        }
                    }
                }
                match target_node.submit_oneway(self, &mut process_inner) {
                    Ok(()) => {}
                    Err((err, work)) => {
                        drop(process_inner);
                        // Drop work after releasing process lock.
                        drop(work);
                        return Err(err);
                    }
                }
                // Even on success, report the frozen state so the sender learns about it.
                if process_inner.is_frozen {
                    return Err(BinderError::new_frozen_oneway());
                } else {
                    return Ok(());
                }
            } else {
                pr_err!("Failed to submit oneway transaction to node.");
            }
        }
        if process_inner.is_frozen {
            process_inner.sync_recv = true;
            return Err(BinderError::new_frozen());
        }
        // Prefer a thread already involved in this transaction stack; otherwise let the
        // process pick one.
        let res = if let Some(thread) = self.find_target_thread() {
            match thread.push_work(self) {
                PushWorkRes::Ok => Ok(()),
                PushWorkRes::FailedDead(me) => Err((BinderError::new_dead(), me)),
            }
        } else {
            process_inner.push_work(self)
        };
        drop(process_inner);
        match res {
            Ok(()) => Ok(()),
            Err((err, work)) => {
                // Drop work after releasing process lock.
                drop(work);
                Err(err)
            }
        }
    }

    /// Check whether one oneway transaction can supersede another.
    pub(crate) fn can_replace(&self, old: &Transaction) -> bool {
        if self.from.process.task.pid() != old.from.process.task.pid() {
            return false;
        }
        // Both must have TF_ONE_WAY and TF_UPDATE_TXN set.
        if self.flags & old.flags & (TF_ONE_WAY | TF_UPDATE_TXN) != (TF_ONE_WAY | TF_UPDATE_TXN) {
            return false;
        }
        let target_node_match = match (self.target_node.as_ref(), old.target_node.as_ref()) {
            (None, None) => true,
            (Some(tn1), Some(tn2)) => Arc::ptr_eq(tn1, tn2),
            _ => false,
        };
        self.code == old.code && self.flags == old.flags && target_node_match
    }

    /// Translates the file descriptors embedded in this transaction's buffer into the
    /// receiving process. On failure the allocation is freed eagerly; on success it is
    /// put back for later delivery.
    fn prepare_file_list(&self) -> Result<TranslatedFds> {
        let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?;
        match alloc.translate_fds() {
            Ok(translated) => {
                *self.allocation.lock() = Some(alloc);
                Ok(translated)
            }
            Err(err) => {
                // Free the allocation eagerly.
                drop(alloc);
                Err(err)
            }
        }
    }
}
impl DeliverToRead for Transaction {
    /// Delivers the transaction to the receiving thread by writing a BR_TRANSACTION /
    /// BR_TRANSACTION_SEC_CTX / BR_REPLY work item into its read buffer. Returns
    /// `Ok(false)` so the read loop keeps going after a delivered transaction.
    fn do_work(
        self: DArc<Self>,
        thread: &Thread,
        writer: &mut BinderReturnWriter<'_>,
    ) -> Result<bool> {
        // Until dismissed, any early exit sends BR_FAILED_REPLY back to a synchronous sender.
        let send_failed_reply = ScopeGuard::new(|| {
            if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
                let reply = Err(BR_FAILED_REPLY);
                self.from.deliver_reply(reply, &self);
            }
            self.drop_outstanding_txn();
        });
        // Update thread priority. This only has an effect if the transaction is delivered via the
        // process work list, since the priority has otherwise already been updated.
        self.on_thread_selected(thread);
        let files = if let Ok(list) = self.prepare_file_list() {
            list
        } else {
            // On failure to process the list, we send a reply back to the sender and ignore the
            // transaction on the recipient.
            return Ok(true);
        };
        let mut tr_sec = BinderTransactionDataSecctx::default();
        let tr = tr_sec.tr_data();
        if let Some(target_node) = &self.target_node {
            let (ptr, cookie) = target_node.get_id();
            tr.target.ptr = ptr as _;
            tr.cookie = cookie as _;
        };
        tr.code = self.code;
        tr.flags = self.flags;
        tr.data_size = self.data_size as _;
        tr.data.ptr.buffer = self.data_address as _;
        tr.offsets_size = self.offsets_size as _;
        if tr.offsets_size > 0 {
            // The offsets array follows the data, aligned to pointer size.
            tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size)) as _;
        }
        tr.sender_euid = self.sender_euid.into_uid_in_current_ns();
        tr.sender_pid = 0;
        if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
            // Not a reply and not one-way.
            tr.sender_pid = self.from.process.task.pid_in_current_ns();
        }
        let code = if self.target_node.is_none() {
            BR_REPLY
        } else if self.txn_security_ctx_off.is_some() {
            BR_TRANSACTION_SEC_CTX
        } else {
            BR_TRANSACTION
        };
        // Write the transaction code and data to the user buffer.
        writer.write_code(code)?;
        if let Some(off) = self.txn_security_ctx_off {
            tr_sec.secctx = (self.data_address + off) as u64;
            writer.write_payload(&tr_sec)?;
        } else {
            writer.write_payload(&*tr)?;
        }
        let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?;
        // Dismiss the completion of transaction with a failure. No failure paths are allowed from
        // here on out.
        send_failed_reply.dismiss();
        // Commit files, and set FDs in FDA to be closed on buffer free.
        let close_on_free = files.commit();
        alloc.set_info_close_on_free(close_on_free);
        // It is now the user's responsibility to clear the allocation.
        alloc.keep_alive();
        self.drop_outstanding_txn();
        crate::trace::trace_transaction_received(&self);
        // When this is not a reply and not a oneway transaction, update `current_transaction`. If
        // it's a reply, `current_transaction` has already been updated appropriately.
        if self.target_node.is_some() && tr_sec.transaction_data.flags & TF_ONE_WAY == 0 {
            thread.set_current_transaction(self);
        }
        Ok(false)
    }

    /// Aborts delivery: frees the payload and, for synchronous transactions, informs the
    /// sender that the target died.
    fn cancel(self: DArc<Self>) {
        let allocation = self.allocation.lock().take();
        drop(allocation);
        // If this is not a reply or oneway transaction, then send a dead reply.
        if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 {
            let reply = Err(BR_DEAD_REPLY);
            self.from.deliver_reply(reply, &self);
        }
        self.drop_outstanding_txn();
    }

    /// Applies this transaction's priority to the thread chosen to handle it, saving the
    /// thread's previous priority so it can be restored afterwards. No-op for replies, and
    /// runs at most once per transaction.
    fn on_thread_selected(&self, to_thread: &Thread) {
        // Return immediately if reply.
        let target_node = match self.target_node.as_ref() {
            Some(target_node) => target_node,
            None => return,
        };
        // We only need to do this once.
        if self.set_priority_called.swap(true, Ordering::Relaxed) {
            return;
        }
        crate::trace::trace_transaction_thread_selected(self, to_thread);
        let node_prio = target_node.node_prio();
        let mut desired = self.priority;
        // Nodes that don't inherit real-time demote RT senders to the default policy.
        if !target_node.inherit_rt() && prio::is_rt_policy(desired.sched_policy) {
            desired.prio = prio::DEFAULT_PRIO;
            desired.sched_policy = prio::SCHED_NORMAL;
        }
        if node_prio.prio < self.priority.prio
            || (node_prio.prio == self.priority.prio && node_prio.sched_policy == prio::SCHED_FIFO)
        {
            // In case the minimum priority on the node is
            // higher (lower value), use that priority. If
            // the priority is the same, but the node uses
            // SCHED_FIFO, prefer SCHED_FIFO, since it can
            // run unbounded, unlike SCHED_RR.
            desired = node_prio;
        }
        let mut prio_state = to_thread.prio_lock.lock();
        if prio_state.state == PriorityState::Pending {
            // Task is in the process of changing priorities
            // saving its current values would be incorrect.
            // Instead, save the pending priority and signal
            // the task to abort the priority restore.
            prio_state.state = PriorityState::Abort;
            *self.saved_priority.lock() = prio_state.next;
        } else {
            let task = &*self.to.task;
            let mut saved_priority = self.saved_priority.lock();
            saved_priority.sched_policy = task.policy();
            saved_priority.prio = task.normal_prio();
        }
        drop(prio_state);
        to_thread.set_priority(&desired, self);
    }

    /// Synchronous transactions warrant a synchronous wakeup of the target thread.
    fn should_sync_wakeup(&self) -> bool {
        self.flags & TF_ONE_WAY == 0
    }

    fn debug_print(&self, m: &SeqFile, _prefix: &str, tprefix: &str) -> Result<()> {
        self.debug_print_inner(m, tprefix);
        Ok(())
    }
}
#[pinned_drop]
impl PinnedDrop for Transaction {
    fn drop(self: Pin<&mut Self>) {
        // Release the target's outstanding-transaction count even when the transaction
        // is dropped without ever being delivered.
        self.drop_outstanding_txn();
    }
}