FROMLIST: ring-buffer: Introducing ring-buffer writer
A ring-buffer writer is an entity outside of the kernel (most likely a
firmware or a hypervisor) capable of writing events into a ring-buffer,
following the same format as the tracefs ring-buffer.

To set up the ring-buffer on the kernel side, a description of the pages
(struct trace_page_desc) is necessary. A callback (get_reader_page) must
also be provided; it is called whenever the kernel is done reading the
previous reader page. The writer is expected to keep the meta-page
updated.

Bug: 357781595
Link: https://lore.kernel.org/all/20240911093029.3279154-3-vdonnefort@google.com/
Change-Id: Ib006ff19d79a5b2559b09661fba7bcd426ef8cba
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
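For orientation before the diff, here is a minimal sketch of how a driver
might consume this API. It is not part of the patch: everything prefixed
my_hyp_* (including the my_hyp_call() hypercall wrapper and its IDs) is a
made-up placeholder; only struct trace_page_desc, struct ring_buffer_writer
and the ring_buffer_reader() macro come from this series.

#include <linux/ring_buffer.h>

/* Hypothetical hypercall IDs and wrapper, for illustration only */
enum { MY_HYP_SWAP_READER, MY_HYP_RESET };

static int my_hyp_call(unsigned long id, int cpu)
{
        return 0; /* stub: would trap into the hypervisor */
}

/* Page descriptors shared by the hypervisor (one rb_page_desc per CPU) */
static struct trace_page_desc *my_hyp_pdesc;

/* Called by the kernel once it is done with the previous reader page */
static int my_hyp_get_reader_page(int cpu)
{
        return my_hyp_call(MY_HYP_SWAP_READER, cpu);
}

static int my_hyp_reset(int cpu)
{
        return my_hyp_call(MY_HYP_RESET, cpu);
}

static struct ring_buffer_writer my_hyp_writer = {
        .get_reader_page = my_hyp_get_reader_page,
        .reset           = my_hyp_reset,
};

static struct trace_buffer *my_hyp_attach(void)
{
        my_hyp_writer.pdesc = my_hyp_pdesc;

        /* size is 0: the data pages already exist, described by pdesc */
        return ring_buffer_reader(&my_hyp_writer);
}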
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -83,11 +83,14 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
 void ring_buffer_discard_commit(struct trace_buffer *buffer,
                                 struct ring_buffer_event *event);
 
+struct ring_buffer_writer;
+
 /*
  * size is in bytes for each per CPU buffer.
  */
 struct trace_buffer *
-__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key,
+                    struct ring_buffer_writer *writer);
 
 struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
                                                int order, unsigned long start,
@@ -102,10 +105,10 @@ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
  * ring buffer's lock class separate.
  */
-#define ring_buffer_alloc(size, flags)                  \
-({                                                      \
-        static struct lock_class_key __key;             \
-        __ring_buffer_alloc((size), (flags), &__key);   \
+#define ring_buffer_alloc(size, flags)                  \
+({                                                      \
+        static struct lock_class_key __key;             \
+        __ring_buffer_alloc((size), (flags), &__key, NULL);     \
 })
 
 /*
@@ -248,4 +251,54 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
                     struct vm_area_struct *vma);
 int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
 int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
+
+#define meta_pages_lost(__meta) \
+        ((__meta)->Reserved1)
+#define meta_pages_touched(__meta) \
+        ((__meta)->Reserved2)
+
+struct rb_page_desc {
+        int             cpu;
+        int             nr_page_va; /* exclude the meta page */
+        unsigned long   meta_va;
+        unsigned long   page_va[];
+};
+
+struct trace_page_desc {
+        int             nr_cpus;
+        char            __data[]; /* list of rb_page_desc */
+};
+
+static inline
+struct rb_page_desc *__next_rb_page_desc(struct rb_page_desc *pdesc)
+{
+        size_t len = struct_size(pdesc, page_va, pdesc->nr_page_va);
+
+        return (struct rb_page_desc *)((void *)pdesc + len);
+}
+
+static inline
+struct rb_page_desc *__first_rb_page_desc(struct trace_page_desc *trace_pdesc)
+{
+        return (struct rb_page_desc *)(&trace_pdesc->__data[0]);
+}
+
+#define for_each_rb_page_desc(__pdesc, __cpu, __trace_pdesc)           \
+        for (__pdesc = __first_rb_page_desc(__trace_pdesc), __cpu = 0; \
+             __cpu < (__trace_pdesc)->nr_cpus;                         \
+             __cpu++, __pdesc = __next_rb_page_desc(__pdesc))
+
+struct ring_buffer_writer {
+        struct trace_page_desc  *pdesc;
+        int (*get_reader_page)(int cpu);
+        int (*reset)(int cpu);
+};
+
+int ring_buffer_poll_writer(struct trace_buffer *buffer, int cpu);
+
+#define ring_buffer_reader(writer)                              \
+({                                                              \
+        static struct lock_class_key __key;                     \
+        __ring_buffer_alloc(0, RB_FL_OVERWRITE, &__key, writer);\
+})
 #endif /* _LINUX_RING_BUFFER_H */
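A note on the layout above: the rb_page_desc entries are variably sized and
packed back to back after trace_page_desc, which is why __next_rb_page_desc()
advances by struct_size() instead of indexing an array. The sketch below shows
how a writer side might fill these descriptors; it is not part of the patch,
and my_meta_page_va()/my_data_page_va() are hypothetical helpers returning the
shared pages' addresses.

#include <linux/ring_buffer.h>

extern unsigned long my_meta_page_va(int cpu);            /* hypothetical */
extern unsigned long my_data_page_va(int cpu, int page);  /* hypothetical */

static void my_fill_page_desc(struct trace_page_desc *trace_pdesc,
                              int nr_cpus, int nr_pages)
{
        struct rb_page_desc *pdesc = __first_rb_page_desc(trace_pdesc);
        int cpu, i;

        trace_pdesc->nr_cpus = nr_cpus;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                pdesc->cpu = cpu;
                pdesc->nr_page_va = nr_pages;     /* data pages, meta excluded */
                pdesc->meta_va = my_meta_page_va(cpu);
                for (i = 0; i < nr_pages; i++)
                        pdesc->page_va[i] = my_data_page_va(cpu, i);
                /* step over this CPU's variably sized descriptor */
                pdesc = __next_rb_page_desc(pdesc);
        }
}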
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -517,6 +517,8 @@ struct ring_buffer_per_cpu {
         struct trace_buffer_meta        *meta_page;
         struct ring_buffer_meta         *ring_meta;
 
+        struct ring_buffer_writer       *writer;
+
         /* ring buffer pages to update, > 0 to add, < 0 to remove */
         long                            nr_pages_to_update;
         struct list_head                new_pages; /* new pages to add */
@@ -539,6 +541,8 @@ struct trace_buffer {
 
         struct ring_buffer_per_cpu      **buffers;
 
+        struct ring_buffer_writer       *writer;
+
         struct hlist_node               node;
         u64                             (*clock)(void);
 
@@ -2083,6 +2087,42 @@ free_pages:
         return -ENOMEM;
 }
 
+static struct rb_page_desc *rb_page_desc(struct trace_page_desc *trace_pdesc,
+                                         int cpu)
+{
+        struct rb_page_desc *pdesc;
+        size_t len;
+        int i;
+
+        if (!trace_pdesc)
+                return NULL;
+
+        if (cpu >= trace_pdesc->nr_cpus)
+                return NULL;
+
+        pdesc = __first_rb_page_desc(trace_pdesc);
+        len = struct_size(pdesc, page_va, pdesc->nr_page_va);
+        pdesc = (void *)pdesc + len * cpu;
+
+        if (pdesc->cpu == cpu)
+                return pdesc;
+
+        /* Missing CPUs, need to linear search */
+
+        for_each_rb_page_desc(pdesc, i, trace_pdesc) {
+                if (pdesc->cpu == cpu)
+                        return pdesc;
+        }
+
+        return NULL;
+}
+
+static void *rb_page_desc_page(struct rb_page_desc *pdesc, int page_id)
+{
+        return page_id >= pdesc->nr_page_va ? NULL : (void *)pdesc->page_va[page_id];
+}
+
+
 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
                              unsigned long nr_pages)
 {
@@ -2143,6 +2183,31 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 
         cpu_buffer->reader_page = bpage;
 
+        if (buffer->writer) {
+                struct rb_page_desc *pdesc = rb_page_desc(buffer->writer->pdesc, cpu);
+
+                if (!pdesc)
+                        goto fail_free_reader;
+
+                cpu_buffer->writer = buffer->writer;
+                cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)pdesc->meta_va;
+                cpu_buffer->subbuf_ids = pdesc->page_va;
+                cpu_buffer->nr_pages = pdesc->nr_page_va - 1;
+                atomic_inc(&cpu_buffer->record_disabled);
+                atomic_inc(&cpu_buffer->resize_disabled);
+
+                bpage->page = rb_page_desc_page(pdesc,
+                                                cpu_buffer->meta_page->reader.id);
+                if (!bpage->page)
+                        goto fail_free_reader;
+                /*
+                 * The meta-page can only describe which of the ring-buffer
+                 * pages is the reader. There is no need to init the rest of
+                 * the ring-buffer.
+                 */
+                return cpu_buffer;
+        }
+
         if (buffer->range_addr_start) {
                 /*
                  * Range mapped buffers have the same restrictions as memory
@@ -2220,6 +2285,10 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 
         irq_work_sync(&cpu_buffer->irq_work.work);
 
+        /* ring_buffers with writer set do not own the data pages */
+        if (cpu_buffer->writer)
+                cpu_buffer->reader_page->page = NULL;
+
         free_buffer_page(cpu_buffer->reader_page);
 
         if (head) {
@@ -2241,7 +2310,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
                                          int order, unsigned long start,
                                          unsigned long end,
-                                         struct lock_class_key *key)
+                                         struct lock_class_key *key,
+                                         struct ring_buffer_writer *writer)
 {
         struct trace_buffer *buffer;
         long nr_pages;
@@ -2269,6 +2339,11 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
         buffer->flags = flags;
         buffer->clock = trace_clock_local;
         buffer->reader_lock_key = key;
+        if (writer) {
+                buffer->writer = writer;
+                /* Writing is done externally and never by the kernel */
+                atomic_inc(&buffer->record_disabled);
+        }
 
         init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
         init_waitqueue_head(&buffer->irq_work.waiters);
@@ -2375,10 +2450,11 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
  * drop data when the tail hits the head.
  */
 struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
-                                        struct lock_class_key *key)
+                                        struct lock_class_key *key,
+                                        struct ring_buffer_writer *writer)
 {
         /* Default buffer page size - one system page */
-        return alloc_buffer(size, flags, 0, 0, 0, key);
+        return alloc_buffer(size, flags, 0, 0, 0, key, writer);
 
 }
 EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2402,7 +2478,7 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
                                                unsigned long range_size,
                                                struct lock_class_key *key)
 {
-        return alloc_buffer(size, flags, order, start, start + range_size, key);
+        return alloc_buffer(size, flags, order, start, start + range_size, key, NULL);
 }
 
 /**
@@ -5153,8 +5229,54 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
         }
 }
 
+static bool rb_read_writer_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+        local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+        local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+        local_set(&cpu_buffer->pages_touched, READ_ONCE(meta_pages_touched(cpu_buffer->meta_page)));
+        local_set(&cpu_buffer->pages_lost, READ_ONCE(meta_pages_lost(cpu_buffer->meta_page)));
+        /*
+         * No need to get the "read" field, it can be tracked here as any
+         * reader will have to go through a ring_buffer_per_cpu.
+         */
+
+        return rb_num_of_entries(cpu_buffer);
+}
+
 static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_writer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+        u32 prev_reader;
+
+        if (!rb_read_writer_meta_page(cpu_buffer))
+                return NULL;
+
+        /* More to read on the reader page */
+        if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page))
+                return cpu_buffer->reader_page;
+
+        prev_reader = cpu_buffer->meta_page->reader.id;
+
+        WARN_ON(cpu_buffer->writer->get_reader_page(cpu_buffer->cpu));
+        /* nr_pages doesn't include the reader page */
+        if (cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages) {
+                WARN_ON(1);
+                return NULL;
+        }
+
+        cpu_buffer->reader_page->page =
+                (void *)cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+        cpu_buffer->reader_page->read = 0;
+        cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+        cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+        WARN_ON(prev_reader == cpu_buffer->meta_page->reader.id);
+
+        return cpu_buffer->reader_page;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
         struct buffer_page *reader = NULL;
         unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5324,6 +5446,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         return reader;
 }
 
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+        return cpu_buffer->writer ? __rb_get_reader_page_from_writer(cpu_buffer) :
+                                    __rb_get_reader_page(cpu_buffer);
+}
+
 static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 {
         struct ring_buffer_event *event;
@@ -5728,7 +5857,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
         struct ring_buffer_per_cpu *cpu_buffer;
         struct ring_buffer_iter *iter;
 
-        if (!cpumask_test_cpu(cpu, buffer->cpumask))
+        if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->writer)
                 return NULL;
 
         iter = kzalloc(sizeof(*iter), flags);
@@ -5901,6 +6030,22 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
         struct buffer_page *page;
 
+        if (cpu_buffer->writer) {
+                if (!cpu_buffer->writer->reset)
+                        return;
+
+                cpu_buffer->writer->reset(cpu_buffer->cpu);
+                rb_read_writer_meta_page(cpu_buffer);
+
+                /* Read-related values, not covered by the meta-page */
+                local_set(&cpu_buffer->pages_read, 0);
+                cpu_buffer->read = 0;
+                cpu_buffer->read_bytes = 0;
+                cpu_buffer->last_overrun = 0;
+
+                return;
+        }
+
         rb_head_page_deactivate(cpu_buffer);
 
         cpu_buffer->head_page
@@ -6148,6 +6293,49 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
 
+int ring_buffer_poll_writer(struct trace_buffer *buffer, int cpu)
+{
+        struct ring_buffer_per_cpu *cpu_buffer;
+        unsigned long flags;
+
+        if (cpu != RING_BUFFER_ALL_CPUS) {
+                if (!cpumask_test_cpu(cpu, buffer->cpumask))
+                        return -EINVAL;
+
+                cpu_buffer = buffer->buffers[cpu];
+
+                raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+                if (rb_read_writer_meta_page(cpu_buffer))
+                        rb_wakeups(buffer, cpu_buffer);
+                raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+                return 0;
+        }
+
+        /*
+         * Make sure all the ring buffers are up to date before we start reading
+         * them.
+         */
+        for_each_buffer_cpu(buffer, cpu) {
+                cpu_buffer = buffer->buffers[cpu];
+
+                raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+                rb_read_writer_meta_page(buffer->buffers[cpu]);
+                raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+        }
+
+        for_each_buffer_cpu(buffer, cpu) {
+                cpu_buffer = buffer->buffers[cpu];
+
+                raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+                if (rb_num_of_entries(cpu_buffer))
+                        rb_wakeups(buffer, buffer->buffers[cpu]);
+                raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+        }
+
+        return 0;
+}
+
 #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
 /**
  * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6399,6 +6587,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
         unsigned int commit;
         unsigned int read;
         u64 save_timestamp;
+        bool force_memcpy;
         int ret = -1;
 
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -6436,6 +6625,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
         /* Check if any events were dropped */
         missed_events = cpu_buffer->lost_events;
 
+        force_memcpy = cpu_buffer->mapped || cpu_buffer->writer;
+
         /*
          * If this page has been partially read or
          * if len is not big enough to read the rest of the page or
@@ -6445,7 +6636,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
          */
         if (read || (len < (commit - read)) ||
             cpu_buffer->reader_page == cpu_buffer->commit_page ||
-            cpu_buffer->mapped) {
+            force_memcpy) {
                 struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
                 unsigned int rpos = read;
                 unsigned int pos = 0;
@@ -7020,7 +7211,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
         unsigned long flags, *subbuf_ids;
         int err = 0;
 
-        if (!cpumask_test_cpu(cpu, buffer->cpumask))
+        if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->writer)
                 return -EINVAL;
 
         cpu_buffer = buffer->buffers[cpu];
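To close, a hypothetical polling loop. Because the writer lives outside the
kernel, the kernel never observes the writes themselves and has to poll the
meta-pages to wake up readers; ring_buffer_poll_writer() is the only API from
this patch used below, the kthread around it is purely illustrative.

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/ring_buffer.h>

static int my_poll_fn(void *data)
{
        struct trace_buffer *buffer = data;

        while (!kthread_should_stop()) {
                /* Refresh per-CPU stats from the meta-pages, wake waiters */
                ring_buffer_poll_writer(buffer, RING_BUFFER_ALL_CPUS);
                msleep(100);    /* arbitrary polling period */
        }

        return 0;
}

The wake-up path pairs with the rb_wakeups() calls inside
ring_buffer_poll_writer(), so readers blocked on the buffer make progress even
though no in-kernel producer ever commits an event.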