diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl index c61f9b813973..12488c14be64 100644 --- a/Documentation/ABI/testing/debugfs-cxl +++ b/Documentation/ABI/testing/debugfs-cxl @@ -14,9 +14,10 @@ Description: event to its internal Informational Event log, updates the Event Status register, and if configured, interrupts the host. It is not an error to inject poison into an address that - already has poison present and no error is returned. The - inject_poison attribute is only visible for devices supporting - the capability. + already has poison present and no error is returned. If the + device returns 'Inject Poison Limit Reached' an -EBUSY error + is returned to the user. The inject_poison attribute is only + visible for devices supporting the capability. What: /sys/kernel/debug/memX/clear_poison diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst index 036e49553542..12b82725d322 100644 --- a/Documentation/driver-api/cxl/index.rst +++ b/Documentation/driver-api/cxl/index.rst @@ -9,4 +9,6 @@ Compute Express Link memory-devices + maturity-map + .. only:: subproject and html diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst new file mode 100644 index 000000000000..df8e2ac2a320 --- /dev/null +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -0,0 +1,202 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================== +Compute Express Link Subsystem Maturity Map +=========================================== + +The Linux CXL subsystem tracks the dynamic `CXL specification +`_ that +continues to respond to new use cases with new features, capability +updates and fixes. At any given point some aspects of the subsystem are +more mature than others. While the periodic pull requests summarize the +`work being incorporated each merge window +`_, +those do not always convey progress relative to a starting point and a +future end goal. + +What follows is a coarse breakdown of the subsystem's major +responsibilities along with a maturity score. The expectation is that +the change-history of this document provides an overview summary of the +subsystem maturation over time. + +The maturity scores are: + +- [3] Mature: Work in this area is complete and no changes on the horizon. + Note that this score can regress from one kernel release to the next + based on new test results or end user reports. + +- [2] Stabilizing: Major functionality operational, common cases are + mature, but known corner cases are still a work in progress. + +- [1] Initial: Capability that has exited the Proof of Concept phase, but + may still have significant gaps to close and fixes to apply as real + world testing occurs. + +- [0] Known gap: Feature is on a medium to long term horizon to + implement. If the specification has a feature that does not even have + a '0' score in this document, there is a good chance that no one in + the linux-cxl@vger.kernel.org community has started to look at it. + +- X: Out of scope for kernel enabling, or kernel enabling not required + +Feature and Capabilities +======================== + +Enumeration / Provisioning +-------------------------- +All of the fundamental enumeration an object model of the subsystem is +in place, but there are several corner cases that are pending closure. + + +* [2] CXL Window Enumeration + + * [0] :ref:`Extended-linear memory-side cache ` + * [0] Low Memory-hole + * [0] Hetero-interleave + +* [2] Switch Enumeration + + * [0] CXL register enumeration link-up dependency + +* [2] HDM Decoder Configuration + + * [0] Decoder target and granularity constraints + +* [2] Performance enumeration + + * [3] Endpoint CDAT + * [3] Switch CDAT + * [1] CDAT to Core-mm integration + + * [1] x86 + * [0] Arm64 + * [0] All other arch. + + * [0] Shared link + +* [2] Hotplug + (see CXL Window Enumeration) + + * [0] Handle Soft Reserved conflicts + +* [0] :ref:`RCH link status ` +* [0] Fabrics / G-FAM (chapter 7) +* [0] Global Access Endpoint + + +RAS +--- +In many ways CXL can be seen as a standardization of what would normally +be handled by custom EDAC drivers. The open development here is +mainly caused by the enumeration corner cases above. + +* [3] Component events (OS) +* [2] Component events (FFM) +* [1] Endpoint protocol errors (OS) +* [1] Endpoint protocol errors (FFM) +* [0] Switch protocol errors (OS) +* [1] Switch protocol errors (FFM) +* [2] DPA->HPA Address translation + + * [1] XOR Interleave translation + (see CXL Window Enumeration) + +* [1] Memory Failure coordination +* [0] Scrub control +* [2] ACPI error injection EINJ + + * [0] EINJ v2 + * [X] Compliance DOE + +* [2] Native error injection +* [3] RCH error handling +* [1] VH error handling +* [0] PPR +* [0] Sparing +* [0] Device built in test + + +Mailbox commands +---------------- + +* [3] Firmware update +* [3] Health / Alerts +* [1] :ref:`Background commands ` +* [3] Sanitization +* [3] Security commands +* [3] RAW Command Debug Passthrough +* [0] CEL-only-validation Passthrough +* [0] Switch CCI +* [3] Timestamp +* [1] PMEM labels +* [0] PMEM GPF / Dirty Shutdown +* [0] Scan Media + +PMU +--- +* [1] Type 3 PMU +* [0] Switch USP/ DSP, Root Port + +Security +-------- + +* [X] CXL Trusted Execution Environment Security Protocol (TSP) +* [X] CXL IDE (subsumed by TSP) + +Memory-pooling +-------------- + +* [1] Hotplug of LDs (via PCI hotplug) +* [0] Dynamic Capacity Device (DCD) Support + +Multi-host sharing +------------------ + +* [0] Hardware coherent shared memory +* [0] Software managed coherency shared memory + +Multi-host memory +----------------- + +* [0] Dynamic Capacity Device Support +* [0] Sharing + +Accelerator +----------- + +* [0] Accelerator memory enumeration HDM-D (CXL 1.1/2.0 Type-2) +* [0] Accelerator memory enumeration HDM-DB (CXL 3.0 Type-2) +* [0] CXL.cache 68b (CXL 2.0) +* [0] CXL.cache 256b Cache IDs (CXL 3.0) + +User Flow Support +----------------- + +* [0] HPA->DPA Address translation (need xormaps export solution) + +Details +======= + +.. _extended-linear: + +* **Extended-linear memory-side cache**: An HMAT proposal to enumerate the presence of a + memory-side cache where the cache capacity extends the SRAT address + range capacity. `See the ECN + `_ + for more details: + +.. _rch-link-status: + +* **RCH Link Status**: RCH (Restricted CXL Host) topologies, end up + hiding some standard registers like PCIe Link Status / Capabilities in + the CXL RCRB (Root Complex Register Block). + +.. _background-commands: + +* **Background commands**: The CXL background command mechanism is + awkward as the single slot is monopolized potentially indefinitely by + various commands. A `cancel on conflict + `_ + facility is needed to make sure the kernel can ensure forward progress + of priority commands. diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 1497e80f030e..3fc63f27c226 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -89,14 +89,7 @@ docs on :ref:`Building Linux with Clang/LLVM `. Rust (optional) --------------- -A particular version of the Rust toolchain is required. Newer versions may or -may not work because the kernel depends on some unstable Rust features, for -the moment. - -Each Rust toolchain comes with several "components", some of which are required -(like ``rustc``) and some that are optional. The ``rust-src`` component (which -is optional) needs to be installed to build the kernel. Other components are -useful for developing. +A recent version of the Rust compiler is required. Please see Documentation/rust/quick-start.rst for instructions on how to satisfy the build requirements of Rust support. In particular, the ``Makefile`` diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst index 4bb6ac12d482..e3f388ef4ee4 100644 --- a/Documentation/rust/general-information.rst +++ b/Documentation/rust/general-information.rst @@ -7,6 +7,14 @@ This document contains useful information to know when working with the Rust support in the kernel. +``no_std`` +---------- + +The Rust support in the kernel can link only `core `_, +but not `std `_. Crates for use in the +kernel must opt into this behavior using the ``#![no_std]`` attribute. + + Code documentation ------------------ diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index cc3f11e0d441..d06a36106cd4 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -5,17 +5,93 @@ Quick Start This document describes how to get started with kernel development in Rust. +There are a few ways to install a Rust toolchain needed for kernel development. +A simple way is to use the packages from your Linux distribution if they are +suitable -- the first section below explains this approach. An advantage of this +approach is that, typically, the distribution will match the LLVM used by Rust +and Clang. + +Another way is using the prebuilt stable versions of LLVM+Rust provided on +`kernel.org `_. These are the same slim +and fast LLVM toolchains from :ref:`Getting LLVM ` with versions +of Rust added to them that Rust for Linux supports. Two sets are provided: the +"latest LLVM" and "matching LLVM" (please see the link for more information). + +Alternatively, the next two "Requirements" sections explain each component and +how to install them through ``rustup``, the standalone installers from Rust +and/or building them. + +The rest of the document explains other aspects on how to get started. + + +Distributions +------------- + +Arch Linux +********** + +Arch Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + pacman -S rust rust-src rust-bindgen + + +Debian +****** + +Debian Unstable (Sid), outside of the freeze period, provides recent Rust +releases and thus it should generally work out of the box, e.g.:: + + apt install rustc rust-src bindgen rustfmt rust-clippy + + +Fedora Linux +************ + +Fedora Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + dnf install rust rust-src bindgen-cli rustfmt clippy + + +Gentoo Linux +************ + +Gentoo Linux (and especially the testing branch) provides recent Rust releases +and thus it should generally work out of the box, e.g.:: + + USE='rust-src rustfmt clippy' emerge dev-lang/rust dev-util/bindgen + +``LIBCLANG_PATH`` may need to be set. + + +Nix +*** + +Nix (unstable channel) provides recent Rust releases and thus it should +generally work out of the box, e.g.:: + + { pkgs ? import {} }: + pkgs.mkShell { + nativeBuildInputs = with pkgs; [ rustc rust-bindgen rustfmt clippy ]; + RUST_LIB_SRC = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}"; + } + + +openSUSE +******** + +openSUSE Slowroll and openSUSE Tumbleweed provide recent Rust releases and thus +they should generally work out of the box, e.g.:: + + zypper install rust rust1.79-src rust-bindgen clang + Requirements: Building ---------------------- This section explains how to fetch the tools needed for building. -Some of these requirements might be available from Linux distributions -under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, -at the time of writing, they are likely not to be recent enough unless -the distribution tracks the latest releases. - To easily check whether the requirements are met, the following target can be used:: @@ -29,16 +105,15 @@ if that is the case. rustc ***** -A particular version of the Rust compiler is required. Newer versions may or -may not work because, for the moment, the kernel depends on some unstable -Rust features. +A recent version of the Rust compiler is required. If ``rustup`` is being used, enter the kernel build directory (or use -``--path=`` argument to the ``set`` sub-command) and run:: +``--path=`` argument to the ``set`` sub-command) and run, +for instance:: - rustup override set $(scripts/min-tool-version.sh rustc) + rustup override set stable -This will configure your working directory to use the correct version of +This will configure your working directory to use the given version of ``rustc`` without affecting your default toolchain. Note that the override applies to the current working directory (and its @@ -65,9 +140,9 @@ version later on requires re-adding the component. Otherwise, if a standalone installer is used, the Rust source tree may be downloaded into the toolchain's installation folder:: - curl -L "https://static.rust-lang.org/dist/rust-src-$(scripts/min-tool-version.sh rustc).tar.gz" | + curl -L "https://static.rust-lang.org/dist/rust-src-$(rustc --version | cut -d' ' -f2).tar.gz" | tar -xzf - -C "$(rustc --print sysroot)/lib" \ - "rust-src-$(scripts/min-tool-version.sh rustc)/rust-src/lib/" \ + "rust-src-$(rustc --version | cut -d' ' -f2)/rust-src/lib/" \ --strip-components=3 In this case, upgrading the Rust compiler version later on requires manually @@ -101,26 +176,22 @@ bindgen ******* The bindings to the C side of the kernel are generated at build time using -the ``bindgen`` tool. A particular version is required. +the ``bindgen`` tool. -Install it via (note that this will download and build the tool from source):: +Install it, for instance, via (note that this will download and build the tool +from source):: - cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli + cargo install --locked bindgen-cli -``bindgen`` needs to find a suitable ``libclang`` in order to work. If it is -not found (or a different ``libclang`` than the one found should be used), -the process can be tweaked using the environment variables understood by -``clang-sys`` (the Rust bindings crate that ``bindgen`` uses to access -``libclang``): +``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which +may be linked statically, dynamically or loaded at runtime). By default, the +``cargo`` command above will produce a ``bindgen`` binary that will load +``libclang`` at runtime. If it is not found (or a different ``libclang`` than +the one found should be used), the process can be tweaked, e.g. by using the +``LIBCLANG_PATH`` environment variable. For details, please see ``clang-sys``'s +documentation at: -* ``LLVM_CONFIG_PATH`` can be pointed to an ``llvm-config`` executable. - -* Or ``LIBCLANG_PATH`` can be pointed to a ``libclang`` shared library - or to the directory containing it. - -* Or ``CLANG_PATH`` can be pointed to a ``clang`` executable. - -For details, please see ``clang-sys``'s documentation at: + https://github.com/KyleMayes/clang-sys#linking https://github.com/KyleMayes/clang-sys#environment-variables @@ -164,20 +235,6 @@ can be installed manually:: The standalone installers also come with ``clippy``. -cargo -***** - -``cargo`` is the Rust native build system. It is currently required to run -the tests since it is used to build a custom standard library that contains -the facilities provided by the custom ``alloc`` in the kernel. The tests can -be run using the ``rusttest`` Make target. - -If ``rustup`` is being used, all the profiles already install the tool, -thus nothing needs to be done. - -The standalone installers also come with ``cargo``. - - rustdoc ******* diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst index acfd0c2be48d..568b71b415a4 100644 --- a/Documentation/rust/testing.rst +++ b/Documentation/rust/testing.rst @@ -131,9 +131,8 @@ Additionally, there are the ``#[test]`` tests. These can be run using the make LLVM=1 rusttest -This requires the kernel ``.config`` and downloads external repositories. It -runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in -what these tests can test. +This requires the kernel ``.config``. It runs the ``#[test]`` tests on the host +(currently) and thus is fairly limited in what these tests can test. The Kselftests -------------- diff --git a/MAINTAINERS b/MAINTAINERS index a5b85cbd365d..b7bdab5a62b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5613,6 +5613,7 @@ M: Ira Weiny M: Dan Williams L: linux-cxl@vger.kernel.org S: Maintained +F: Documentation/driver-api/cxl F: drivers/cxl/ F: include/linux/einj-cxl.h F: include/linux/cxl-event.h diff --git a/Makefile b/Makefile index 9eb3228da7bb..0133feafb627 100644 --- a/Makefile +++ b/Makefile @@ -463,17 +463,17 @@ KBUILD_USERLDFLAGS := $(USERLDFLAGS) # host programs. export rust_common_flags := --edition=2021 \ -Zbinary_dep_depinfo=y \ - -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \ - -Dunreachable_pub -Dnon_ascii_idents \ + -Dunsafe_op_in_unsafe_fn \ + -Dnon_ascii_idents \ + -Wrust_2018_idioms \ + -Wunreachable_pub \ -Wmissing_docs \ - -Drustdoc::missing_crate_level_docs \ - -Dclippy::correctness -Dclippy::style \ - -Dclippy::suspicious -Dclippy::complexity \ - -Dclippy::perf \ - -Dclippy::let_unit_value -Dclippy::mut_mut \ - -Dclippy::needless_bitwise_bool \ - -Dclippy::needless_continue \ - -Dclippy::no_mangle_with_rust_abi \ + -Wrustdoc::missing_crate_level_docs \ + -Wclippy::all \ + -Wclippy::mut_mut \ + -Wclippy::needless_bitwise_bool \ + -Wclippy::needless_continue \ + -Wclippy::no_mangle_with_rust_abi \ -Wclippy::dbg_macro KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) \ @@ -511,7 +511,6 @@ RUSTDOC = rustdoc RUSTFMT = rustfmt CLIPPY_DRIVER = clippy-driver BINDGEN = bindgen -CARGO = cargo PAHOLE = pahole RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex @@ -577,7 +576,7 @@ KBUILD_RUSTFLAGS := $(rust_common_flags) \ -Csymbol-mangling-version=v0 \ -Crelocation-model=static \ -Zfunction-sections=n \ - -Dclippy::float_arithmetic + -Wclippy::float_arithmetic KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := @@ -605,7 +604,7 @@ endif export RUSTC_BOOTSTRAP := 1 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG -export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO +export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN export HOSTRUSTC KBUILD_HOSTRUSTFLAGS export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX @@ -2005,9 +2004,12 @@ quiet_cmd_tags = GEN $@ tags TAGS cscope gtags: FORCE $(call cmd,tags) -# IDE support targets +# Generate rust-project.json (a file that describes the structure of non-Cargo +# Rust projects) for rust-analyzer (an implementation of the Language Server +# Protocol). PHONY += rust-analyzer rust-analyzer: + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust $@ # Script to generate missing namespace dependencies diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 93e54ba91fbf..f5931499c2d6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -110,7 +110,7 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ - max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) + MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) diff --git a/block/genhd.c b/block/genhd.c index 4dc95a463505..1c05dd4c6980 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -663,12 +663,12 @@ void del_gendisk(struct gendisk *disk) */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); - __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); + __blk_mark_disk_dead(disk); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f92673f05c7a..a9e49b212341 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3422,6 +3422,7 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) /** * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3448,6 +3449,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device, /** * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3501,6 +3503,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * @done: callback to be called after the bitmap IO was performed * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * While IO on the bitmap happens we freeze application IO thus we ensure * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be @@ -3549,6 +3552,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * @io_fn: IO callback to be called when bitmap IO is possible * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6fb799ec0d88..890c08792ba8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,6 +48,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* private ioctl command mirror */ +#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -2903,7 +2906,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); break; - case UBLK_U_CMD_DEL_DEV_ASYNC: + case UBLK_CMD_DEL_DEV_ASYNC: ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 571069863c62..82b78e331d8e 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -22,56 +22,42 @@ static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -/* - * Find a targets entry (n) in the host bridge interleave list. - * CXL Specification 3.0 Table 9-22 - */ -static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw, - int ig) -{ - int i = 0, n = 0; - u8 eiw; - /* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */ - if (iw != 3) { - for (i = 0; i < cximsd->nr_maps; i++) - n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i; - } - /* IW: 3,6,12 add a modulo calculation to 'n' */ - if (!is_power_of_2(iw)) { - if (ways_to_eiw(iw, &eiw)) - return -1; - hpa &= GENMASK_ULL(51, eiw + ig); - n |= do_div(hpa, 3) << i; - } - return n; -} - -static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos) +static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) { struct cxl_cxims_data *cximsd = cxlrd->platform_data; - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int ig = cxld->interleave_granularity; - int iw = cxld->interleave_ways; - int n = 0; - u64 hpa; + int hbiw = cxlrd->cxlsd.nr_targets; + u64 val; + int pos; - if (dev_WARN_ONCE(&cxld->dev, - cxld->interleave_ways != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; + /* No xormaps for host bridge interleave ways of 1 or 3 */ + if (hbiw == 1 || hbiw == 3) + return hpa; - hpa = cxlrd->res->start + pos * ig; + /* + * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) restore + * the position bit to its value before the xormap was applied at + * HPA->DPA translation. + * + * pos is the lowest set bit in an XORMAP + * val is the XORALLBITS(HPA & XORMAP) + * + * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS + * as an operation that outputs a single bit by XORing all the + * bits in the input (hpa & xormap). Implement XORALLBITS using + * hweight64(). If the hamming weight is even the XOR of those + * bits results in val==0, if odd the XOR result is val==1. + */ - /* Entry (n) is 0 for no interleave (iw == 1) */ - if (iw != 1) - n = cxl_xor_calc_n(hpa, cximsd, iw, ig); + for (int i = 0; i < cximsd->nr_maps; i++) { + if (!cximsd->xormaps[i]) + continue; + pos = __ffs(cximsd->xormaps[i]); + val = (hweight64(hpa & cximsd->xormaps[i]) & 1); + hpa = (hpa & ~(1ULL << pos)) | (val << pos); + } - if (n < 0) - return NULL; - - return cxlrd->cxlsd.target[n]; + return hpa; } struct cxl_cxims_context { @@ -361,7 +347,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, struct cxl_port *root_port = ctx->root_port; struct cxl_cxims_context cxims_ctx; struct device *dev = ctx->dev; - cxl_calc_hb_fn cxl_calc_hb; struct cxl_decoder *cxld; unsigned int ways, i, ig; int rc; @@ -389,13 +374,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, if (rc) return rc; - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) - cxl_calc_hb = cxl_hb_modulo; - else - cxl_calc_hb = cxl_hb_xor; - struct cxl_root_decoder *cxlrd __free(put_cxlrd) = - cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); + cxl_root_decoder_alloc(root_port, ways); + if (IS_ERR(cxlrd)) return PTR_ERR(cxlrd); @@ -434,6 +415,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->qos_class = cfmws->qtg_id; + if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) + cxlrd->hpa_to_spa = cxl_xor_hpa_to_spa; + rc = cxl_decoder_add(cxld, target_map); if (rc) return rc; @@ -482,6 +466,8 @@ struct cxl_chbs_context { unsigned long long uid; resource_size_t base; u32 cxl_version; + int nr_versions; + u32 saved_version; }; static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, @@ -490,22 +476,31 @@ static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, struct cxl_chbs_context *ctx = arg; struct acpi_cedt_chbs *chbs; - if (ctx->base != CXL_RESOURCE_NONE) - return 0; - chbs = (struct acpi_cedt_chbs *) header; - if (ctx->uid != chbs->uid) - return 0; - - ctx->cxl_version = chbs->cxl_version; - if (!chbs->base) - return 0; - if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 && chbs->length != CXL_RCRB_SIZE) return 0; + if (!chbs->base) + return 0; + + if (ctx->saved_version != chbs->cxl_version) { + /* + * cxl_version cannot be overwritten before the next two + * checks, then use saved_version + */ + ctx->saved_version = chbs->cxl_version; + ctx->nr_versions++; + } + + if (ctx->base != CXL_RESOURCE_NONE) + return 0; + + if (ctx->uid != chbs->uid) + return 0; + + ctx->cxl_version = chbs->cxl_version; ctx->base = chbs->base; return 0; @@ -529,10 +524,19 @@ static int cxl_get_chbs(struct device *dev, struct acpi_device *hb, .uid = uid, .base = CXL_RESOURCE_NONE, .cxl_version = UINT_MAX, + .saved_version = UINT_MAX, }; acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx); + if (ctx->nr_versions > 1) { + /* + * Disclaim eRCD support given some component register may + * only be found via CHBCR + */ + dev_info(dev, "Unsupported platform config, mixed Virtual Host and Restricted CXL Host hierarchy."); + } + return 0; } @@ -921,6 +925,7 @@ static void __exit cxl_acpi_exit(void) /* load before dax_hmem sees 'Soft Reserved' CXL ranges */ subsys_initcall(cxl_acpi_init); module_exit(cxl_acpi_exit); +MODULE_DESCRIPTION("CXL ACPI: Platform Support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_IMPORT_NS(ACPI); diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 625394486459..72a506c9dbd0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -28,12 +28,12 @@ int cxl_region_init(void); void cxl_region_exit(void); int cxl_get_poison_by_endpoint(struct cxl_port *port); struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa); +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa); #else -static inline u64 -cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) +static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, + const struct cxl_memdev *cxlmd, u64 dpa) { return ULLONG_MAX; } diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2626f3fff201..e5cdeafdf76e 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -875,10 +875,10 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, guard(rwsem_read)(&cxl_region_rwsem); guard(rwsem_read)(&cxl_dpa_rwsem); - dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK; + dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); if (cxlr) - hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, cxlr, hpa, diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 8567dd11eaac..a663e7566c48 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -338,10 +338,6 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, if (rc) return rc; - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); - if (rc) - return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; @@ -368,6 +364,10 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. */ + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index cb730050d3d4..1d5007e3795a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1733,21 +1733,6 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, return 0; } -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) -{ - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int iw; - - iw = cxld->interleave_ways; - if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; - - return cxlrd->cxlsd.target[pos % iw]; -} -EXPORT_SYMBOL_NS_GPL(cxl_hb_modulo, CXL); - static struct lock_class_key cxl_decoder_key; /** @@ -1807,7 +1792,6 @@ static int cxl_switch_decoder_init(struct cxl_port *port, * cxl_root_decoder_alloc - Allocate a root level decoder * @port: owning CXL root of this decoder * @nr_targets: static number of downstream targets - * @calc_hb: which host bridge covers the n'th position by granularity * * Return: A new cxl decoder to be registered by cxl_decoder_add(). A * 'CXL root' decoder is one that decodes from a top-level / static platform @@ -1815,8 +1799,7 @@ static int cxl_switch_decoder_init(struct cxl_port *port, * topology. */ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb) + unsigned int nr_targets) { struct cxl_root_decoder *cxlrd; struct cxl_switch_decoder *cxlsd; @@ -1838,7 +1821,6 @@ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, return ERR_PTR(rc); } - cxlrd->calc_hb = calc_hb; mutex_init(&cxlrd->range_lock); cxld = &cxlsd->cxld; @@ -2356,5 +2338,6 @@ static void cxl_core_exit(void) subsys_initcall(cxl_core_init); module_exit(cxl_core_exit); +MODULE_DESCRIPTION("CXL: Core Compute Express Link support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 538ebd5a64fd..21ad5f242875 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -1632,10 +1633,13 @@ static int cxl_region_attach_position(struct cxl_region *cxlr, const struct cxl_dport *dport, int pos) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw = cxld->interleave_ways; struct cxl_port *iter; int rc; - if (cxlrd->calc_hb(cxlrd, pos) != dport) { + if (dport != cxlrd->cxlsd.target[pos % iw]) { dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), dev_name(&cxlrd->cxlsd.cxld.dev)); @@ -2310,6 +2314,7 @@ static void unregister_region(void *_cxlr) int i; unregister_memory_notifier(&cxlr->memory_notifier); + unregister_mt_adistance_algorithm(&cxlr->adist_notifier); device_del(&cxlr->dev); /* @@ -2386,14 +2391,23 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) return true; } +static int cxl_region_nid(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct resource *res; + + guard(rwsem_read)(&cxl_region_rwsem); + res = p->res; + if (!res) + return NUMA_NO_NODE; + return phys_to_target_node(res->start); +} + static int cxl_region_perf_attrs_callback(struct notifier_block *nb, unsigned long action, void *arg) { struct cxl_region *cxlr = container_of(nb, struct cxl_region, memory_notifier); - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = p->targets[0]; - struct cxl_decoder *cxld = &cxled->cxld; struct memory_notify *mnb = arg; int nid = mnb->status_change_nid; int region_nid; @@ -2401,7 +2415,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid == NUMA_NO_NODE || action != MEM_ONLINE) return NOTIFY_DONE; - region_nid = phys_to_target_node(cxld->hpa_range.start); + region_nid = cxl_region_nid(cxlr); if (nid != region_nid) return NOTIFY_DONE; @@ -2411,6 +2425,27 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, return NOTIFY_OK; } +static int cxl_region_calculate_adistance(struct notifier_block *nb, + unsigned long nid, void *data) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + adist_notifier); + struct access_coordinate *perf; + int *adist = data; + int region_nid; + + region_nid = cxl_region_nid(cxlr); + if (nid != region_nid) + return NOTIFY_OK; + + perf = &cxlr->coord[ACCESS_COORDINATE_CPU]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + /** * devm_cxl_add_region - Adds a region to a decoder * @cxlrd: root decoder @@ -2453,6 +2488,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, cxlr->memory_notifier.priority = CXL_CALLBACK_PRI; register_memory_notifier(&cxlr->memory_notifier); + cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance; + cxlr->adist_notifier.priority = 100; + register_mt_adistance_algorithm(&cxlr->adist_notifier); + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); if (rc) return ERR_PTR(rc); @@ -2816,20 +2855,13 @@ struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) return ctx.cxlr; } -static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) +static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) { struct cxl_region_params *p = &cxlr->params; int gran = p->interleave_granularity; int ways = p->interleave_ways; u64 offset; - /* Is the hpa within this region at all */ - if (hpa < p->res->start || hpa > p->res->end) { - dev_dbg(&cxlr->dev, - "Addr trans fail: hpa 0x%llx not in region\n", hpa); - return false; - } - /* Is the hpa in an expected chunk for its pos(-ition) */ offset = hpa - p->res->start; offset = do_div(offset, gran * ways); @@ -2842,15 +2874,26 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) return false; } -static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled) +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) { + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; struct cxl_region_params *p = &cxlr->params; - int pos = cxled->pos; + struct cxl_endpoint_decoder *cxled = NULL; u16 eig = 0; u8 eiw = 0; + int pos; + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxlmd == cxled_to_memdev(cxled)) + break; + } + if (!cxled || cxlmd != cxled_to_memdev(cxled)) + return ULLONG_MAX; + + pos = cxled->pos; ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); @@ -2884,29 +2927,23 @@ static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start; - if (!cxl_is_hpa_in_range(hpa, cxlr, cxled->pos)) + /* Root decoder translation overrides typical modulo decode */ + if (cxlrd->hpa_to_spa) + hpa = cxlrd->hpa_to_spa(cxlrd, hpa); + + if (hpa < p->res->start || hpa > p->res->end) { + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in region\n", hpa); + return ULLONG_MAX; + } + + /* Simple chunk check, by pos & gran, only applies to modulo decodes */ + if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) return ULLONG_MAX; return hpa; } -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa) -{ - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = NULL; - - for (int i = 0; i < p->nr_targets; i++) { - cxled = p->targets[i]; - if (cxlmd == cxled_to_memdev(cxled)) - break; - } - if (!cxled || cxlmd != cxled_to_memdev(cxled)) - return ULLONG_MAX; - - return cxl_dpa_to_hpa(dpa, cxlr, cxled); -} - static struct lock_class_key cxl_pmem_region_key; static int cxl_pmem_region_alloc(struct cxl_region *cxlr) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index ee5cd4eb2f16..9167cfba7f59 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -340,23 +340,23 @@ TRACE_EVENT(cxl_general_media, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID; /* General Media */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; /* Mask after flags have been parsed */ __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->device = get_unaligned_le24(rec->device); memcpy(__entry->comp_id, &rec->component_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE); - __entry->validity_flags = get_unaligned_le16(&rec->validity_flags); + __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); __entry->hpa = hpa; if (cxlr) { __assign_str(region_name); @@ -440,19 +440,19 @@ TRACE_EVENT(cxl_dram, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_DRAM_UUID; /* DRAM */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->validity_flags = get_unaligned_le16(rec->validity_flags); - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags); + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask); __entry->bank_group = rec->bank_group; __entry->bank = rec->bank; @@ -704,8 +704,8 @@ TRACE_EVENT(cxl_poison, if (cxlr) { __assign_str(region); memcpy(__entry->uuid, &cxlr->params.uuid, 16); - __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, - __entry->dpa); + __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, + __entry->dpa); } else { __assign_str(region); memset(__entry->uuid, 0, 16); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 720aa07976b0..9afb407d438f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -434,14 +434,13 @@ struct cxl_switch_decoder { }; struct cxl_root_decoder; -typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, - int pos); +typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations * @region_id: region id for next region provisioning event - * @calc_hb: which host bridge covers the n'th position by granularity + * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range * @qos_class: QoS performance class cookie @@ -450,7 +449,7 @@ typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, struct cxl_root_decoder { struct resource *res; atomic_t region_id; - cxl_calc_hb_fn calc_hb; + cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; int qos_class; @@ -524,6 +523,7 @@ struct cxl_region_params { * @params: active + config params for the region * @coord: QoS access coordinates for the region * @memory_notifier: notifier for setting the access coordinates to node + * @adist_notifier: notifier for calculating the abstract distance of node */ struct cxl_region { struct device dev; @@ -536,6 +536,7 @@ struct cxl_region { struct cxl_region_params params; struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct notifier_block memory_notifier; + struct notifier_block adist_notifier; }; struct cxl_nvdimm_bridge { @@ -774,9 +775,7 @@ bool is_root_decoder(struct device *dev); bool is_switch_decoder(struct device *dev); bool is_endpoint_decoder(struct device *dev); struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb); -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos); + unsigned int nr_targets); struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index af8169ccdbc0..afb53d058d62 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -161,7 +161,7 @@ struct cxl_mbox_cmd { C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"), \ C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"), \ C(PADDR, -EFAULT, "physical address specified is invalid"), \ - C(POISONLMT, -ENXIO, "poison injection limit has been reached"), \ + C(POISONLMT, -EBUSY, "poison injection limit has been reached"), \ C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"), \ C(ABORT, -ENXIO, "background cmd was aborted by device"), \ C(SECURITY, -ENXIO, "not valid in the current security state"), \ @@ -563,7 +563,7 @@ enum cxl_opcode { 0x3b, 0x3f, 0x17) #define DEFINE_CXL_VENDOR_DEBUG_UUID \ - UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ + UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ 0x40, 0x3d, 0x86) struct cxl_mbox_get_supported_logs { diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 2f1b49bfe162..7de232eaeb17 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -253,6 +253,7 @@ static struct cxl_driver cxl_mem_driver = { module_cxl_driver(cxl_mem_driver); +MODULE_DESCRIPTION("CXL: Memory Expansion"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index e53646e9f2fb..4be35dc22202 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1066,5 +1066,6 @@ static void __exit cxl_pci_driver_exit(void) module_init(cxl_pci_driver_init); module_exit(cxl_pci_driver_exit); +MODULE_DESCRIPTION("CXL: PCI manageability"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 2ecdaee63021..4ef93da22335 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -453,6 +453,7 @@ static __exit void cxl_pmem_exit(void) cxl_driver_unregister(&cxl_nvdimm_bridge_driver); } +MODULE_DESCRIPTION("CXL PMEM: Persistent Memory Support"); MODULE_LICENSE("GPL v2"); module_init(cxl_pmem_init); module_exit(cxl_pmem_exit); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 97c21566677a..d7d5d982ce69 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -209,6 +209,7 @@ static struct cxl_driver cxl_port_driver = { }; module_cxl_driver(cxl_port_driver); +MODULE_DESCRIPTION("CXL: Port enumeration and services"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_PORT); diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index cbc92d3683e6..e5c05a876947 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -109,8 +109,8 @@ static const u32 knl_interleave_list[] = { 0x104, 0x10c, 0x114, 0x11c, /* 20-23 */ }; #define MAX_INTERLEAVE \ - (max_t(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ - max_t(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ + (MAX_T(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ + MAX_T(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ ARRAY_SIZE(knl_interleave_list)))) struct interleave_pkg { diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c index d021497841b8..3969dc548cff 100644 --- a/drivers/gpu/drm/drm_color_mgmt.c +++ b/drivers/gpu/drm/drm_color_mgmt.c @@ -532,7 +532,7 @@ int drm_plane_create_color_properties(struct drm_plane *plane, { struct drm_device *dev = plane->dev; struct drm_property *prop; - struct drm_prop_enum_list enum_list[max_t(int, DRM_COLOR_ENCODING_MAX, + struct drm_prop_enum_list enum_list[MAX_T(int, DRM_COLOR_ENCODING_MAX, DRM_COLOR_RANGE_MAX)]; int i, len; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 48ac628f471b..51e6964c1305 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1776,7 +1776,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; sector_t sector; unsigned int sectors_to_process; @@ -2064,7 +2064,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { @@ -2837,7 +2837,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start unlikely(from_replay) && #endif ic->internal_hash) { - char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index f82e3423acb9..60d0155be869 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -390,7 +390,8 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = blk_mq_alloc_tag_set(&dev->tag_set); if (ret) { - dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed"); + pr_err("ubiblock%d_%d: blk_mq_alloc_tag_set failed\n", + dev->ubi_num, dev->vol_id); goto out_free_dev; } @@ -407,8 +408,8 @@ int ubiblock_create(struct ubi_volume_info *vi) gd->minors = 1; gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); if (gd->first_minor < 0) { - dev_err(disk_to_dev(gd), - "block: dynamic minor allocation failed"); + pr_err("ubiblock%d_%d: block: dynamic minor allocation failed\n", + dev->ubi_num, dev->vol_id); ret = -ENODEV; goto out_cleanup_disk; } @@ -669,7 +670,7 @@ err_unreg: return ret; } -void __exit ubiblock_exit(void) +void ubiblock_exit(void) { ubi_unregister_volume_notifier(&ubiblock_notifier); ubiblock_remove_all(); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index a7e3a6246c0e..30be4ed68fad 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -112,7 +112,7 @@ static struct attribute *ubi_class_attrs[] = { ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '//class/ubi/') */ -struct class ubi_class = { +const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; @@ -1372,7 +1372,7 @@ static int __init ubi_init(void) /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) - goto out_slab; + goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); @@ -1387,6 +1387,9 @@ static int __init ubi_init(void) out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); + ubiblock_exit(); +out_debugfs: + ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index d57f52bd2ff3..9ec3b8b6a0aa 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -598,9 +598,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi) if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN, UBI_DFS_DIR_NAME, ubi->ubi_num); - if (n > UBI_DFS_DIR_LEN) { + if (n >= UBI_DFS_DIR_LEN) { /* The array size is too small */ return -EINVAL; } diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index e5ac3cd0bbae..c7ba7a15c9f7 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1564,6 +1564,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, GFP_KERNEL); if (!fm_eba[i]) { ret = -ENOMEM; + kfree(scan_eba[i]); goto out_free; } @@ -1599,7 +1600,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, } out_free: - for (i = 0; i < num_volumes; i++) { + while (--i >= 0) { if (!ubi->volumes[i]) continue; diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c index 8aeb9c428e51..a94a1a9aaec1 100644 --- a/drivers/mtd/ubi/nvmem.c +++ b/drivers/mtd/ubi/nvmem.c @@ -6,7 +6,6 @@ /* UBI NVMEM provider */ #include "ubi.h" #include -#include /* List of all NVMEM devices */ static LIST_HEAD(nvmem_devices); @@ -27,14 +26,15 @@ static int ubi_nvmem_reg_read(void *priv, unsigned int from, struct ubi_nvmem *unv = priv; struct ubi_volume_desc *desc; uint32_t offs; - uint64_t lnum = from; + uint32_t lnum; int err = 0; desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); if (IS_ERR(desc)) return PTR_ERR(desc); - offs = do_div(lnum, unv->usable_leb_size); + offs = from % unv->usable_leb_size; + lnum = from / unv->usable_leb_size; while (bytes_left) { to_read = unv->usable_leb_size - offs; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 32009a24869e..1c9e874e8ede 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -420,7 +420,7 @@ struct ubi_debug_info { unsigned int power_cut_min; unsigned int power_cut_max; unsigned int emulate_failures; - char dfs_dir_name[UBI_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBI_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_chk_gen; struct dentry *dfs_chk_io; @@ -814,7 +814,7 @@ extern struct kmem_cache *ubi_wl_entry_slab; extern const struct file_operations ubi_ctrl_cdev_operations; extern const struct file_operations ubi_cdev_operations; extern const struct file_operations ubi_vol_cdev_operations; -extern struct class ubi_class; +extern const struct class ubi_class; extern struct mutex ubi_devices_mutex; extern struct blocking_notifier_head ubi_notifiers; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 12689774d755..f3a1b179aaea 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2912,7 +2912,7 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) u32 channels_to_check = tx_channel_count > rx_channel_count ? tx_channel_count : rx_channel_count; u32 chan; - int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; + int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 19917253ba7b..053d5b4909cd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1876,12 +1876,18 @@ static void nvme_configure_pi_elbas(struct nvme_ns_head *head, struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); + u8 guard_type; /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) return; - head->guard_type = nvme_elbaf_guard_type(elbaf); + guard_type = nvme_elbaf_guard_type(elbaf); + if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) && + guard_type == NVME_NVM_NS_QTYPE_GUARD) + guard_type = nvme_elbaf_qualified_guard_type(elbaf); + + head->guard_type = guard_type; switch (head->guard_type) { case NVME_NVM_NS_64B_GUARD: head->pi_size = sizeof(struct crc64_pi_tuple); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 44e342a46f39..f5f545fa0103 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1403,10 +1403,10 @@ static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) tok = &opt_tokens[idx]; if (tok->token == NVMF_OPT_ERR) continue; - seq_puts(seq_file, ","); + seq_putc(seq_file, ','); seq_puts(seq_file, tok->pattern); } - seq_puts(seq_file, "\n"); + seq_putc(seq_file, '\n'); } static int nvmf_dev_show(struct seq_file *seq_file, void *private) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 21f8e1b9801f..6cd9395ba9ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -863,7 +863,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) nvme_start_request(req); return BLK_STS_OK; out_unmap_data: - nvme_unmap_data(dev, req); + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(dev, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; @@ -1309,7 +1310,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) dev_warn(dev->ctrl.device, "Does your device have a faulty power saving mode enabled?\n"); dev_warn(dev->ctrl.device, - "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n"); } static enum blk_eh_timer_return nvme_timeout(struct request *req) @@ -2968,6 +2969,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; } + /* + * NVMe SSD drops off the PCIe bus after system idle + * for 10 hours on a Lenovo N60z board. + */ + if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) + return NVME_QUIRK_NO_APST; + return 0; } diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3c55f7edd181..ba05faaac562 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -233,13 +233,12 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns_head *head = dev_to_ns_head(dev); struct gendisk *disk = dev_to_disk(dev); - struct block_device *bdev = disk->part0; int ret; - if (nvme_disk_is_ns_head(bdev->bd_disk)) + if (nvme_disk_is_ns_head(disk)) ret = ns_head_update_nuse(head); else - ret = ns_update_nuse(bdev->bd_disk->private_data); + ret = ns_update_nuse(disk->private_data); if (ret) return ret; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 777c1f77ff58..22df574ca99e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "hostfs.h" #include @@ -929,7 +930,6 @@ static const struct inode_operations hostfs_link_iops = { static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hostfs_fs_info *fsi = sb->s_fs_info; - const char *host_root = fc->source; struct inode *root_inode; int err; @@ -943,15 +943,6 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - /* NULL is printed as '(null)' by printf(): avoid that. */ - if (fc->source == NULL) - host_root = ""; - - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) - return -ENOMEM; - root_inode = hostfs_iget(sb, fsi->host_root_path); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -977,6 +968,58 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +enum hostfs_parma { + Opt_hostfs, +}; + +static const struct fs_parameter_spec hostfs_param_specs[] = { + fsparam_string_empty("hostfs", Opt_hostfs), + {} +}; + +static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + struct fs_parse_result result; + char *host_root; + int opt; + + opt = fs_parse(fc, hostfs_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_hostfs: + host_root = param->string; + if (!*host_root) + host_root = ""; + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + break; + } + + return 0; +} + +static int hostfs_parse_monolithic(struct fs_context *fc, void *data) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + char *host_root = (char *)data; + + /* NULL is printed as '(null)' by printf(): avoid that. */ + if (host_root == NULL) + host_root = ""; + + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + + return 0; +} + static int hostfs_fc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, hostfs_fill_super); @@ -994,6 +1037,8 @@ static void hostfs_fc_free(struct fs_context *fc) } static const struct fs_context_operations hostfs_context_ops = { + .parse_monolithic = hostfs_parse_monolithic, + .parse_param = hostfs_parse_param, .get_tree = hostfs_fc_get_tree, .free = hostfs_fc_free, }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index c92937bed133..2c4b357d85e2 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1894,12 +1894,12 @@ init_cifs(void) WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!serverclose_wq) { rc = -ENOMEM; - goto out_destroy_serverclose_wq; + goto out_destroy_deferredclose_wq; } rc = cifs_init_inodecache(); if (rc) - goto out_destroy_deferredclose_wq; + goto out_destroy_serverclose_wq; rc = cifs_init_netfs(); if (rc) @@ -1967,6 +1967,8 @@ out_destroy_netfs: cifs_destroy_netfs(); out_destroy_inodecache: cifs_destroy_inodecache(); +out_destroy_serverclose_wq: + destroy_workqueue(serverclose_wq); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1977,8 +1979,6 @@ out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); -out_destroy_serverclose_wq: - destroy_workqueue(serverclose_wq); out_clean_proc: cifs_proc_clean(); return rc; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 7a16e12f5da8..d2307162a2de 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2614,6 +2614,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; + } else if (ses->server->vals->protocol_id == SMB10_PROT_ID) + if (cap_unix(ses)) + cifs_dbg(FYI, "Unix Extensions requested on SMB1 mount\n"); + else { + cifs_dbg(VFS, "SMB1 Unix Extensions not supported by server\n"); + rc = -EOPNOTSUPP; + goto out_fail; } else { cifs_dbg(VFS, "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n"); @@ -3686,6 +3693,7 @@ error: } #endif +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * Issue a TREE_CONNECT request. */ @@ -3807,11 +3815,25 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, else tcon->Flags = 0; cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); - } + /* + * reset_cifs_unix_caps calls QFSInfo which requires + * need_reconnect to be false, but we would not need to call + * reset_caps if this were not a reconnect case so must check + * need_reconnect flag here. The caller will also clear + * need_reconnect when tcon was successful but needed to be + * cleared earlier in the case of unix extensions reconnect + */ + if (tcon->need_reconnect && tcon->unix_ext) { + cifs_dbg(FYI, "resetting caps for %s\n", tcon->tree_name); + tcon->need_reconnect = false; + reset_cifs_unix_caps(xid, tcon, NULL, NULL); + } + } cifs_buf_release(smb_buffer); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static void delayed_free(struct rcu_head *p) { diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7fe59235f090..322cabc69c6f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1812,6 +1812,10 @@ smb2_copychunk_range(const unsigned int xid, tcon = tlink_tcon(trgtfile->tlink); + trace_smb3_copychunk_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); + while (len > 0) { pcchunk->SourceOffset = cpu_to_le64(src_off); pcchunk->TargetOffset = cpu_to_le64(dest_off); @@ -1863,6 +1867,9 @@ smb2_copychunk_range(const unsigned int xid, le32_to_cpu(retbuf->ChunksWritten), le32_to_cpu(retbuf->ChunkBytesWritten), bytes_written); + trace_smb3_copychunk_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); } else if (rc == -EINVAL) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) goto cchunk_out; @@ -2046,7 +2053,9 @@ smb2_duplicate_extents(const unsigned int xid, dup_ext_buf.ByteCount = cpu_to_le64(len); cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n", src_off, dest_off, len); - + trace_smb3_clone_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); inode = d_inode(trgtfile->dentry); if (inode->i_size < dest_off + len) { rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); @@ -2075,6 +2084,15 @@ smb2_duplicate_extents(const unsigned int xid, cifs_dbg(FYI, "Non-zero response length in duplicate extents\n"); duplicate_extents_out: + if (rc) + trace_smb3_clone_err(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, + tcon->tid, tcon->ses->Suid, src_off, + dest_off, len, rc); + else + trace_smb3_clone_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); return rc; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9fc5b11c0b6c..9a06b5594669 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1562,8 +1562,14 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_small_buf_release(sess_data->iov[0].iov_base); if (rc == 0) sess_data->ses->expired_pwd = false; - else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) + else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + if (sess_data->ses->expired_pwd == false) + trace_smb3_key_expired(sess_data->server->hostname, + sess_data->ses->user_name, + sess_data->server->conn_id, + &sess_data->server->dstaddr, rc); sess_data->ses->expired_pwd = true; + } memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 36d5295c2a6f..6b3bdfb97211 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -206,6 +206,116 @@ DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err); DEFINE_SMB3_OTHER_ERR_EVENT(zero_err); DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err); +/* + * For logging errors in reflink and copy_range ops e.g. smb2_copychunk_range + * and smb2_duplicate_extents + */ +DECLARE_EVENT_CLASS(smb3_copy_range_err_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len, + int rc), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) +) + +#define DEFINE_SMB3_COPY_RANGE_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len, \ + int rc), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc)) + +DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); +/* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ + +DECLARE_EVENT_CLASS(smb3_copy_range_done_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) +) + +#define DEFINE_SMB3_COPY_RANGE_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_copy_range_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len)) + +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_done); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_done); + /* For logging successful read or write */ DECLARE_EVENT_CLASS(smb3_rw_done_class, @@ -1171,6 +1281,46 @@ DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); +DECLARE_EVENT_CLASS(smb3_sess_setup_err_class, + TP_PROTO(char *hostname, char *username, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, username, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __string(username, username) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname); + __assign_str(username); + ), + TP_printk("rc=%d user=%s conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __get_str(username), + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_SES_SETUP_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_sess_setup_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + char *username, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, username, conn_id, addr, rc)) + +DEFINE_SMB3_SES_SETUP_ERR_EVENT(key_expired); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, diff --git a/fs/super.c b/fs/super.c index 095ba793e10c..38d72a3cf6fc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -736,6 +736,17 @@ struct super_block *sget_fc(struct fs_context *fc, struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; + /* + * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is + * not set, as the filesystem is likely unprepared to handle it. + * This can happen when fsconfig() is called from init_user_ns with + * an fs_fd opened in another user namespace. + */ + if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); + return ERR_PTR(-EPERM); + } + retry: spin_lock(&sb_lock); if (test) { diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 75461777c466..0b48cbab8a3d 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -82,6 +82,7 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /** * ubifs_compress - compress data. + * @c: UBIFS file-system description object * @in_buf: data to compress * @in_len: length of the data to compress * @out_buf: output buffer where compressed data should be stored @@ -140,6 +141,7 @@ no_compr: /** * ubifs_decompress - decompress data. + * @c: UBIFS file-system description object * @in_buf: data to decompress * @in_len: length of the data to decompress * @out_buf: output buffer where decompressed data should diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ac77ac1fd73e..d91cec93d968 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2827,9 +2827,9 @@ void dbg_debugfs_init_fs(struct ubifs_info *c) const char *fname; struct ubifs_debug_info *d = c->dbg; - n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ return; } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index ed966108da80..d425861e6b82 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -19,10 +19,11 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, /* * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + * + 1 for "_" and 2 for UBI device numbers and 3 for volume number and 1 for + * the trailing zero byte. */ #define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2 + 3 + 1) /** * ubifs_debug_info - per-FS debugging information. @@ -103,7 +104,7 @@ struct ubifs_debug_info { unsigned int chk_fs:1; unsigned int tst_rcvry:1; - char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_dump_lprops; struct dentry *dfs_dump_budg; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index eac0fef801f1..c77ea57fe696 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -71,8 +71,13 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and - * initializes it. Returns new inode in case of success and an error code in - * case of failure. + * initializes it. Non-xattr new inode may be written with xattrs(selinux/ + * encryption) before writing dentry, which could cause inconsistent problem + * when powercut happens between two operations. To deal with it, non-xattr + * new inode is initialized with zero-nlink and added into orphan list, caller + * should make sure that inode is relinked later, and make sure that orphan + * removing and journal writing into an committing atomic operation. Returns + * new inode in case of success and an error code in case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr) @@ -163,9 +168,25 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + if (!is_xattr) { + set_nlink(inode, 0); + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + ubifs_err(c, "ubifs_add_orphan failed: %i", err); + goto out_iput; + } + down_read(&c->commit_sem); + ui->del_cmtno = c->cmt_no; + up_read(&c->commit_sem); + } + if (encrypted) { err = fscrypt_set_context(inode, NULL); if (err) { + if (!is_xattr) { + set_nlink(inode, 1); + ubifs_delete_orphan(c, inode->i_ino); + } ubifs_err(c, "fscrypt_set_context failed: %i", err); goto out_iput; } @@ -320,12 +341,13 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -340,8 +362,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -386,7 +408,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) return inode; out_inode: - make_bad_inode(inode); iput(inode); out_free: ubifs_err(c, "cannot create whiteout file, error %d", err); @@ -470,6 +491,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); d_tmpfile(file, inode); @@ -479,7 +501,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&ui->ui_mutex); lock_2_inodes(dir, inode); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -492,7 +514,6 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, out_cancel: unlock_2_inodes(dir, inode); out_inode: - make_bad_inode(inode); if (!instantiated) iput(inode); out_budg: @@ -760,10 +781,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); - /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */ - if (inode->i_nlink == 0) - ubifs_delete_orphan(c, inode->i_ino); - inc_nlink(inode); ihold(inode); inode_set_ctime_current(inode); @@ -771,7 +788,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, inode->i_nlink == 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -785,8 +802,6 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); - if (inode->i_nlink == 0) - ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -846,7 +861,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -950,7 +965,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -1017,6 +1032,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); insert_inode_hash(inode); inc_nlink(inode); @@ -1025,7 +1041,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -1042,8 +1058,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; drop_nlink(dir); mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1102,22 +1118,25 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) { + kfree(dev); + goto out_inode; + } + init_special_inode(inode, inode->i_mode, rdev); inode->i_size = ubifs_inode(inode)->ui_size = devlen; ui = ubifs_inode(inode); ui->data = dev; ui->data_len = devlen; - - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1132,10 +1151,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - /* Free inode->i_link before inode is marked as bad. */ - fscrypt_free_inode(inode); - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1186,6 +1203,10 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + ui = ubifs_inode(inode); ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { @@ -1210,17 +1231,14 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, */ ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; - - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1234,10 +1252,10 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: /* Free inode->i_link before inode is marked as bad. */ fscrypt_free_inode(inode); - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1405,14 +1423,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { - /* - * Whiteout inode can not be written on flash by - * ubifs_jnl_write_inode(), because it's neither - * dirty nor zero-nlink. - */ iput(whiteout); goto out_release; } + set_nlink(whiteout, 1); /* Add the old_dentry size to the old_dir size. */ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); @@ -1491,7 +1505,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, - new_inode, &new_nm, whiteout, sync); + new_inode, &new_nm, whiteout, sync, !!whiteout); if (err) goto out_cancel; @@ -1544,6 +1558,7 @@ out_cancel: unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { ubifs_release_budget(c, &wht_req); + set_nlink(whiteout, 0); iput(whiteout); } out_release: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a1f46919934c..68e104423a48 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1027,7 +1027,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, /* Is the folio fully inside i_size? */ if (folio_pos(folio) + len <= i_size) { - if (folio_pos(folio) >= synced_i_size) { + if (folio_pos(folio) + len > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 6ebf3c04ac5f..643718906b9f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -73,7 +73,7 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -340,7 +340,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -581,7 +581,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -773,7 +773,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 74aee92433d7..4a35f9e8f668 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -359,7 +359,7 @@ static void wake_up_reservation(struct ubifs_info *c) } /** - * wake_up_reservation - add current task in queue or start queuing. + * add_or_start_queue - add current task in queue or start queuing. * @c: UBIFS file-system description object * * This function starts queuing if queuing is not started, otherwise adds @@ -643,6 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) * @inode: inode to update * @deletion: indicates a directory entry deletion i.e unlink or rmdir * @xent: non-zero if the directory entry is an extended attribute entry + * @in_orphan: indicates whether the @inode is in orphan list * * This function updates an inode by writing a directory entry (or extended * attribute entry), the inode itself, and the parent directory inode (or the @@ -664,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent) + int deletion, int xent, int in_orphan) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); @@ -750,7 +751,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_release; - if (last_reference) { + if (last_reference && !in_orphan) { err = ubifs_add_orphan(c, inode->i_ino); if (err) { release_head(c, BASEHD); @@ -806,6 +807,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_ro; + if (in_orphan && inode->i_nlink) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); spin_lock(&ui->ui_lock); ui->synced_i_size = ui->ui_size; @@ -1336,6 +1340,7 @@ out_free: * @new_nm: new name of the new directory entry * @whiteout: whiteout inode * @sync: non-zero if the write-buffer has to be synchronized + * @delete_orphan: indicates an orphan entry deletion for @whiteout * * This function implements the re-name operation which may involve writing up * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) @@ -1348,7 +1353,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync) + const struct inode *whiteout, int sync, int delete_orphan) { void *p; union ubifs_key key; @@ -1565,6 +1570,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, goto out_ro; } + if (delete_orphan) + ubifs_delete_orphan(c, whiteout->i_ino); + finish_reservation(c); if (new_inode) { mark_inode_clean(c, new_ui); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index a11c3dab7e16..8788740ec57f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1005,7 +1005,7 @@ out: * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @lst: lprops statistics to update + * @arg: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 778a22bf9a92..441d0beca4cf 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1918,6 +1918,7 @@ out_err: * @pnode: where to keep a pnode * @cnode: where to keep a cnode * @in_tree: is the node in the tree in memory + * @ptr: union of node pointers * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in * the tree * @ptr.pnode: ditto for pnode diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 7adc37c10b6a..a148760fa49e 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -67,10 +67,13 @@ static int mst_node_check_hash(const struct ubifs_info *c, { u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; + int ret; - crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + ret = crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + if (ret) + return ret; if (ubifs_check_hash(c, expected, calc)) return -EPERM; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 4909321d84cf..fb957d963ba6 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -42,24 +42,30 @@ static int dbg_check_orphans(struct ubifs_info *c); -static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, - struct ubifs_orphan *parent_orphan) +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) { struct ubifs_orphan *orphan, *o; struct rb_node **p, *parent = NULL; orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); if (!orphan) - return ERR_PTR(-ENOMEM); + return -ENOMEM; orphan->inum = inum; orphan->new = 1; - INIT_LIST_HEAD(&orphan->child_list); spin_lock(&c->orphan_lock); if (c->tot_orphans >= c->max_orphans) { spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-ENFILE); + return -ENFILE; } p = &c->orph_tree.rb_node; while (*p) { @@ -73,7 +79,7 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, ubifs_err(c, "orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-EINVAL); + return -EINVAL; } } c->tot_orphans += 1; @@ -83,14 +89,9 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, list_add_tail(&orphan->list, &c->orph_list); list_add_tail(&orphan->new_list, &c->orph_new); - if (parent_orphan) { - list_add_tail(&orphan->child_list, - &parent_orphan->child_list); - } - spin_unlock(&c->orphan_lock); dbg_gen("ino %lu", (unsigned long)inum); - return orphan; + return 0; } static struct ubifs_orphan *lookup_orphan(struct ubifs_info *c, ino_t inum) @@ -135,6 +136,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) if (orph->cmt) { orph->del = 1; + rb_erase(&orph->rb, &c->orph_tree); orph->dnext = c->orph_dnext; c->orph_dnext = orph; dbg_gen("delete later ino %lu", (unsigned long)orph->inum); @@ -144,59 +146,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) __orphan_drop(c, orph); } -/** - * ubifs_add_orphan - add an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * Add an orphan. This function is called when an inodes link count drops to - * zero. - */ -int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) -{ - int err = 0; - ino_t xattr_inum; - union ubifs_key key; - struct ubifs_dent_node *xent, *pxent = NULL; - struct fscrypt_name nm = {0}; - struct ubifs_orphan *xattr_orphan; - struct ubifs_orphan *orphan; - - orphan = orphan_add(c, inum, NULL); - if (IS_ERR(orphan)) - return PTR_ERR(orphan); - - lowest_xent_key(c, &key, inum); - while (1) { - xent = ubifs_tnc_next_ent(c, &key, &nm); - if (IS_ERR(xent)) { - err = PTR_ERR(xent); - if (err == -ENOENT) - break; - kfree(pxent); - return err; - } - - fname_name(&nm) = xent->name; - fname_len(&nm) = le16_to_cpu(xent->nlen); - xattr_inum = le64_to_cpu(xent->inum); - - xattr_orphan = orphan_add(c, xattr_inum, orphan); - if (IS_ERR(xattr_orphan)) { - kfree(pxent); - kfree(xent); - return PTR_ERR(xattr_orphan); - } - - kfree(pxent); - pxent = xent; - key_read(c, &xent->key, &key); - } - kfree(pxent); - - return 0; -} - /** * ubifs_delete_orphan - delete an orphan. * @c: UBIFS file-system description object @@ -206,7 +155,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) */ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) { - struct ubifs_orphan *orph, *child_orph, *tmp_o; + struct ubifs_orphan *orph; spin_lock(&c->orphan_lock); @@ -219,11 +168,6 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) return; } - list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) { - list_del(&child_orph->child_list); - orphan_delete(c, child_orph); - } - orphan_delete(c, orph); spin_unlock(&c->orphan_lock); @@ -518,7 +462,6 @@ static void erase_deleted(struct ubifs_info *c) dnext = orphan->dnext; ubifs_assert(c, !orphan->new); ubifs_assert(c, orphan->del); - rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); @@ -570,51 +513,6 @@ int ubifs_clear_orphans(struct ubifs_info *c) return 0; } -/** - * insert_dead_orphan - insert an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * This function is a helper to the 'do_kill_orphans()' function. The orphan - * must be kept until the next commit, so it is added to the rb-tree and the - * deletion list. - */ -static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) -{ - struct ubifs_orphan *orphan, *o; - struct rb_node **p, *parent = NULL; - - orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); - if (!orphan) - return -ENOMEM; - orphan->inum = inum; - - p = &c->orph_tree.rb_node; - while (*p) { - parent = *p; - o = rb_entry(parent, struct ubifs_orphan, rb); - if (inum < o->inum) - p = &(*p)->rb_left; - else if (inum > o->inum) - p = &(*p)->rb_right; - else { - /* Already added - no problem */ - kfree(orphan); - return 0; - } - } - c->tot_orphans += 1; - rb_link_node(&orphan->rb, parent, p); - rb_insert_color(&orphan->rb, &c->orph_tree); - list_add_tail(&orphan->list, &c->orph_list); - orphan->del = 1; - orphan->dnext = c->orph_dnext; - c->orph_dnext = orphan; - dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, - c->new_orphans, c->tot_orphans); - return 0; -} - /** * do_kill_orphans - remove orphan inodes from the index. * @c: UBIFS file-system description object @@ -691,12 +589,12 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { - union ubifs_key key1, key2; + union ubifs_key key; inum = le64_to_cpu(orph->inos[i]); - ino_key_init(c, &key1, inum); - err = ubifs_tnc_lookup(c, &key1, ino); + ino_key_init(c, &key, inum); + err = ubifs_tnc_lookup(c, &key, ino); if (err && err != -ENOENT) goto out_free; @@ -708,17 +606,10 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, dbg_rcvry("deleting orphaned inode %lu", (unsigned long)inum); - lowest_ino_key(c, &key1, inum); - highest_ino_key(c, &key2, inum); - - err = ubifs_tnc_remove_range(c, &key1, &key2); + err = ubifs_tnc_remove_ino(c, inum); if (err) goto out_ro; } - - err = insert_dead_orphan(c, inum); - if (err) - goto out_free; } *last_cmt_no = cmt_no; @@ -925,8 +816,12 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum(c, &zbr->key); if (inum != ci->last_ino) { - /* Lowest node type is the inode node, so it comes first */ - if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + /* + * Lowest node type is the inode node or xattr entry(when + * selinux/encryption is enabled), so it comes first + */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY && + key_type(c, &zbr->key) != UBIFS_XENT_KEY) ubifs_err(c, "found orphan node ino %lu, type %d", (unsigned long)inum, key_type(c, &zbr->key)); ci->last_ino = inum; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 17da28d6247a..a950c5f2560e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -29,6 +29,7 @@ * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @hash: node hash * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number * @list: links the replay list diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 1c958148bb87..aae32222f11b 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -91,17 +91,17 @@ static struct kset ubifs_kset = { int ubifs_sysfs_register(struct ubifs_info *c) { int ret, n; - char dfs_dir_name[UBIFS_DFS_DIR_LEN+1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL); if (!c->stats) { ret = -ENOMEM; goto out_last; } - n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ ret = -EINVAL; goto out_free; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 1f3ea879d93a..d69a5a42d693 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -157,13 +157,6 @@ #define UBIFS_HMAC_ARR_SZ 0 #endif -/* - * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. - */ -#define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) - /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -923,8 +916,6 @@ struct ubifs_budget_req { * @rb: rb-tree node of rb-tree of orphans sorted by inode number * @list: list head of list of orphans in order added * @new_list: list head of list of orphans added since the last commit - * @child_list: list of xattr children if this orphan hosts xattrs, list head - * if this orphan is a xattr, not used otherwise. * @cnext: next orphan to commit * @dnext: next orphan to delete * @inum: inode number @@ -936,7 +927,6 @@ struct ubifs_orphan { struct rb_node rb; struct list_head list; struct list_head new_list; - struct list_head child_list; struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; @@ -1803,7 +1793,7 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent); + int deletion, int xent, int in_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); @@ -1820,7 +1810,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync); + const struct inode *whiteout, int sync, int delete_orphan); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 0847db521984..f734588b224a 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -149,7 +149,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) host_ui->flags |= UBIFS_CRYPT_FL; - err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + err = ubifs_jnl_update(c, host, nm, inode, 0, 1, 0); if (err) goto out_cancel; ubifs_set_inode_flags(host); diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index bc1a7c8b5c8d..77b685db8275 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -3352,6 +3352,7 @@ static void write_file(void) fprintf(file, "};\n"); fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); fprintf(file, "\n"); + fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n"); fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index eb2bbdd688d7..600e15efe9ed 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -14,8 +14,8 @@ #include "utf8n.h" -unsigned int failed_tests; -unsigned int total_tests; +static unsigned int failed_tests; +static unsigned int total_tests; /* Tests will be based on this version. */ #define UTF8_LATEST UNICODE_AGE(12, 1, 0) @@ -307,4 +307,5 @@ module_init(init_test_ucd); module_exit(exit_test_ucd); MODULE_AUTHOR("Gabriel Krisman Bertazi "); +MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped index d9b62901aa96..dafa5fed761d 100644 --- a/fs/unicode/utf8data.c_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -4120,4 +4120,5 @@ struct utf8data_table utf8_data_table = { .utf8data = utf8data, }; EXPORT_SYMBOL_GPL(utf8_data_table); +MODULE_DESCRIPTION("UTF8 data table"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 60b25020281f..0bea1afbd747 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -21,6 +21,21 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +struct cxl_event_media_hdr { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + /* + * The meaning of Validity Flags from bit 2 is + * different across DRAM and General Media records + */ + u8 validity_flags[2]; + u8 channel; + u8 rank; +} __packed; + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_generic { struct cxl_event_record_hdr hdr; @@ -33,14 +48,7 @@ struct cxl_event_generic { */ #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct cxl_event_gen_media { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 device[3]; u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; u8 reserved[46]; @@ -52,14 +60,7 @@ struct cxl_event_gen_media { */ #define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct cxl_event_dram { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 nibble_mask[3]; u8 bank_group; u8 bank; @@ -95,21 +96,13 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; -/* - * General Media or DRAM Event Common Fields - * - provides common access to phys_addr - */ -struct cxl_event_common { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; -} __packed; - union cxl_event { struct cxl_event_generic generic; struct cxl_event_gen_media gen_media; struct cxl_event_dram dram; struct cxl_event_mem_module mem_module; - struct cxl_event_common common; + /* dram & gen_media event header */ + struct cxl_event_media_hdr media_hdr; } __packed; /* diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3bb6198d1523..3315005df117 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -404,7 +404,7 @@ struct io_ring_ctx { spinlock_t napi_lock; /* napi_list lock */ /* napi busy poll default timeout */ - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; bool napi_enabled; @@ -461,7 +461,6 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, - REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, @@ -536,8 +535,6 @@ enum { REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), - /* cancel sequence is set and valid */ - REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), /* file is pollable */ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), /* buffer list was empty after selection of buffer */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h index a7ef65f78933..9c2848abc804 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -45,17 +45,20 @@ #define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y)) -#define __cmp_once(op, x, y, unique_x, unique_y) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ +#define __cmp_once_unique(op, type, x, y, ux, uy) \ + ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); }) + +#define __cmp_once(op, type, x, y) \ + __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) + +#define __careful_cmp_once(op, x, y) ({ \ static_assert(__types_ok(x, y), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp(op, unique_x, unique_y); }) + __cmp_once(op, __auto_type, x, y); }) #define __careful_cmp(op, x, y) \ __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), \ - __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y))) + __cmp(op, x, y), __careful_cmp_once(op, x, y)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) @@ -158,7 +161,7 @@ * @x: first value * @y: second value */ -#define min_t(type, x, y) __careful_cmp(min, (type)(x), (type)(y)) +#define min_t(type, x, y) __cmp_once(min, type, x, y) /** * max_t - return maximum of two values, using the specified type @@ -166,7 +169,7 @@ * @x: first value * @y: second value */ -#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y)) +#define max_t(type, x, y) __cmp_once(max, type, x, y) /* * Do not check the array parameter using __must_be_array(). diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c12a329dd463..7b2ae2e43544 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -485,6 +485,9 @@ enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, NVME_ID_NS_NVM_GUARD_MASK = 0x3, + NVME_ID_NS_NVM_QPIF_SHIFT = 9, + NVME_ID_NS_NVM_QPIF_MASK = 0xf, + NVME_ID_NS_NVM_QPIFS = 1 << 3, }; static inline __u8 nvme_elbaf_sts(__u32 elbaf) @@ -497,6 +500,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf) return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK; } +static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf) +{ + return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK; +} + struct nvme_id_ctrl_nvm { __u8 vsl; __u8 wzsl; @@ -576,6 +584,7 @@ enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, NVME_NVM_NS_64B_GUARD = 2, + NVME_NVM_NS_QTYPE_GUARD = 3, }; static inline __u8 nvme_lbaf_index(__u8 flbas) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..d8e4105a2f21 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -#ifdef INLINE_COPY_FROM_USER +/* + * Architectures that #define INLINE_COPY_TO_USER use this function + * directly in the normal copy_to/from_user(), the other ones go + * through an extern _copy_to/from_user(), which expands the same code + * here. + * + * Rust code always uses the extern definition. + */ static inline __must_check unsigned long -_copy_from_user(void *to, const void __user *from, unsigned long n) +_inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { + /* + * Ensure that bad access_ok() speculation will not + * lead to nasty side effects *after* the copy is + * finished: + */ + barrier_nospec(); instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); @@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) memset(to + (n - res), 0, res); return res; } -#else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); -#endif -#ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long -_copy_to_user(void __user *to, const void *from, unsigned long n) +_inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) @@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } -#else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); -#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (check_copy_size(to, n, false)) - n = _copy_from_user(to, from, n); - return n; + if (!check_copy_size(to, n, false)) + return n; +#ifdef INLINE_COPY_FROM_USER + return _inline_copy_from_user(to, from, n); +#else + return _copy_from_user(to, from, n); +#endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (check_copy_size(from, n, true)) - n = _copy_to_user(to, from, n); - return n; + if (!check_copy_size(from, n, true)) + return n; + +#ifdef INLINE_COPY_TO_USER + return _inline_copy_to_user(to, from, n); +#else + return _copy_to_user(to, from, n); +#endif } #ifndef copy_mc_to_kernel diff --git a/init/Kconfig b/init/Kconfig index f5f3232fc39c..76c67620dc89 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1924,7 +1924,10 @@ config RUSTC_VERSION_TEXT config BINDGEN_VERSION_TEXT string depends on RUST - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) + # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 + # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when + # the minimum version is upgraded past that (0.69.1 already fixed the issue). + default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n) # # Place an empty function call at each tracepoint site. Can be diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8e6faa942a6f..3942db160f18 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1849,7 +1849,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } @@ -2416,12 +2416,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (uts) { struct timespec64 ts; + ktime_t dt; if (get_timespec64(&ts, uts)) return -EFAULT; - iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - io_napi_adjust_timeout(ctx, &iowq, &ts); + dt = timespec64_to_ktime(ts); + iowq.timeout = ktime_add(dt, ktime_get()); + io_napi_adjust_timeout(ctx, &iowq, dt); } if (sig) { @@ -3031,8 +3033,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e1ce908f0679..c2acf6180845 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -43,7 +43,7 @@ struct io_wait_queue { ktime_t timeout; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 29fa9285a33d..7fd9badcfaf8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -110,10 +110,10 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) if (spin_trylock(&ctx->msg_lock)) { req = io_alloc_cache_get(&ctx->msg_cache); spin_unlock(&ctx->msg_lock); + if (req) + return req; } - if (req) - return req; - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } static int io_msg_data_remote(struct io_kiocb *req) diff --git a/io_uring/napi.c b/io_uring/napi.c index 762254a7ff3f..4fd6bb331e1e 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -33,6 +33,12 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, return NULL; } +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) { struct hlist_head *hash_list; @@ -102,14 +108,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) __io_napi_remove_stale(ctx); } -static inline bool io_napi_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) { - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); - return time_after(now, end_time); + return ktime_after(now, end_time); } return true; @@ -124,7 +130,8 @@ static bool io_napi_busy_loop_should_end(void *data, return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; - if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) return true; return false; @@ -181,10 +188,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, */ void io_napi_init(struct io_ring_ctx *ctx) { + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; - ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); } /* @@ -217,11 +226,13 @@ void io_napi_free(struct io_ring_ctx *ctx) int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) @@ -230,7 +241,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); WRITE_ONCE(ctx->napi_enabled, true); return 0; @@ -247,14 +258,14 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); WRITE_ONCE(ctx->napi_enabled, false); return 0; @@ -271,25 +282,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) * the NAPI timeout accordingly. */ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { - unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); - if (ts) { - struct timespec64 poll_to_ts; + if (to_wait) + poll_dt = min(poll_dt, to_wait); - poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); - if (timespec64_compare(ts, &poll_to_ts) < 0) { - s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) { - u64 val = poll_to_ns + 999; - do_div(val, 1000); - poll_to = val; - } - } - } - - iowq->napi_busy_poll_to = poll_to; + iowq->napi_busy_poll_dt = poll_dt; } /* @@ -318,7 +318,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) LIST_HEAD(napi_list); bool is_stale = false; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; diff --git a/io_uring/napi.h b/io_uring/napi.h index 6fc0393d0dbe..88f1c21d5548 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -18,7 +18,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, struct timespec64 *ts); + struct io_wait_queue *iowq, ktime_t to_wait); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -29,11 +29,11 @@ static inline bool io_napi(struct io_ring_ctx *ctx) static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { if (!io_napi(ctx)) return; - __io_napi_adjust_timeout(ctx, iowq, ts); + __io_napi_adjust_timeout(ctx, iowq, to_wait); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, @@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return; sock = sock_from_file(req->file); @@ -88,7 +88,7 @@ static inline void io_napi_add(struct io_kiocb *req) } static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 1c9bf07499b1..9973876d91b0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -639,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a54163a83968..8391c7c7c1ec 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return ret < 0 ? ret : IOU_OK; + return IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, diff --git a/lib/usercopy.c b/lib/usercopy.c index 499a7a7d54db..7b17b83c8042 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,40 +12,18 @@ /* out-of-line parts */ -#ifndef INLINE_COPY_FROM_USER +#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned long res = n; - might_fault(); - if (!should_fail_usercopy() && likely(access_ok(from, n))) { - /* - * Ensure that bad access_ok() speculation will not - * lead to nasty side effects *after* the copy is - * finished: - */ - barrier_nospec(); - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; + return _inline_copy_from_user(to, from, n); } EXPORT_SYMBOL(_copy_from_user); #endif -#ifndef INLINE_COPY_TO_USER +#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { - might_fault(); - if (should_fail_usercopy()) - return n; - if (likely(access_ok(to, n))) { - instrument_copy_to_user(to, from, n); - n = raw_copy_to_user(to, from, n); - } - return n; + return _inline_copy_to_user(to, from, n); } EXPORT_SYMBOL(_copy_to_user); #endif diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6c4664c681ca..40053a02bae1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -44,7 +44,7 @@ #include #include -#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) +#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6d1d9221649d..752327b10dde 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,7 +27,7 @@ #include #define MAX4(a, b, c, d) \ - max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) + MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) diff --git a/rust/Makefile b/rust/Makefile index 83f675adbfab..1f10f92737f2 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -44,17 +44,10 @@ rustc_sysroot := $(shell MAKEFLAGS= $(RUSTC) $(rust_flags) --print sysroot) rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2) RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library -ifeq ($(quiet),silent_) -cargo_quiet=-q +ifneq ($(quiet),) rust_test_quiet=-q rustdoc_test_quiet=--test-args -q rustdoc_test_kernel_quiet=>/dev/null -else ifeq ($(quiet),quiet_) -rust_test_quiet=-q -rustdoc_test_quiet=--test-args -q -rustdoc_test_kernel_quiet=>/dev/null -else -cargo_quiet=--verbose endif core-cfgs = \ @@ -135,22 +128,21 @@ quiet_cmd_rustc_test_library = RUSTC TL $< @$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \ --crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \ --out-dir $(objtree)/$(obj)/test --cfg testlib \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $< -rusttestlib-build_error: $(src)/build_error.rs rusttest-prepare FORCE +rusttestlib-build_error: $(src)/build_error.rs FORCE +$(call if_changed,rustc_test_library) rusttestlib-macros: private rustc_target_flags = --extern proc_macro rusttestlib-macros: private rustc_test_library_proc = yes -rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttestlib-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-bindings: $(src)/bindings/lib.rs rusttest-prepare FORCE +rusttestlib-bindings: $(src)/bindings/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-uapi: $(src)/uapi/lib.rs rusttest-prepare FORCE +rusttestlib-uapi: $(src)/uapi/lib.rs FORCE +$(call if_changed,rustc_test_library) quiet_cmd_rustdoc_test = RUSTDOC T $< @@ -159,7 +151,7 @@ quiet_cmd_rustdoc_test = RUSTDOC T $< $(RUSTDOC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) $(rustdoc_test_target_flags) \ - --sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \ + $(rustdoc_test_quiet) \ -L$(objtree)/$(obj)/test --output $(rustdoc_output) \ --crate-name $(subst rusttest-,,$@) $< @@ -192,7 +184,6 @@ quiet_cmd_rustc_test = RUSTC T $< $(RUSTC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$@) $<; \ $(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \ @@ -200,60 +191,15 @@ quiet_cmd_rustc_test = RUSTC T $< rusttest: rusttest-macros rusttest-kernel -# This prepares a custom sysroot with our custom `alloc` instead of -# the standard one. -# -# This requires several hacks: -# - Unlike `core` and `alloc`, `std` depends on more than a dozen crates, -# including third-party crates that need to be downloaded, plus custom -# `build.rs` steps. Thus hardcoding things here is not maintainable. -# - `cargo` knows how to build the standard library, but it is an unstable -# feature so far (`-Zbuild-std`). -# - `cargo` only considers the use case of building the standard library -# to use it in a given package. Thus we need to create a dummy package -# and pick the generated libraries from there. -# - The usual ways of modifying the dependency graph in `cargo` do not seem -# to apply for the `-Zbuild-std` steps, thus we have to mislead it -# by modifying the sources in the sysroot. -# - To avoid messing with the user's Rust installation, we create a clone -# of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std` -# steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag. -# -# In the future, we hope to avoid the whole ordeal by either: -# - Making the `test` crate not depend on `std` (either improving upstream -# or having our own custom crate). -# - Making the tests run in kernel space (requires the previous point). -# - Making `std` and friends be more like a "normal" crate, so that -# `-Zbuild-std` and related hacks are not needed. -quiet_cmd_rustsysroot = RUSTSYSROOT - cmd_rustsysroot = \ - rm -rf $(objtree)/$(obj)/test; \ - mkdir -p $(objtree)/$(obj)/test; \ - cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \ - echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \ - echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \ - >> $(objtree)/$(obj)/test/rustc_sysroot; \ - chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \ - $(CARGO) -q new $(objtree)/$(obj)/test/dummy; \ - RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \ - test -Zbuild-std --target $(rustc_host_target) \ - --manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \ - rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \ - cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \ - $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib - -rusttest-prepare: FORCE - +$(call if_changed,rustsysroot) - rusttest-macros: private rustc_target_flags = --extern proc_macro rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro -rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttest-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test) +$(call if_changed,rustdoc_test) rusttest-kernel: private rustc_target_flags = --extern alloc \ --extern build_error --extern macros --extern bindings --extern uapi -rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \ +rusttest-kernel: $(src)/kernel/lib.rs \ rusttestlib-build_error rusttestlib-macros rusttestlib-bindings \ rusttestlib-uapi FORCE +$(call if_changed,rustc_test) @@ -421,7 +367,7 @@ ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),) endif $(obj)/core.o: private skip_clippy = 1 -$(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private skip_flags = -Wunreachable_pub $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE @@ -435,7 +381,7 @@ $(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE +$(call if_changed_dep,rustc_library) $(obj)/alloc.o: private skip_clippy = 1 -$(obj)/alloc.o: private skip_flags = -Dunreachable_pub +$(obj)/alloc.o: private skip_flags = -Wunreachable_pub $(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs) $(obj)/alloc.o: $(RUST_LIB_SRC)/alloc/src/lib.rs $(obj)/compiler_builtins.o FORCE +$(call if_changed_dep,rustc_library) diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 53c996e4bedf..b940a5777330 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -30,4 +30,5 @@ const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT; const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO; +const gfp_t RUST_CONST_HELPER___GFP_HIGHMEM = ___GFP_HIGHMEM; const blk_features_t RUST_CONST_HELPER_BLK_FEAT_ROTATIONAL = BLK_FEAT_ROTATIONAL; diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 40ddaee50d8b..93a1a3fc97bc 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -24,6 +24,7 @@ unsafe_op_in_unsafe_fn )] +#[allow(dead_code)] mod bindings_raw { // Use glob import here to expose all helpers. // Symbols defined within the module will take precedence to the glob import. diff --git a/rust/helpers.c b/rust/helpers.c index 87ed0a5b6099..92d3c03ae1bd 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -40,6 +42,20 @@ __noreturn void rust_helper_BUG(void) } EXPORT_SYMBOL_GPL(rust_helper_BUG); +unsigned long rust_helper_copy_from_user(void *to, const void __user *from, + unsigned long n) +{ + return copy_from_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_from_user); + +unsigned long rust_helper_copy_to_user(void __user *to, const void *from, + unsigned long n) +{ + return copy_to_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_to_user); + void rust_helper_mutex_lock(struct mutex *lock) { mutex_lock(lock); @@ -81,6 +97,24 @@ int rust_helper_signal_pending(struct task_struct *t) } EXPORT_SYMBOL_GPL(rust_helper_signal_pending); +struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL_GPL(rust_helper_alloc_pages); + +void *rust_helper_kmap_local_page(struct page *page) +{ + return kmap_local_page(page); +} +EXPORT_SYMBOL_GPL(rust_helper_kmap_local_page); + +void rust_helper_kunmap_local(const void *addr) +{ + kunmap_local(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_kunmap_local); + refcount_t rust_helper_REFCOUNT_INIT(int n) { return (refcount_t)REFCOUNT_INIT(n); diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 531b5e471cb1..1966bd407017 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -20,6 +20,13 @@ pub struct AllocError; #[derive(Clone, Copy)] pub struct Flags(u32); +impl Flags { + /// Get the raw representation of this flag. + pub(crate) fn as_raw(self) -> u32 { + self.0 + } +} + impl core::ops::BitOr for Flags { type Output = Self; fn bitor(self, rhs: Self) -> Self::Output { @@ -52,6 +59,14 @@ pub mod flags { /// This is normally or'd with other flags. pub const __GFP_ZERO: Flags = Flags(bindings::__GFP_ZERO); + /// Allow the allocation to be in high memory. + /// + /// Allocations in high memory may not be mapped into the kernel's address space, so this can't + /// be used with `kmalloc` and other similar methods. + /// + /// This is normally or'd with other flags. + pub const __GFP_HIGHMEM: Flags = Flags(bindings::__GFP_HIGHMEM); + /// Users can not sleep and need the allocation to succeed. /// /// A lower watermark is applied to allow access to "atomic reserves". The current @@ -66,7 +81,7 @@ pub mod flags { /// The same as [`GFP_KERNEL`], except the allocation is accounted to kmemcg. pub const GFP_KERNEL_ACCOUNT: Flags = Flags(bindings::GFP_KERNEL_ACCOUNT); - /// Ror kernel allocations that should not stall for direct reclaim, start physical IO or + /// For kernel allocations that should not stall for direct reclaim, start physical IO or /// use any filesystem callback. It is very likely to fail to allocate memory, even for very /// small allocations. pub const GFP_NOWAIT: Flags = Flags(bindings::GFP_NOWAIT); diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 68605b633e73..495c09ebe3a3 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -843,11 +843,8 @@ where let val = unsafe { &mut *slot }; // SAFETY: `slot` is considered pinned. let val = unsafe { Pin::new_unchecked(val) }; - (self.1)(val).map_err(|e| { - // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + // SAFETY: `slot` was initialized above. + (self.1)(val).inspect_err(|_| unsafe { core::ptr::drop_in_place(slot) }) } } @@ -941,11 +938,9 @@ where // SAFETY: All requirements fulfilled since this function is `__init`. unsafe { self.0.__pinned_init(slot)? }; // SAFETY: The above call initialized `slot` and we still have unique access. - (self.1)(unsafe { &mut *slot }).map_err(|e| { + (self.1)(unsafe { &mut *slot }).inspect_err(|_| // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + unsafe { core::ptr::drop_in_place(slot) }) } } diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index e6b7d3a80bbc..274bdc1b0a82 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -40,6 +40,7 @@ pub mod ioctl; pub mod kunit; #[cfg(CONFIG_NET)] pub mod net; +pub mod page; pub mod prelude; pub mod print; mod static_assert; @@ -50,6 +51,7 @@ pub mod sync; pub mod task; pub mod time; pub mod types; +pub mod uaccess; pub mod workqueue; #[doc(hidden)] diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs new file mode 100644 index 000000000000..208a006d587c --- /dev/null +++ b/rust/kernel/page.rs @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel page allocation and management. + +use crate::{ + alloc::{AllocError, Flags}, + bindings, + error::code::*, + error::Result, + uaccess::UserSliceReader, +}; +use core::ptr::{self, NonNull}; + +/// A bitwise shift for the page size. +pub const PAGE_SHIFT: usize = bindings::PAGE_SHIFT as usize; + +/// The number of bytes in a page. +pub const PAGE_SIZE: usize = bindings::PAGE_SIZE; + +/// A bitmask that gives the page containing a given address. +pub const PAGE_MASK: usize = !(PAGE_SIZE - 1); + +/// A pointer to a page that owns the page allocation. +/// +/// # Invariants +/// +/// The pointer is valid, and has ownership over the page. +pub struct Page { + page: NonNull, +} + +// SAFETY: Pages have no logic that relies on them staying on a given thread, so moving them across +// threads is safe. +unsafe impl Send for Page {} + +// SAFETY: Pages have no logic that relies on them not being accessed concurrently, so accessing +// them concurrently is safe. +unsafe impl Sync for Page {} + +impl Page { + /// Allocates a new page. + /// + /// # Examples + /// + /// Allocate memory for a page. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL)?; + /// # Ok(()) } + /// ``` + /// + /// Allocate memory for a page and zero its contents. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL | __GFP_ZERO)?; + /// # Ok(()) } + /// ``` + pub fn alloc_page(flags: Flags) -> Result { + // SAFETY: Depending on the value of `gfp_flags`, this call may sleep. Other than that, it + // is always safe to call this method. + let page = unsafe { bindings::alloc_pages(flags.as_raw(), 0) }; + let page = NonNull::new(page).ok_or(AllocError)?; + // INVARIANT: We just successfully allocated a page, so we now have ownership of the newly + // allocated page. We transfer that ownership to the new `Page` object. + Ok(Self { page }) + } + + /// Returns a raw pointer to the page. + pub fn as_ptr(&self) -> *mut bindings::page { + self.page.as_ptr() + } + + /// Runs a piece of code with this page mapped to an address. + /// + /// The page is unmapped when this call returns. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `PAGE_SIZE` bytes and for the duration in which the closure is called. The pointer might + /// only be mapped on the current thread, and when that is the case, dereferencing it on other + /// threads is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't + /// cause data races, the memory may be uninitialized, and so on. + /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_page_mapped(&self, f: impl FnOnce(*mut u8) -> T) -> T { + // SAFETY: `page` is valid due to the type invariants on `Page`. + let mapped_addr = unsafe { bindings::kmap_local_page(self.as_ptr()) }; + + let res = f(mapped_addr.cast()); + + // This unmaps the page mapped above. + // + // SAFETY: Since this API takes the user code as a closure, it can only be used in a manner + // where the pages are unmapped in reverse order. This is as required by `kunmap_local`. + // + // In other words, if this call to `kunmap_local` happens when a different page should be + // unmapped first, then there must necessarily be a call to `kmap_local_page` other than the + // call just above in `with_page_mapped` that made that possible. In this case, it is the + // unsafe block that wraps that other call that is incorrect. + unsafe { bindings::kunmap_local(mapped_addr) }; + + res + } + + /// Runs a piece of code with a raw pointer to a slice of this page, with bounds checking. + /// + /// If `f` is called, then it will be called with a pointer that points at `off` bytes into the + /// page, and the pointer will be valid for at least `len` bytes. The pointer is only valid on + /// this task, as this method uses a local mapping. + /// + /// If `off` and `len` refers to a region outside of this page, then this method returns + /// [`EINVAL`] and does not call `f`. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `len` bytes and for the duration in which the closure is called. The pointer might only be + /// mapped on the current thread, and when that is the case, dereferencing it on other threads + /// is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't cause + /// data races, the memory may be uninitialized, and so on. + /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_pointer_into_page( + &self, + off: usize, + len: usize, + f: impl FnOnce(*mut u8) -> Result, + ) -> Result { + let bounds_ok = off <= PAGE_SIZE && len <= PAGE_SIZE && (off + len) <= PAGE_SIZE; + + if bounds_ok { + self.with_page_mapped(move |page_addr| { + // SAFETY: The `off` integer is at most `PAGE_SIZE`, so this pointer offset will + // result in a pointer that is in bounds or one off the end of the page. + f(unsafe { page_addr.add(off) }) + }) + } else { + Err(EINVAL) + } + } + + /// Maps the page and reads from it into the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `dst` is valid for writing `len` bytes. + /// * Callers must ensure that this call does not race with a write to the same page that + /// overlaps with this read. + pub unsafe fn read_raw(&self, dst: *mut u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |src| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then + // it has performed a bounds check and guarantees that `src` is + // valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and writes into it from the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `src` is valid for reading `len` bytes. + /// * Callers must ensure that this call does not race with a read or write to the same page + /// that overlaps with this write. + pub unsafe fn write_raw(&self, src: *const u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and zeroes the given slice. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn fill_zero_raw(&self, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::write_bytes(dst, 0u8, len) }; + Ok(()) + }) + } + + /// Copies data from userspace into this page. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// Like the other `UserSliceReader` methods, data races are allowed on the userspace address. + /// However, they are not allowed on the page you are copying into. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn copy_from_user_slice_raw( + &self, + reader: &mut UserSliceReader, + offset: usize, + len: usize, + ) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. Furthermore, we have + // exclusive access to the slice since the caller guarantees that there are no races. + reader.read_raw(unsafe { core::slice::from_raw_parts_mut(dst.cast(), len) }) + }) + } +} + +impl Drop for Page { + fn drop(&mut self) { + // SAFETY: By the type invariants, we have ownership of the page and can free it. + unsafe { bindings::__free_pages(self.page.as_ptr(), 0) }; + } +} diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 2e7c9008621f..bd189d646adb 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -409,3 +409,67 @@ pub enum Either { /// Constructs an instance of [`Either`] containing a value of type `R`. Right(R), } + +/// Types for which any bit pattern is valid. +/// +/// Not all types are valid for all values. For example, a `bool` must be either zero or one, so +/// reading arbitrary bytes into something that contains a `bool` is not okay. +/// +/// It's okay for the type to have padding, as initializing those bytes has no effect. +/// +/// # Safety +/// +/// All bit-patterns must be valid for this type. This type must not have interior mutability. +pub unsafe trait FromBytes {} + +// SAFETY: All bit patterns are acceptable values of the types below. +unsafe impl FromBytes for u8 {} +unsafe impl FromBytes for u16 {} +unsafe impl FromBytes for u32 {} +unsafe impl FromBytes for u64 {} +unsafe impl FromBytes for usize {} +unsafe impl FromBytes for i8 {} +unsafe impl FromBytes for i16 {} +unsafe impl FromBytes for i32 {} +unsafe impl FromBytes for i64 {} +unsafe impl FromBytes for isize {} +// SAFETY: If all bit patterns are acceptable for individual values in an array, then all bit +// patterns are also acceptable for arrays of that type. +unsafe impl FromBytes for [T] {} +unsafe impl FromBytes for [T; N] {} + +/// Types that can be viewed as an immutable slice of initialized bytes. +/// +/// If a struct implements this trait, then it is okay to copy it byte-for-byte to userspace. This +/// means that it should not have any padding, as padding bytes are uninitialized. Reading +/// uninitialized memory is not just undefined behavior, it may even lead to leaking sensitive +/// information on the stack to userspace. +/// +/// The struct should also not hold kernel pointers, as kernel pointer addresses are also considered +/// sensitive. However, leaking kernel pointers is not considered undefined behavior by Rust, so +/// this is a correctness requirement, but not a safety requirement. +/// +/// # Safety +/// +/// Values of this type may not contain any uninitialized bytes. This type must not have interior +/// mutability. +pub unsafe trait AsBytes {} + +// SAFETY: Instances of the following types have no uninitialized portions. +unsafe impl AsBytes for u8 {} +unsafe impl AsBytes for u16 {} +unsafe impl AsBytes for u32 {} +unsafe impl AsBytes for u64 {} +unsafe impl AsBytes for usize {} +unsafe impl AsBytes for i8 {} +unsafe impl AsBytes for i16 {} +unsafe impl AsBytes for i32 {} +unsafe impl AsBytes for i64 {} +unsafe impl AsBytes for isize {} +unsafe impl AsBytes for bool {} +unsafe impl AsBytes for char {} +unsafe impl AsBytes for str {} +// SAFETY: If individual values in an array have no uninitialized portions, then the array itself +// does not have any uninitialized portions either. +unsafe impl AsBytes for [T] {} +unsafe impl AsBytes for [T; N] {} diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs new file mode 100644 index 000000000000..e9347cff99ab --- /dev/null +++ b/rust/kernel/uaccess.rs @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Slices to user space memory regions. +//! +//! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h) + +use crate::{ + alloc::Flags, + bindings, + error::Result, + prelude::*, + types::{AsBytes, FromBytes}, +}; +use alloc::vec::Vec; +use core::ffi::{c_ulong, c_void}; +use core::mem::{size_of, MaybeUninit}; + +/// The type used for userspace addresses. +pub type UserPtr = usize; + +/// A pointer to an area in userspace memory, which can be either read-only or read-write. +/// +/// All methods on this struct are safe: attempting to read or write on bad addresses (either out of +/// the bound of the slice or unmapped addresses) will return [`EFAULT`]. Concurrent access, +/// *including data races to/from userspace memory*, is permitted, because fundamentally another +/// userspace thread/process could always be modifying memory at the same time (in the same way that +/// userspace Rust's [`std::io`] permits data races with the contents of files on disk). In the +/// presence of a race, the exact byte values read/written are unspecified but the operation is +/// well-defined. Kernelspace code should validate its copy of data after completing a read, and not +/// expect that multiple reads of the same address will return the same value. +/// +/// These APIs are designed to make it difficult to accidentally write TOCTOU (time-of-check to +/// time-of-use) bugs. Every time a memory location is read, the reader's position is advanced by +/// the read length and the next read will start from there. This helps prevent accidentally reading +/// the same location twice and causing a TOCTOU bug. +/// +/// Creating a [`UserSliceReader`] and/or [`UserSliceWriter`] consumes the `UserSlice`, helping +/// ensure that there aren't multiple readers or writers to the same location. +/// +/// If double-fetching a memory location is necessary for some reason, then that is done by creating +/// multiple readers to the same memory location, e.g. using [`clone_reader`]. +/// +/// # Examples +/// +/// Takes a region of userspace memory from the current process, and modify it by adding one to +/// every byte in the region. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::Result; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// fn bytes_add_one(uptr: UserPtr, len: usize) -> Result<()> { +/// let (read, mut write) = UserSlice::new(uptr, len).reader_writer(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// for b in &mut buf { +/// *b = b.wrapping_add(1); +/// } +/// +/// write.write_slice(&buf)?; +/// Ok(()) +/// } +/// ``` +/// +/// Example illustrating a TOCTOU (time-of-check to time-of-use) bug. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::{code::EINVAL, Result}; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// /// Returns whether the data in this region is valid. +/// fn is_valid(uptr: UserPtr, len: usize) -> Result { +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// todo!() +/// } +/// +/// /// Returns the bytes behind this user pointer if they are valid. +/// fn get_bytes_if_valid(uptr: UserPtr, len: usize) -> Result> { +/// if !is_valid(uptr, len)? { +/// return Err(EINVAL); +/// } +/// +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// // THIS IS A BUG! The bytes could have changed since we checked them. +/// // +/// // To avoid this kind of bug, don't call `UserSlice::new` multiple +/// // times with the same address. +/// Ok(buf) +/// } +/// ``` +/// +/// [`std::io`]: https://doc.rust-lang.org/std/io/index.html +/// [`clone_reader`]: UserSliceReader::clone_reader +pub struct UserSlice { + ptr: UserPtr, + length: usize, +} + +impl UserSlice { + /// Constructs a user slice from a raw pointer and a length in bytes. + /// + /// Constructing a [`UserSlice`] performs no checks on the provided address and length, it can + /// safely be constructed inside a kernel thread with no current userspace process. Reads and + /// writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map + /// of the current process and enforce that the address range is within the user range (no + /// additional calls to `access_ok` are needed). Validity of the pointer is checked when you + /// attempt to read or write, not in the call to `UserSlice::new`. + /// + /// Callers must be careful to avoid time-of-check-time-of-use (TOCTOU) issues. The simplest way + /// is to create a single instance of [`UserSlice`] per user memory block as it reads each byte + /// at most once. + pub fn new(ptr: UserPtr, length: usize) -> Self { + UserSlice { ptr, length } + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(self, buf: &mut Vec, flags: Flags) -> Result { + self.reader().read_all(buf, flags) + } + + /// Constructs a [`UserSliceReader`]. + pub fn reader(self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs a [`UserSliceWriter`]. + pub fn writer(self) -> UserSliceWriter { + UserSliceWriter { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs both a [`UserSliceReader`] and a [`UserSliceWriter`]. + /// + /// Usually when this is used, you will first read the data, and then overwrite it afterwards. + pub fn reader_writer(self) -> (UserSliceReader, UserSliceWriter) { + ( + UserSliceReader { + ptr: self.ptr, + length: self.length, + }, + UserSliceWriter { + ptr: self.ptr, + length: self.length, + }, + ) + } +} + +/// A reader for [`UserSlice`]. +/// +/// Used to incrementally read from the user slice. +pub struct UserSliceReader { + ptr: UserPtr, + length: usize, +} + +impl UserSliceReader { + /// Skip the provided number of bytes. + /// + /// Returns an error if skipping more than the length of the buffer. + pub fn skip(&mut self, num_skip: usize) -> Result { + // Update `self.length` first since that's the fallible part of this operation. + self.length = self.length.checked_sub(num_skip).ok_or(EFAULT)?; + self.ptr = self.ptr.wrapping_add(num_skip); + Ok(()) + } + + /// Create a reader that can access the same range of data. + /// + /// Reading from the clone does not advance the current reader. + /// + /// The caller should take care to not introduce TOCTOU issues, as described in the + /// documentation for [`UserSlice`]. + pub fn clone_reader(&self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Returns the number of bytes left to be read from this reader. + /// + /// Note that even reading less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no data is available in the io buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// For a version that uses `&mut [u8]`, please see [`UserSliceReader::read_slice`]. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + /// + /// # Guarantees + /// + /// After a successful call to this method, all bytes in `out` are initialized. + pub fn read_raw(&mut self, out: &mut [MaybeUninit]) -> Result { + let len = out.len(); + let out_ptr = out.as_mut_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `out_ptr` points into a mutable slice of length `len_ulong`, so we may write + // that many bytes to it. + let res = + unsafe { bindings::copy_from_user(out_ptr, self.ptr as *const c_void, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + pub fn read_slice(&mut self, out: &mut [u8]) -> Result { + // SAFETY: The types are compatible and `read_raw` doesn't write uninitialized bytes to + // `out`. + let out = unsafe { &mut *(out as *mut [u8] as *mut [MaybeUninit]) }; + self.read_raw(out) + } + + /// Reads a value of the specified type. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. + pub fn read(&mut self) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + let mut out: MaybeUninit = MaybeUninit::uninit(); + // SAFETY: The local variable `out` is valid for writing `size_of::()` bytes. + // + // By using the _copy_from_user variant, we skip the check_object_size check that verifies + // the kernel pointer. This mirrors the logic on the C side that skips the check when the + // length is a compile-time constant. + let res = unsafe { + bindings::_copy_from_user( + out.as_mut_ptr().cast::(), + self.ptr as *const c_void, + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + // SAFETY: The read above has initialized all bytes in `out`, and since `T` implements + // `FromBytes`, any bit-pattern is a valid value for this type. + Ok(unsafe { out.assume_init() }) + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(mut self, buf: &mut Vec, flags: Flags) -> Result { + let len = self.length; + VecExt::::reserve(buf, len, flags)?; + + // The call to `try_reserve` was successful, so the spare capacity is at least `len` bytes + // long. + self.read_raw(&mut buf.spare_capacity_mut()[..len])?; + + // SAFETY: Since the call to `read_raw` was successful, so the next `len` bytes of the + // vector have been initialized. + unsafe { buf.set_len(buf.len() + len) }; + Ok(()) + } +} + +/// A writer for [`UserSlice`]. +/// +/// Used to incrementally write into the user slice. +pub struct UserSliceWriter { + ptr: UserPtr, + length: usize, +} + +impl UserSliceWriter { + /// Returns the amount of space remaining in this buffer. + /// + /// Note that even writing less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no more data can be written to this buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Writes raw data to this user pointer from a kernel buffer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write_slice(&mut self, data: &[u8]) -> Result { + let len = data.len(); + let data_ptr = data.as_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `data_ptr` points into an immutable slice of length `len_ulong`, so we may read + // that many bytes from it. + let res = unsafe { bindings::copy_to_user(self.ptr as *mut c_void, data_ptr, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } + + /// Writes the provided Rust value to this userspace pointer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write(&mut self, value: &T) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: The reference points to a value of type `T`, so it is valid for reading + // `size_of::()` bytes. + // + // By using the _copy_to_user variant, we skip the check_object_size check that verifies the + // kernel pointer. This mirrors the logic on the C side that skips the check when the length + // is a compile-time constant. + let res = unsafe { + bindings::_copy_to_user( + self.ptr as *mut c_void, + (value as *const T).cast::(), + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } +} diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 1cec63a2aea8..553a5cba2adc 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -482,24 +482,26 @@ pub unsafe trait HasWork { /// use kernel::sync::Arc; /// use kernel::workqueue::{self, impl_has_work, Work}; /// -/// struct MyStruct { -/// work_field: Work, +/// struct MyStruct<'a, T, const N: usize> { +/// work_field: Work, 17>, +/// f: fn(&'a [T; N]), /// } /// /// impl_has_work! { -/// impl HasWork for MyStruct { self.work_field } +/// impl{'a, T, const N: usize} HasWork, 17> +/// for MyStruct<'a, T, N> { self.work_field } /// } /// ``` #[macro_export] macro_rules! impl_has_work { - ($(impl$(<$($implarg:ident),*>)? + ($(impl$({$($generics:tt)*})? HasWork<$work_type:ty $(, $id:tt)?> - for $self:ident $(<$($selfarg:ident),*>)? + for $self:ty { self.$field:ident } )*) => {$( // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right // type. - unsafe impl$(<$($implarg),*>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self $(<$($selfarg),*>)? { + unsafe impl$(<$($generics)+>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self { const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize; #[inline] @@ -515,7 +517,7 @@ macro_rules! impl_has_work { pub use impl_has_work; impl_has_work! { - impl HasWork for ClosureWork { self.work } + impl{T} HasWork for ClosureWork { self.work } } unsafe impl WorkItemPointer for Arc diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 520eae5fd792..159e75292970 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -35,6 +35,7 @@ use proc_macro::TokenStream; /// author: "Rust for Linux Contributors", /// description: "My very own kernel module!", /// license: "GPL", +/// alias: ["alternate_module_name"], /// } /// /// struct MyModule; @@ -55,13 +56,45 @@ use proc_macro::TokenStream; /// } /// ``` /// +/// ## Firmware +/// +/// The following example shows how to declare a kernel module that needs +/// to load binary firmware files. You need to specify the file names of +/// the firmware in the `firmware` field. The information is embedded +/// in the `modinfo` section of the kernel module. For example, a tool to +/// build an initramfs uses this information to put the firmware files into +/// the initramfs image. +/// +/// ```ignore +/// use kernel::prelude::*; +/// +/// module!{ +/// type: MyDeviceDriverModule, +/// name: "my_device_driver_module", +/// author: "Rust for Linux Contributors", +/// description: "My device driver requires firmware", +/// license: "GPL", +/// firmware: ["my_device_firmware1.bin", "my_device_firmware2.bin"], +/// } +/// +/// struct MyDeviceDriverModule; +/// +/// impl kernel::Module for MyDeviceDriverModule { +/// fn init() -> Result { +/// Ok(Self) +/// } +/// } +/// ``` +/// /// # Supported argument types /// - `type`: type which implements the [`Module`] trait (required). -/// - `name`: byte array of the name of the kernel module (required). -/// - `author`: byte array of the author of the kernel module. -/// - `description`: byte array of the description of the kernel module. -/// - `license`: byte array of the license of the kernel module (required). -/// - `alias`: byte array of alias name of the kernel module. +/// - `name`: ASCII string literal of the name of the kernel module (required). +/// - `author`: string literal of the author of the kernel module. +/// - `description`: string literal of the description of the kernel module. +/// - `license`: ASCII string literal of the license of the kernel module (required). +/// - `alias`: array of ASCII string literals of the alias names of the kernel module. +/// - `firmware`: array of ASCII string literals of the firmware files of +/// the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) @@ -312,7 +345,7 @@ pub fn pinned_drop(args: TokenStream, input: TokenStream) -> TokenStream { /// /// Currently supported modifiers are: /// * `span`: change the span of concatenated identifier to the span of the specified token. By -/// default the span of the `[< >]` group is used. +/// default the span of the `[< >]` group is used. /// * `lower`: change the identifier to lower case. /// * `upper`: change the identifier to upper case. /// diff --git a/rust/macros/module.rs b/rust/macros/module.rs index acd0393b5095..411dc103d82e 100644 --- a/rust/macros/module.rs +++ b/rust/macros/module.rs @@ -97,14 +97,22 @@ struct ModuleInfo { author: Option, description: Option, alias: Option>, + firmware: Option>, } impl ModuleInfo { fn parse(it: &mut token_stream::IntoIter) -> Self { let mut info = ModuleInfo::default(); - const EXPECTED_KEYS: &[&str] = - &["type", "name", "author", "description", "license", "alias"]; + const EXPECTED_KEYS: &[&str] = &[ + "type", + "name", + "author", + "description", + "license", + "alias", + "firmware", + ]; const REQUIRED_KEYS: &[&str] = &["type", "name", "license"]; let mut seen_keys = Vec::new(); @@ -131,6 +139,7 @@ impl ModuleInfo { "description" => info.description = Some(expect_string(it)), "license" => info.license = expect_string_ascii(it), "alias" => info.alias = Some(expect_string_array(it)), + "firmware" => info.firmware = Some(expect_string_array(it)), _ => panic!( "Unknown key \"{}\". Valid keys are: {:?}.", key, EXPECTED_KEYS @@ -186,6 +195,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream { modinfo.emit("alias", &alias); } } + if let Some(firmware) = info.firmware { + for fw in firmware { + modinfo.emit("firmware", &fw); + } + } // Built-in modules also export the `file` modinfo string. let file = diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index 0caad902ba40..80a00260e3e7 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -14,6 +14,7 @@ #![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))] #![allow( clippy::all, + dead_code, missing_docs, non_camel_case_types, non_upper_case_globals, diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh index 825c75c5b715..9459ca4f0f11 100755 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ b/scripts/gcc-x86_32-has-stack-protector.sh @@ -5,4 +5,4 @@ # -mstack-protector-guard-reg, added by # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh index 75e4e22b986a..f680bb01aeeb 100755 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -1,4 +1,4 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index 74355ff0e106..ac3e5ac01d8a 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -74,7 +74,7 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA echo "/lib/modules/%{KERNELRELEASE}" for x in alias alias.bin builtin.alias.bin builtin.bin dep dep.bin \ - devname softdep symbols symbols.bin; do + devname softdep symbols symbols.bin weakdep; do echo "%ghost /lib/modules/%{KERNELRELEASE}/modules.${x}" done diff --git a/scripts/package/mkspec b/scripts/package/mkspec index ead54d67a024..4dc1466dfc81 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -50,6 +50,6 @@ fi cat << EOF %changelog -* $(LC_ALL=C; date +'%a %b %d %Y') ${name} <${email}> +* $(LC_ALL=C date +'%a %b %d %Y') ${name} <${email}> - Custom built Linux kernel. EOF diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 117018946b57..5262c56dd674 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -117,20 +117,16 @@ if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then echo >&2 "***" exit 1 fi -if [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_compiler_version" - echo >&2 "*** Expected version: $rust_compiler_min_version" - echo >&2 "***" - warning=1 -fi # Check that the Rust bindings generator is suitable. # # Non-stable and distributions' versions may have a version suffix, e.g. `-dev`. +# +# The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 +# (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when +# the minimum version is upgraded past that (0.69.1 already fixed the issue). rust_bindings_generator_output=$( \ - LC_ALL=C "$BINDGEN" --version 2>/dev/null + LC_ALL=C "$BINDGEN" --version workaround-for-0.69.0 2>/dev/null ) || rust_bindings_generator_code=$? if [ -n "$rust_bindings_generator_code" ]; then echo >&2 "***" @@ -165,13 +161,18 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers echo >&2 "***" exit 1 fi -if [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_bindings_generator_version" - echo >&2 "*** Expected version: $rust_bindings_generator_min_version" - echo >&2 "***" - warning=1 +if [ "$rust_bindings_generator_cversion" -eq 6600 ] || + [ "$rust_bindings_generator_cversion" -eq 6601 ]; then + # Distributions may have patched the issue (e.g. Debian did). + if ! "$BINDGEN" $(dirname $0)/rust_is_available_bindgen_0_66.h >/dev/null; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' versions 0.66.0 and 0.66.1 may not" + echo >&2 "*** work due to a bug (https://github.com/rust-lang/rust-bindgen/pull/2567)," + echo >&2 "*** unless patched (like Debian's)." + echo >&2 "*** Your version: $rust_bindings_generator_version" + echo >&2 "***" + warning=1 + fi fi # Check that the `libclang` used by the Rust bindings generator is suitable. diff --git a/scripts/rust_is_available_bindgen_0_66.h b/scripts/rust_is_available_bindgen_0_66.h new file mode 100644 index 000000000000..c0431293421c --- /dev/null +++ b/scripts/rust_is_available_bindgen_0_66.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define A "\0" diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index 57613fe5ed75..413741037fb3 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -54,18 +54,34 @@ else: """) @classmethod - def generate_bindgen(cls, version_stdout, libclang_stderr): + def generate_bindgen(cls, version_stdout, libclang_stderr, version_0_66_patched=False): + if libclang_stderr is None: + libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})" + else: + libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)" + + if version_0_66_patched: + version_0_66_case = "pass" + else: + version_0_66_case = "raise SystemExit(1)" + return cls.generate_executable(f"""#!/usr/bin/env python3 import sys if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv): - print({repr(libclang_stderr)}, file=sys.stderr) + {libclang_case} +elif "rust_is_available_bindgen_0_66.h" in " ".join(sys.argv): + {version_0_66_case} else: print({repr(version_stdout)}) """) @classmethod - def generate_bindgen_version(cls, stdout): - return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr) + def generate_bindgen_version(cls, stdout, version_0_66_patched=False): + return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr, version_0_66_patched) + + @classmethod + def generate_bindgen_libclang_failure(cls): + return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, None) @classmethod def generate_bindgen_libclang(cls, stderr): @@ -89,6 +105,7 @@ else: cls.rust_default_sysroot = subprocess.check_output(("rustc", "--print", "sysroot")).decode().strip() cls.bindgen_default_bindgen_version_stdout = f"bindgen {cls.bindgen_default_version}" + cls.bindgen_default_bindgen_libclang_failure_exit_code = 42 cls.bindgen_default_bindgen_libclang_stderr = f"scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version {cls.llvm_default_version} [-W#pragma-messages], err: false" cls.default_rustc = cls.generate_rustc(f"rustc {cls.rustc_default_version}") @@ -193,11 +210,6 @@ else: result = self.run_script(self.Expected.FAILURE, { "RUSTC": rustc }) self.assertIn(f"Rust compiler '{rustc}' is too old.", result.stderr) - def test_rustc_new_version(self): - rustc = self.generate_rustc("rustc 1.999.0 (a8314ef7d 2099-06-27)") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "RUSTC": rustc }) - self.assertIn(f"Rust compiler '{rustc}' is too new. This may or may not work.", result.stderr) - def test_bindgen_nonexecutable(self): result = self.run_script(self.Expected.FAILURE, { "BINDGEN": self.nonexecutable }) self.assertIn(f"Running '{self.nonexecutable}' to check the Rust bindings generator version failed with", result.stderr) @@ -226,21 +238,24 @@ else: result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr) - def test_bindgen_new_version(self): - bindgen = self.generate_bindgen_version("bindgen 0.999.0") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) - self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr) + def test_bindgen_bad_version_0_66_0_and_0_66_1(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}") + result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) + self.assertIn(f"Rust bindings generator '{bindgen}' versions 0.66.0 and 0.66.1 may not", result.stderr) + + def test_bindgen_bad_version_0_66_0_and_0_66_1_patched(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}", True) + result = self.run_script(self.Expected.SUCCESS, { "BINDGEN": bindgen }) def test_bindgen_libclang_failure(self): - for env in ( - { "LLVM_CONFIG_PATH": self.missing }, - { "LIBCLANG_PATH": self.missing }, - { "CLANG_PATH": self.missing }, - ): - with self.subTest(env=env): - result = self.run_script(self.Expected.FAILURE, env | { "PATH": os.environ["PATH"], "BINDGEN": "bindgen" }) - self.assertIn("Running 'bindgen' to check the libclang version (used by the Rust", result.stderr) - self.assertIn("bindings generator) failed with code ", result.stderr) + bindgen = self.generate_bindgen_libclang_failure() + result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) + self.assertIn(f"Running '{bindgen}' to check the libclang version (used by the Rust", result.stderr) + self.assertIn(f"bindings generator) failed with code {self.bindgen_default_bindgen_libclang_failure_exit_code}. This may be caused by", result.stderr) def test_bindgen_libclang_unexpected_version(self): bindgen = self.generate_bindgen_libclang("scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version unexpected [-W#pragma-messages], err: false") diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index bcfea073e3f2..01b923d97a44 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1692,6 +1692,10 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) struct aa_profile *p; p = aa_deref_parent(profile); dent = prof_dir(p); + if (!dent) { + error = -ENOENT; + goto fail2; + } /* adding to parent that previously didn't have children */ dent = aafs_create_dir("profiles", dent); if (IS_ERR(dent)) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index c03eb7c19f16..d52a5b14dad4 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -144,19 +144,6 @@ int aa_audit_file(const struct cred *subj_cred, return aa_audit(type, profile, &ad, file_audit_cb); } -/** - * is_deleted - test if a file has been completely unlinked - * @dentry: dentry of file to test for deletion (NOT NULL) - * - * Returns: true if deleted else false - */ -static inline bool is_deleted(struct dentry *dentry) -{ - if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) - return true; - return false; -} - static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index 58fdc72af664..7265d2f81dd5 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -63,6 +63,26 @@ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) return aa_get_newest_label(aa_cred_raw_label(cred)); } +static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, + bool *needput) +{ + struct aa_label *l = aa_cred_raw_label(cred); + + if (unlikely(label_is_stale(l))) { + *needput = true; + return aa_get_newest_label(l); + } + + *needput = false; + return l; +} + +static inline void aa_put_label_condref(struct aa_label *l, bool needput) +{ + if (unlikely(needput)) + aa_put_label(l); +} + /** * aa_current_raw_label - find the current tasks confining label * diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 16568b6d589d..808060f9effb 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -461,6 +461,7 @@ static int apparmor_file_open(struct file *file) struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; + bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; @@ -477,7 +478,7 @@ static int apparmor_file_open(struct file *file) return 0; } - label = aa_get_newest_cred_label(file->f_cred); + label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -494,7 +495,7 @@ static int apparmor_file_open(struct file *file) /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } - aa_put_label(label); + aa_put_label_condref(label, needput); return error; } @@ -1124,7 +1125,7 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern) * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket - * @ptotocol: protocol of the socket + * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: @@ -1304,6 +1305,13 @@ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!skb->secmark) return 0; + /* + * If reach here before socket_post_create hook is called, in which + * case label is null, drop the packet. + */ + if (!ctx->label) + return -EACCES; + return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 49fe8da6fea4..bf8863253e07 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -44,6 +44,8 @@ static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) audit_log_format(ab, ", mand"); if (flags & MS_DIRSYNC) audit_log_format(ab, ", dirsync"); + if (flags & MS_NOSYMFOLLOW) + audit_log_format(ab, ", nosymfollow"); if (flags & MS_NOATIME) audit_log_format(ab, ", noatime"); if (flags & MS_NODIRATIME) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 957654d253dd..14df15e35695 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -225,7 +225,7 @@ static void aa_free_data(void *ptr, void *arg) { struct aa_data *data = ptr; - kfree_sensitive(data->data); + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 5e578ef0ddff..5a570235427d 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -747,34 +747,42 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, *info = "missing required dfa"; goto fail; } - goto out; + } else { + /* + * only unpack the following if a dfa is present + * + * sadly start was given different names for file and policydb + * but since it is optional we can try both + */ + if (!aa_unpack_u32(e, &pdb->start[0], "start")) + /* default start state */ + pdb->start[0] = DFA_START; + if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { + /* default start state for xmatch and file dfa */ + pdb->start[AA_CLASS_FILE] = DFA_START; + } /* setup class index */ + for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { + pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], + i); + } } /* - * only unpack the following if a dfa is present - * - * sadly start was given different names for file and policydb - * but since it is optional we can try both + * Unfortunately due to a bug in earlier userspaces, a + * transition table may be present even when the dfa is + * not. For compatibility reasons unpack and discard. */ - if (!aa_unpack_u32(e, &pdb->start[0], "start")) - /* default start state */ - pdb->start[0] = DFA_START; - if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { - /* default start state for xmatch and file dfa */ - pdb->start[AA_CLASS_FILE] = DFA_START; - } /* setup class index */ - for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { - pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], - i); - } if (!unpack_trans_table(e, &pdb->trans) && required_trans) { *info = "failed to unpack profile transition table"; goto fail; } + if (!pdb->dfa && pdb->trans.table) + aa_free_str_table(&pdb->trans); + /* TODO: move compat mapping here, requires dfa merging first */ /* TODO: move verify here, it has to be done after compat mappings */ -out: + *policy = pdb; return 0; @@ -1071,6 +1079,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (rhashtable_insert_fast(profile->data, &data->head, profile->data->p)) { + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); info = "failed to insert data to table"; diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 5c9bde25e56d..874fcf97794e 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -604,4 +604,5 @@ static struct kunit_suite apparmor_policy_unpack_test_module = { kunit_test_suite(apparmor_policy_unpack_test_module); +MODULE_DESCRIPTION("KUnit tests for AppArmor's policy unpack"); MODULE_LICENSE("GPL"); diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index b1e6817f1e54..3946d5254a1f 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -46,6 +46,7 @@ snapshot: turbostat @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h @echo PWD=. > $(SNAPSHOT)/Makefile @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8d37acd39201..067717bce1d4 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--add attributes\fP add column with counter having specified 'attributes'. The 'location' attribute is required, all others are optional. .nf - location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP} + location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf//\fP} msrDDD is a decimal offset, eg. msr16 msr0xXXX is a hex offset, eg. msr0x10 /sys/path... is an absolute path to a sysfs attribute + is a perf device from /sys/bus/event_source/devices/ eg. cstate_core + is a perf event for given device from /sys/bus/event_source/devices//events/ eg. c1-residency + perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP} sample and print the counter for every cpu, core, or package. @@ -52,6 +55,39 @@ name as necessary to disambiguate it from others is necessary. Note that option as the column header. .fi .PP +\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to --add option above, but require PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE". +.nf + name="name_string" + For column header. + + type={\fBraw\fP} + 'raw' shows the counter contents in hex. + default: raw + + format={\fBraw\fP | \fBdelta\fP} + 'raw' shows the counter contents in hex. + 'delta' shows the difference in values during the measurement interval. + default: raw + + domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP} + 'cpu' per cpu/thread counter. + 'core' per core counter. + 'package' per package counter. + '%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4. + + offset=\fB%u\fP + '%u' offset within the PMT MMIO region. + + lsb=\fB%u\fP + '%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask. + + msb=\fB%u\fP + '%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask. + + guid=\fB%x\fP + '%x' hex identifier of the PMT MMIO region. +.fi +.PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP \fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. @@ -67,10 +103,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP -+\fB--no-msr\fP Disable all the uses of the MSR driver. -+.PP -+\fB--no-perf\fP Disable all the uses of the perf API. -+.PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP +\fB--no-perf\fP Disable all the uses of the perf API. +.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. @@ -320,7 +356,7 @@ available on all processors. Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL), labeling it with the column header, "PRF_CTRL", and display it only once, -afte the conclusion of a 0.1 second sleep. +after the conclusion of a 0.1 second sleep. .nf sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1 0.101604 sec @@ -333,6 +369,56 @@ CPU PRF_CTRL .fi +.SH ADD PERF COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. +We add a counter showing time spent in C1 core cstate, +labeling it with the column header, "pCPU%c1", and display it only once, +after the conclusion of 0.1 second sleep. +We also show CPU%c1 built-in counter that should show similar values. +.nf +sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1 +0.102448 sec +CPU pCPU%c1 CPU%c1 +- 34.89 34.89 +0 45.99 45.99 +1 45.94 45.94 +2 23.83 23.83 +3 23.84 23.84 + +.fi + +.SH ADD PMT COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number 0. +We add two counters, showing crystal clock count and the DC6 residency. +All the parameters passed are based on the metadata found in the PMT XML files. + +For the crystal clock count, we +label it with the column header, "XTAL", +we set the type to 'raw', to read the number of clock ticks in hex, +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +For the DC6 residency counter, we +label it with the column header, "Die%c6", +we set the type to 'txtal_time', to obtain the percent residency value +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +.nf +sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102 +0.104352 sec +CPU XTAL Die%c6 +- 0x0000006d4d957ca7 0.00 +0 0x0000006d4d957ca7 0.00 +0.102448 sec +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9f5d053d4bc6..089220aaa5c9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9,6 +9,30 @@ #define _GNU_SOURCE #include MSRHEADER + +// copied from arch/x86/include/asm/cpu_device_id.h +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) +// end copied section + +#define X86_VENDOR_INTEL 0 + #include INTEL_FAMILY_HEADER #include BUILD_BUG_HEADER #include @@ -20,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -55,15 +80,39 @@ */ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define PERF_NAME_BYTES 128 #define MAX_NOFILE 0x8000 +#define COUNTER_KIND_PERF_PREFIX "perf/" +#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX) +#define PERF_DEV_NAME_BYTES 32 +#define PERF_EVT_NAME_BYTES 32 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; -enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; -enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; -enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; +enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; + +struct perf_counter_info { + struct perf_counter_info *next; + + /* How to open the counter / What counter it is. */ + char device[PERF_DEV_NAME_BYTES]; + char event[PERF_EVT_NAME_BYTES]; + + /* How to show/format the counter. */ + char name[PERF_NAME_BYTES]; + unsigned int width; + enum counter_scope scope; + enum counter_type type; + enum counter_format format; + double scale; + + /* For reading the counter. */ + int *fd_perf_per_domain; + size_t num_domains; +}; struct sysfs_path { char path[PATH_BYTES]; @@ -144,6 +193,7 @@ struct msr_counter bic[] = { { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -205,11 +255,12 @@ struct msr_counter bic[] = { #define BIC_SAM_mc6 (1ULL << 55) #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) +#define BIC_Diec6 (1ULL << 58) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -252,7 +303,6 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -267,6 +317,7 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int has_aperf; +unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; @@ -307,7 +358,6 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; -enum amperf_source amperf_source; enum gfx_sysfs_idx { GFX_rc6, @@ -367,7 +417,7 @@ struct platform_features { }; struct platform_data { - unsigned int model; + unsigned int vfm; const struct platform_features *features; }; @@ -910,75 +960,75 @@ static const struct platform_features amd_features_with_rapl = { }; static const struct platform_data turbostat_pdata[] = { - { INTEL_FAM6_NEHALEM, &nhm_features }, - { INTEL_FAM6_NEHALEM_G, &nhm_features }, - { INTEL_FAM6_NEHALEM_EP, &nhm_features }, - { INTEL_FAM6_NEHALEM_EX, &nhx_features }, - { INTEL_FAM6_WESTMERE, &nhm_features }, - { INTEL_FAM6_WESTMERE_EP, &nhm_features }, - { INTEL_FAM6_WESTMERE_EX, &nhx_features }, - { INTEL_FAM6_SANDYBRIDGE, &snb_features }, - { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, - { INTEL_FAM6_IVYBRIDGE, &ivb_features }, - { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, - { INTEL_FAM6_HASWELL, &hsw_features }, - { INTEL_FAM6_HASWELL_X, &hsx_features }, - { INTEL_FAM6_HASWELL_L, &hswl_features }, - { INTEL_FAM6_HASWELL_G, &hswg_features }, - { INTEL_FAM6_BROADWELL, &bdw_features }, - { INTEL_FAM6_BROADWELL_G, &bdwg_features }, - { INTEL_FAM6_BROADWELL_X, &bdx_features }, - { INTEL_FAM6_BROADWELL_D, &bdx_features }, - { INTEL_FAM6_SKYLAKE_L, &skl_features }, - { INTEL_FAM6_SKYLAKE, &skl_features }, - { INTEL_FAM6_SKYLAKE_X, &skx_features }, - { INTEL_FAM6_KABYLAKE_L, &skl_features }, - { INTEL_FAM6_KABYLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE_L, &skl_features }, - { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_X, &icx_features }, - { INTEL_FAM6_ICELAKE_D, &icx_features }, - { INTEL_FAM6_ICELAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, - { INTEL_FAM6_ROCKETLAKE, &cnl_features }, - { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, - { INTEL_FAM6_TIGERLAKE, &cnl_features }, - { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, - { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, - { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, - { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &adl_features }, - { INTEL_FAM6_ALDERLAKE_L, &adl_features }, - { INTEL_FAM6_RAPTORLAKE, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, - { INTEL_FAM6_METEORLAKE, &cnl_features }, - { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE_H, &arl_features }, - { INTEL_FAM6_ARROWLAKE_U, &arl_features }, - { INTEL_FAM6_ARROWLAKE, &arl_features }, - { INTEL_FAM6_LUNARLAKE_M, &arl_features }, - { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, - { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, - { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, - { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, - { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, - { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, - { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, - { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, - { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, - { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, - { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, - { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, - { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + { INTEL_NEHALEM, &nhm_features }, + { INTEL_NEHALEM_G, &nhm_features }, + { INTEL_NEHALEM_EP, &nhm_features }, + { INTEL_NEHALEM_EX, &nhx_features }, + { INTEL_WESTMERE, &nhm_features }, + { INTEL_WESTMERE_EP, &nhm_features }, + { INTEL_WESTMERE_EX, &nhx_features }, + { INTEL_SANDYBRIDGE, &snb_features }, + { INTEL_SANDYBRIDGE_X, &snx_features }, + { INTEL_IVYBRIDGE, &ivb_features }, + { INTEL_IVYBRIDGE_X, &ivx_features }, + { INTEL_HASWELL, &hsw_features }, + { INTEL_HASWELL_X, &hsx_features }, + { INTEL_HASWELL_L, &hswl_features }, + { INTEL_HASWELL_G, &hswg_features }, + { INTEL_BROADWELL, &bdw_features }, + { INTEL_BROADWELL_G, &bdwg_features }, + { INTEL_BROADWELL_X, &bdx_features }, + { INTEL_BROADWELL_D, &bdx_features }, + { INTEL_SKYLAKE_L, &skl_features }, + { INTEL_SKYLAKE, &skl_features }, + { INTEL_SKYLAKE_X, &skx_features }, + { INTEL_KABYLAKE_L, &skl_features }, + { INTEL_KABYLAKE, &skl_features }, + { INTEL_COMETLAKE, &skl_features }, + { INTEL_COMETLAKE_L, &skl_features }, + { INTEL_CANNONLAKE_L, &cnl_features }, + { INTEL_ICELAKE_X, &icx_features }, + { INTEL_ICELAKE_D, &icx_features }, + { INTEL_ICELAKE_L, &cnl_features }, + { INTEL_ICELAKE_NNPI, &cnl_features }, + { INTEL_ROCKETLAKE, &cnl_features }, + { INTEL_TIGERLAKE_L, &cnl_features }, + { INTEL_TIGERLAKE, &cnl_features }, + { INTEL_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_EMERALDRAPIDS_X, &spr_features }, + { INTEL_GRANITERAPIDS_X, &spr_features }, + { INTEL_LAKEFIELD, &cnl_features }, + { INTEL_ALDERLAKE, &adl_features }, + { INTEL_ALDERLAKE_L, &adl_features }, + { INTEL_RAPTORLAKE, &adl_features }, + { INTEL_RAPTORLAKE_P, &adl_features }, + { INTEL_RAPTORLAKE_S, &adl_features }, + { INTEL_METEORLAKE, &cnl_features }, + { INTEL_METEORLAKE_L, &cnl_features }, + { INTEL_ARROWLAKE_H, &arl_features }, + { INTEL_ARROWLAKE_U, &arl_features }, + { INTEL_ARROWLAKE, &arl_features }, + { INTEL_LUNARLAKE_M, &arl_features }, + { INTEL_ATOM_SILVERMONT, &slv_features }, + { INTEL_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_ATOM_AIRMONT, &amt_features }, + { INTEL_ATOM_GOLDMONT, &gmt_features }, + { INTEL_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_ATOM_TREMONT, &tmt_features }, + { INTEL_ATOM_TREMONT_L, &tmt_features }, + { INTEL_ATOM_GRACEMONT, &adl_features }, + { INTEL_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_XEON_PHI_KNL, &knl_features }, + { INTEL_XEON_PHI_KNM, &knl_features }, /* * Missing support for - * INTEL_FAM6_ICELAKE - * INTEL_FAM6_ATOM_SILVERMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_NP + * INTEL_ICELAKE + * INTEL_ATOM_SILVERMONT_MID + * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, }; @@ -1003,11 +1053,11 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - if (!genuine_intel || family != 6) + if (!genuine_intel) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (turbostat_pdata[i].model == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } @@ -1034,8 +1084,13 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 +#define PMT_MAX_ADDED_THREAD_COUNTERS 24 +#define PMT_MAX_ADDED_CORE_COUNTERS 8 +#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 +#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) + /* Indexes used to map data read from perf and MSRs into global variables */ enum rapl_rci_index { RAPL_RCI_INDEX_ENERGY_PKG = 0, @@ -1056,19 +1111,13 @@ enum rapl_unit { struct rapl_counter_info_t { unsigned long long data[NUM_RAPL_COUNTERS]; - enum rapl_source source[NUM_RAPL_COUNTERS]; + enum counter_source source[NUM_RAPL_COUNTERS]; unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; - - union { - /* Active when source == RAPL_SOURCE_MSR */ - struct { - unsigned long long msr[NUM_RAPL_COUNTERS]; - unsigned long long msr_mask[NUM_RAPL_COUNTERS]; - int msr_shift[NUM_RAPL_COUNTERS]; - }; - }; + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; int fd_perf; }; @@ -1224,7 +1273,7 @@ enum ccstate_rci_index { struct cstate_counter_info_t { unsigned long long data[NUM_CSTATE_COUNTERS]; - enum cstate_source source[NUM_CSTATE_COUNTERS]; + enum counter_source source[NUM_CSTATE_COUNTERS]; unsigned long long msr[NUM_CSTATE_COUNTERS]; int fd_perf_core; int fd_perf_pkg; @@ -1361,6 +1410,167 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { }, }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum msr_rci_index { + MSR_RCI_INDEX_APERF = 0, + MSR_RCI_INDEX_MPERF = 1, + MSR_RCI_INDEX_SMI = 2, + NUM_MSR_COUNTERS, +}; + +struct msr_counter_info_t { + unsigned long long data[NUM_MSR_COUNTERS]; + enum counter_source source[NUM_MSR_COUNTERS]; + unsigned long long msr[NUM_MSR_COUNTERS]; + unsigned long long msr_mask[NUM_MSR_COUNTERS]; + int fd_perf; +}; + +struct msr_counter_info_t *msr_counter_info; +unsigned int msr_counter_info_size; + +struct msr_counter_arch_info { + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + bool needed; + bool present; +}; + +enum msr_arch_info_index { + MSR_ARCH_INFO_APERF_INDEX = 0, + MSR_ARCH_INFO_MPERF_INDEX = 1, + MSR_ARCH_INFO_SMI_INDEX = 2, +}; + +static struct msr_counter_arch_info msr_counter_arch_infos[] = { + [MSR_ARCH_INFO_APERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, + + [MSR_ARCH_INFO_MPERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, + + [MSR_ARCH_INFO_SMI_INDEX] = { + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, +}; + +/* Can be redefined when compiling, useful for testing. */ +#ifndef SYSFS_TELEM_PATH +#define SYSFS_TELEM_PATH "/sys/class/intel_pmt" +#endif + +#define PMT_COUNTER_MTL_DC6_OFFSET 120 +#define PMT_COUNTER_MTL_DC6_LSB 0 +#define PMT_COUNTER_MTL_DC6_MSB 63 +#define PMT_MTL_DC6_GUID 0x1a067102 + +#define PMT_COUNTER_NAME_SIZE_BYTES 16 +#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 + +struct pmt_mmio { + struct pmt_mmio *next; + + unsigned int guid; + unsigned int size; + + /* Base pointer to the mmaped memory. */ + void *mmio_base; + + /* + * Offset to be applied to the mmio_base + * to get the beginning of the PMT counters for given GUID. + */ + unsigned long pmt_offset; +} *pmt_mmios; + +enum pmt_datatype { + PMT_TYPE_RAW, + PMT_TYPE_XTAL_TIME, +}; + +struct pmt_domain_info { + /* + * Pointer to the MMIO obtained by applying a counter offset + * to the mmio_base of the mmaped region for the given GUID. + * + * This is where to read the raw value of the counter from. + */ + unsigned long *pcounter; +}; + +struct pmt_counter { + struct pmt_counter *next; + + /* PMT metadata */ + char name[PMT_COUNTER_NAME_SIZE_BYTES]; + enum pmt_datatype type; + enum counter_scope scope; + unsigned int lsb; + unsigned int msb; + + /* BIC-like metadata */ + enum counter_format format; + + unsigned int num_domains; + struct pmt_domain_info *domains; +}; + +unsigned int pmt_counter_get_width(const struct pmt_counter *p) +{ + return (p->msb - p->lsb) + 1; +} + +void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) +{ + struct pmt_domain_info *new_mem; + + new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + if (!new_mem) { + fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); + exit(1); + } + + /* Zero initialize just allocated memory. */ + const size_t num_new_domains = new_size - pcounter->num_domains; + + memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains)); + + pcounter->num_domains = new_size; + pcounter->domains = new_mem; +} + +void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) +{ + /* + * Allocate more memory ahead of time. + * + * Always allocate space for at least 8 elements + * and double the size when growing. + */ + if (new_size < 8) + new_size = 8; + new_size = MAX(new_size, pcounter->num_domains * 2); + + pmt_counter_resize_(pcounter, new_size); +} + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1378,6 +1588,8 @@ struct thread_data { unsigned int flags; bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1391,6 +1603,8 @@ struct core_data { unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1423,7 +1637,10 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; + unsigned long long die_c6; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1558,12 +1775,25 @@ int idx_valid(int idx) } struct sys_counters { + /* MSR added counters */ unsigned int added_thread_counters; unsigned int added_core_counters; unsigned int added_package_counters; struct msr_counter *tp; struct msr_counter *cp; struct msr_counter *pp; + + /* perf added counters */ + unsigned int added_thread_perf_counters; + unsigned int added_core_perf_counters; + unsigned int added_package_perf_counters; + struct perf_counter_info *perf_tp; + struct perf_counter_info *perf_cp; + struct perf_counter_info *perf_pp; + + struct pmt_counter *pmt_tp; + struct pmt_counter *pmt_cp; + struct pmt_counter *pmt_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1747,7 +1977,7 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -1823,6 +2053,23 @@ int probe_msr(int cpu, off_t offset) return 0; } +/* Convert CPU ID to domain ID for given added perf counter. */ +unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu) +{ + switch (pc->scope) { + case SCOPE_CPU: + return cpu; + + case SCOPE_CORE: + return cpus[cpu].physical_core_id; + + case SCOPE_PACKAGE: + return cpus[cpu].physical_package_id; + } + + __builtin_unreachable(); +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1846,9 +2093,12 @@ void help(void) "to print statistics, until interrupted.\n" " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" + " eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " debug messages are printed to stderr\n" " -D, --Dump displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" @@ -1955,6 +2205,8 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) void print_header(char *delim) { struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2012,6 +2264,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_tp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_tp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2052,6 +2338,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_cp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_cp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2096,6 +2416,8 @@ void print_header(char *delim) outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : "")); if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : "")); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_LPI)) outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : "")); if (DO_BIC(BIC_SYS_LPI)) @@ -2147,6 +2469,40 @@ void print_header(char *delim) } } + for (pp = sys.perf_pp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + + ppmt = sys.pmt_pp; + while (ppmt) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; + } + + ppmt = ppmt->next; + } + outp += sprintf(outp, "\n"); } @@ -2267,6 +2623,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *fmt8; int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; char *delim = "\t"; int printed = 0; @@ -2404,6 +2762,51 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + /* Added perf counters */ + for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + if (pp->type == COUNTER_USEC) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->perf_counter[i] / interval_float / 10000); + else + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc); + } + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = t->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2447,6 +2850,44 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc); + } + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = c->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2526,6 +2967,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); + if (DO_BIC(BIC_Diec6)) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) outp += @@ -2601,6 +3046,47 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); + } else if (pp->type == COUNTER_K2M) { + outp += + sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); + + break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = p->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; + } + } + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -2654,6 +3140,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2674,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->pc8 = new->pc8 - old->pc8; old->pc9 = new->pc9 - old->pc9; old->pc10 = new->pc10 - old->pc10; + old->die_c6 = new->die_c6 - old->die_c6; old->cpu_lpi = new->cpu_lpi - old->cpu_lpi; old->sys_lpi = new->sys_lpi - old->sys_lpi; old->pkg_temp_c = new->pkg_temp_c; @@ -2714,6 +3203,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->counter[i] = new->counter[i] - old->counter[i]; } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else if (pp->format == FORMAT_AVERAGE) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2721,6 +3226,8 @@ void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -2737,6 +3244,20 @@ void delta_core(struct core_data *new, struct core_data *old) else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -2754,6 +3275,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -2832,6 +3355,21 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2908,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->pc8 = 0; p->pc9 = 0; p->pc10 = 0; + p->die_c6 = 0; p->cpu_lpi = 0; p->sys_lpi = 0; @@ -2934,6 +3473,14 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; + + memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); + memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); + memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); + + memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter)); + memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter)); + memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -2954,6 +3501,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -2984,6 +3533,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.counter[i] += t->counter[i]; } + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.threads.perf_counter[i] += t->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] += t->pmt_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3004,6 +3563,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.counter[i] += c->counter[i]; } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.cores.perf_counter[i] += c->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] += c->pmt_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3027,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pc8 += p->pc8; average.packages.pc9 += p->pc9; average.packages.pc10 += p->pc10; + average.packages.die_c6 += p->die_c6; average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; @@ -3055,6 +3625,18 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) else average.packages.counter[i] += p->counter[i]; } + + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.perf_counter[i] = p->perf_counter[i]; + else + average.packages.perf_counter[i] += p->perf_counter[i]; + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] += p->pmt_counter[i]; + } + return 0; } @@ -3066,6 +3648,8 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data { int i; struct msr_counter *mp; + struct perf_counter_info *pp; + struct pmt_counter *ppmt; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3108,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data average.packages.pc8 /= topo.allowed_packages; average.packages.pc9 /= topo.allowed_packages; average.packages.pc10 /= topo.allowed_packages; + average.packages.die_c6 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -3137,6 +3722,45 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.counter[i] /= topo.allowed_packages; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.threads.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + continue; + } + average.threads.perf_counter[i] /= topo.allowed_cpus; + } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.cores.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.cores.perf_counter[i] /= topo.allowed_cores; + } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.packages.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.packages.perf_counter[i] /= topo.allowed_packages; + } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] /= topo.allowed_cpus; + } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] /= topo.allowed_cores; + } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -3382,30 +4006,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned int read_msr_type(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/type"; - const char *const format = "%u"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_aperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_mperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; @@ -3417,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned int read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_perf_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; - const char *const format = "event=%x"; + FILE *fconfig = NULL; char path[128]; + char config_str[64]; + unsigned int config; + unsigned int umask; + bool has_config = false; + bool has_umask = false; + unsigned int ret = -1; snprintf(path, sizeof(path), path_format, subsys, event_name); - return read_perf_counter_info_n(path, format); + fconfig = fopen(path, "r"); + if (!fconfig) + return -1; + + if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str) + goto cleanup_and_exit; + + for (char *pconfig_str = &config_str[0]; pconfig_str;) { + if (sscanf(pconfig_str, "event=%x", &config) == 1) { + has_config = true; + goto next; + } + + if (sscanf(pconfig_str, "umask=%x", &umask) == 1) { + has_umask = true; + goto next; + } + +next: + pconfig_str = strchr(pconfig_str, ','); + if (pconfig_str) { + *pconfig_str = '\0'; + ++pconfig_str; + } + } + + if (!has_umask) + umask = 0; + + if (has_config) + ret = (umask << 8) | config; + +cleanup_and_exit: + fclose(fconfig); + return ret; } static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) @@ -3444,7 +4084,7 @@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na return RAPL_UNIT_INVALID; } -static double read_perf_rapl_scale(const char *subsys, const char *event_name) +static double read_perf_scale(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; const char *const format = "%lf"; @@ -3459,130 +4099,12 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name) return scale; } -static struct amperf_group_fd open_amperf_fd(int cpu) -{ - const unsigned int msr_type = read_msr_type(); - const unsigned int aperf_config = read_aperf_config(); - const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; - - fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); - - return fds; -} - -static int get_amperf_fd(int cpu) -{ - assert(fd_amperf_percpu); - - if (fd_amperf_percpu[cpu].aperf) - return fd_amperf_percpu[cpu].aperf; - - fd_amperf_percpu[cpu] = open_amperf_fd(cpu); - - return fd_amperf_percpu[cpu].aperf; -} - -/* Read APERF, MPERF and TSC using the perf API. */ -static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) -{ - union { - struct { - unsigned long nr_entries; - unsigned long aperf; - unsigned long mperf; - }; - - unsigned long as_array[3]; - } cnt; - - const int fd_amperf = get_amperf_fd(cpu); - - /* - * Read the TSC with rdtsc, because we want the absolute value and not - * the offset from the start of the counter. - */ - t->tsc = rdtsc(); - - const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); - - if (n != sizeof(cnt.as_array)) - return -2; - - t->aperf = cnt.aperf * aperf_mperf_multiplier; - t->mperf = cnt.mperf * aperf_mperf_multiplier; - - return 0; -} - -/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ -static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) -{ - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - int aperf_mperf_retry_count = 0; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - -retry: - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. - */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); - } - aperf_mperf_retry_count = 0; - - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; - - return 0; -} - size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) { size_t ret = 0; for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) - if (rci->source[i] == RAPL_SOURCE_PERF) + if (rci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3593,7 +4115,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t size_t ret = 0; for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) - if (cci->source[i] == CSTATE_SOURCE_PERF) + if (cci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3611,7 +4133,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3634,14 +4156,14 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { - case RAPL_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case RAPL_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); @@ -3650,8 +4172,8 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct ++pi; break; - case RAPL_SOURCE_MSR: - if (debug) + case COUNTER_SOURCE_MSR: + if (debug >= 2) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); assert(!no_msr); @@ -3709,15 +4231,15 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat struct cstate_counter_info_t *cci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); - memset(perf_data, 0, sizeof(perf_data)); - memset(perf_data_core, 0, sizeof(perf_data_core)); - memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + ZERO_ARRAY(perf_data); + ZERO_ARRAY(perf_data_core); + ZERO_ARRAY(perf_data_pkg); cci = &ccstate_counter_info[cpu]; @@ -3772,30 +4294,28 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { - case CSTATE_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case CSTATE_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); - } cci->data[i] = perf_data[pi]; ++pi; break; - case CSTATE_SOURCE_MSR: + case COUNTER_SOURCE_MSR: assert(!no_msr); if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); - } break; } @@ -3809,7 +4329,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat * when invoked for the thread sibling. */ #define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ - if (cci->source[index] != CSTATE_SOURCE_NONE) \ + if (cci->source[index] != COUNTER_SOURCE_NONE) \ out_counter = cci->data[index]; \ } while (0) @@ -3833,6 +4353,135 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return 0; } +size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) + if (mci->source[i] == COUNTER_SOURCE_PERF) + ++ret; + + return ret; +} + +int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) +{ + unsigned long long perf_data[NUM_MSR_COUNTERS + 1]; + + struct msr_counter_info_t *mci; + + if (debug >= 2) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(msr_counter_info); + assert(cpu <= msr_counter_info_size); + + mci = &msr_counter_info[cpu]; + + ZERO_ARRAY(perf_data); + ZERO_ARRAY(mci->data); + + if (mci->fd_perf != -1) { + const size_t num_perf_counters = msr_counter_info_count_perf(mci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) { + switch (mci->source[i]) { + case COUNTER_SOURCE_NONE: + break; + + case COUNTER_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(mci->fd_perf != -1); + + if (debug >= 2) + fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]); + + mci->data[i] = perf_data[pi]; + + ++pi; + break; + + case COUNTER_SOURCE_MSR: + assert(!no_msr); + + if (get_msr(cpu, mci->msr[i], &mci->data[i])) + return -2 - i; + + mci->data[i] &= mci->msr_mask[i]; + + if (debug >= 2) + fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); + + break; + } + } + + BUILD_BUG_ON(NUM_MSR_COUNTERS != 3); + t->aperf = mci->data[MSR_RCI_INDEX_APERF]; + t->mperf = mci->data[MSR_RCI_INDEX_MPERF]; + t->smi_count = mci->data[MSR_RCI_INDEX_SMI]; + + return 0; +} + +int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size) +{ + unsigned int domain; + unsigned long long value; + int fd_counter; + + for (size_t i = 0; pp; ++i, pp = pp->next) { + domain = cpu_to_domain(pp, cpu); + assert(domain < pp->num_domains); + + fd_counter = pp->fd_perf_per_domain[domain]; + + if (fd_counter == -1) + continue; + + if (read(fd_counter, &value, sizeof(value)) != sizeof(value)) + return 1; + + assert(i < out_size); + out[i] = value * pp->scale; + } + + return 0; +} + +unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) +{ + unsigned long mask; + + if (msb == 63) + mask = 0xffffffffffffffff; + else + mask = ((1 << (msb + 1)) - 1); + + mask -= (1 << lsb) - 1; + + return mask; +} + +unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id) +{ + assert(domain_id < ppmt->num_domains); + + const unsigned long *pmmio = ppmt->domains[domain_id].pcounter; + const unsigned long value = pmmio ? *pmmio : 0; + const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb); + const unsigned long value_shift = ppmt->lsb; + + return (value & value_mask) >> value_shift; +} + /* * get_counters(...) * migrate to cpu @@ -3843,6 +4492,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int cpu = t->cpu_id; unsigned long long msr; struct msr_counter *mp; + struct pmt_counter *pp; int i; int status; @@ -3858,24 +4508,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) - || soft_c1_residency_display(BIC_Avg_MHz)) { - int status = -1; - - assert(!no_perf || !no_msr); - - switch (amperf_source) { - case AMPERF_SOURCE_PERF: - status = read_aperf_mperf_tsc_perf(t, cpu); - break; - case AMPERF_SOURCE_MSR: - status = read_aperf_mperf_tsc_msr(t, cpu); - break; - } - - if (status != 0) - return status; - } + get_smi_aperf_mperf(cpu, t); if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) @@ -3883,11 +4516,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; - if (DO_BIC(BIC_SMI)) { - if (get_msr(cpu, MSR_SMI_COUNT, &msr)) - return -5; - t->smi_count = msr & 0xFFFFFFFF; - } get_cstate_counters(cpu, t, c, p); @@ -3896,6 +4524,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next) + t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id); + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -3934,6 +4568,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next) + c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -4006,6 +4646,13 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } + + if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) + return -10; + + for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next) + p->pmt_counter[i] = pmt_read_counter(pp, p->package_id); + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -4469,25 +5116,6 @@ void free_fd_percpu(void) fd_percpu = NULL; } -void free_fd_amperf_percpu(void) -{ - int i; - - if (!fd_amperf_percpu) - return; - - for (i = 0; i < topo.max_cpu_num + 1; ++i) { - if (fd_amperf_percpu[i].mperf != 0) - close(fd_amperf_percpu[i].mperf); - - if (fd_amperf_percpu[i].aperf != 0) - close(fd_amperf_percpu[i].aperf); - } - - free(fd_amperf_percpu); - fd_amperf_percpu = NULL; -} - void free_fd_instr_count_percpu(void) { if (!fd_instr_count_percpu) @@ -4522,6 +5150,21 @@ void free_fd_cstate(void) ccstate_counter_info_size = 0; } +void free_fd_msr(void) +{ + if (!msr_counter_info) + return; + + for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) { + if (msr_counter_info[cpu].fd_perf != -1) + close(msr_counter_info[cpu].fd_perf); + } + + free(msr_counter_info); + msr_counter_info = NULL; + msr_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4539,6 +5182,36 @@ void free_fd_rapl_percpu(void) rapl_counter_info_perdomain_size = 0; } +void free_fd_added_perf_counters_(struct perf_counter_info *pp) +{ + if (!pp) + return; + + if (!pp->fd_perf_per_domain) + return; + + while (pp) { + for (size_t domain = 0; domain < pp->num_domains; ++domain) { + if (pp->fd_perf_per_domain[domain] != -1) { + close(pp->fd_perf_per_domain[domain]); + pp->fd_perf_per_domain[domain] = -1; + } + } + + free(pp->fd_perf_per_domain); + pp->fd_perf_per_domain = NULL; + + pp = pp->next; + } +} + +void free_fd_added_perf_counters(void) +{ + free_fd_added_perf_counters_(sys.perf_tp); + free_fd_added_perf_counters_(sys.perf_cp); + free_fd_added_perf_counters_(sys.perf_pp); +} + void free_all_buffers(void) { int i; @@ -4581,9 +5254,10 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); - free_fd_amperf_percpu(); + free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); + free_fd_added_perf_counters(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4918,16 +5592,22 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); +void added_perf_counters_init(void); +void pmt_init(void); void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + msr_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); + pmt_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -6779,22 +7459,13 @@ static int has_instr_count_access(void) return has_access; } -bool is_aperf_access_required(void) -{ - return BIC_IS_ENABLED(BIC_Avg_MHz) - || BIC_IS_ENABLED(BIC_Busy) - || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC) - || BIC_IS_ENABLED(BIC_CPU_c1); -} - int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, double *scale_, enum rapl_unit *unit_) { if (no_perf) return -1; - const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name); if (scale == 0.0) return -1; @@ -6805,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const unsigned int rapl_type = read_perf_type(cai->perf_subsys); - const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -6826,7 +7497,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct { int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; @@ -6846,14 +7517,6 @@ void linux_perf_init(void) if (fd_instr_count_percpu == NULL) err(-1, "calloc fd_instr_count_percpu"); } - - const bool aperf_required = is_aperf_access_required(); - - if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { - fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); - if (fd_amperf_percpu == NULL) - err(-1, "calloc fd_amperf_percpu"); - } } void rapl_perf_init(void) @@ -6875,7 +7538,7 @@ void rapl_perf_init(void) rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; - rci->source[i] = RAPL_SOURCE_NONE; + rci->source[i] = COUNTER_SOURCE_NONE; } } @@ -6917,14 +7580,14 @@ void rapl_perf_init(void) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { - rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->source[cai->rci_index] = COUNTER_SOURCE_PERF; rci->scale[cai->rci_index] = scale * cai->compat_scale; rci->unit[cai->rci_index] = unit; rci->flags[cai->rci_index] = cai->flags; /* Use MSR for this counter */ } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { - rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->source[cai->rci_index] = COUNTER_SOURCE_MSR; rci->msr[cai->rci_index] = cai->msr; rci->msr_mask[cai->rci_index] = cai->msr_mask; rci->msr_shift[cai->rci_index] = cai->msr_shift; @@ -6934,7 +7597,7 @@ void rapl_perf_init(void) } } - if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE) has_counter = 1; } @@ -6946,75 +7609,11 @@ void rapl_perf_init(void) free(domain_visited); } -static int has_amperf_access_via_msr(void) -{ - if (no_msr) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_APERF)) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_MPERF)) - return 0; - - return 1; -} - -static int has_amperf_access_via_perf(void) -{ - struct amperf_group_fd fds; - - /* - * Cache the last result, so we don't warn the user multiple times - * - * Negative means cached, no access - * Zero means not cached - * Positive means cached, has access - */ - static int has_access_cached; - - if (no_perf) - return 0; - - if (has_access_cached != 0) - return has_access_cached > 0; - - fds = open_amperf_fd(base_cpu); - has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); - - if (fds.aperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "APERF perf counter", "--no-perf"); - else - close(fds.aperf); - - if (fds.mperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "MPERF perf counter", "--no-perf"); - else - close(fds.mperf); - - if (has_access_cached == 0) - has_access_cached = -1; - - return has_access_cached > 0; -} - -/* Check if we can access APERF and MPERF */ +/* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - if (!is_aperf_access_required()) - return 0; - - if (!no_msr && has_amperf_access_via_msr()) - return 1; - - if (!no_perf && has_amperf_access_via_perf()) - return 1; - - return 0; + return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) @@ -7039,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); @@ -7057,12 +7656,120 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st { int ret = add_cstate_perf_counter_(cpu, cci, cai); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; } +int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + if (no_perf) + return -1; + + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + int ret = add_msr_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu); + + return ret; +} + +void msr_perf_init_(void) +{ + const int mci_num = topo.max_cpu_num + 1; + + msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info)); + if (!msr_counter_info) + err(1, "calloc msr_counter_info"); + msr_counter_info_size = mci_num; + + for (int cpu = 0; cpu < mci_num; ++cpu) + msr_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) { + + struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx]; + + cai->present = false; + + for (int cpu = 0; cpu < mci_num; ++cpu) { + + struct msr_counter_info_t *const cci = &msr_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + if (cai->needed) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) { + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; + cai->present = true; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + cci->msr_mask[cai->rci_index] = cai->msr_mask; + cai->present = true; + } + } + } + } +} + +/* Initialize data for reading perf counters from the MSR group. */ +void msr_perf_init(void) +{ + bool need_amperf = false, need_smi = false; + const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1); + + need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1; + + if (BIC_IS_ENABLED(BIC_SMI)) + need_smi = true; + + /* Enable needed counters */ + msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi; + + msr_perf_init_(); + + const bool has_amperf = has_amperf_access(); + const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present; + + has_aperf_access = has_amperf; + + if (has_amperf) { + BIC_PRESENT(BIC_Avg_MHz); + BIC_PRESENT(BIC_Busy); + BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_SMI); + } + + if (has_smi) + BIC_PRESENT(BIC_SMI); +} + void cstate_perf_init_(bool soft_c1) { bool has_counter; @@ -7127,17 +7834,17 @@ void cstate_perf_init_(bool soft_c1) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { - cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; /* User MSR for this counter */ } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit && probe_msr(cpu, cai->msr) == 0) { - cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } } - if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = true; pkg_visited[pkg_id] = true; @@ -7320,12 +8027,6 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf && has_amperf_access()) { - BIC_PRESENT(BIC_Avg_MHz); - BIC_PRESENT(BIC_Busy); - BIC_PRESENT(BIC_Bzy_MHz); - BIC_PRESENT(BIC_IPC); - } do_dts = eax & (1 << 0); if (do_dts) BIC_PRESENT(BIC_CoreTmp); @@ -7442,6 +8143,11 @@ static void counter_info_init(void) if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) { + msr_counter_arch_infos[i].present = false; + msr_counter_arch_infos[i].needed = false; + } } void probe_pm_features(void) @@ -7817,21 +8523,6 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } -static void set_amperf_source(void) -{ - amperf_source = AMPERF_SOURCE_PERF; - - const bool aperf_required = is_aperf_access_required(); - - if (no_perf || !aperf_required || !has_amperf_access_via_perf()) - amperf_source = AMPERF_SOURCE_MSR; - - if (quiet || !debug) - return; - - fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); -} - bool has_added_counters(void) { /* @@ -7842,54 +8533,8 @@ bool has_added_counters(void) return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; } -bool is_msr_access_required(void) -{ - if (no_msr) - return false; - - if (has_added_counters()) - return true; - - return BIC_IS_ENABLED(BIC_SMI) - || BIC_IS_ENABLED(BIC_CPU_c1) - || BIC_IS_ENABLED(BIC_CPU_c3) - || BIC_IS_ENABLED(BIC_CPU_c6) - || BIC_IS_ENABLED(BIC_CPU_c7) - || BIC_IS_ENABLED(BIC_Mod_c6) - || BIC_IS_ENABLED(BIC_CoreTmp) - || BIC_IS_ENABLED(BIC_Totl_c0) - || BIC_IS_ENABLED(BIC_Any_c0) - || BIC_IS_ENABLED(BIC_GFX_c0) - || BIC_IS_ENABLED(BIC_CPUGFX) - || BIC_IS_ENABLED(BIC_Pkgpc3) - || BIC_IS_ENABLED(BIC_Pkgpc6) - || BIC_IS_ENABLED(BIC_Pkgpc2) - || BIC_IS_ENABLED(BIC_Pkgpc7) - || BIC_IS_ENABLED(BIC_Pkgpc8) - || BIC_IS_ENABLED(BIC_Pkgpc9) - || BIC_IS_ENABLED(BIC_Pkgpc10) - /* TODO: Multiplex access with perf */ - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_PkgWatt) - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_GFXWatt) - || BIC_IS_ENABLED(BIC_RAMWatt) - || BIC_IS_ENABLED(BIC_Pkg_J) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_GFX_J) - || BIC_IS_ENABLED(BIC_RAM_J) - || BIC_IS_ENABLED(BIC_PKG__) - || BIC_IS_ENABLED(BIC_RAM__) - || BIC_IS_ENABLED(BIC_PkgTmp) - || (is_aperf_access_required() && !has_amperf_access_via_perf()); -} - void check_msr_access(void) { - if (!is_msr_access_required()) - no_msr = 1; - check_dev_msr(); check_msr_permission(); @@ -7899,18 +8544,425 @@ void check_msr_access(void) void check_perf_access(void) { - const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); - - if (no_perf || !intrcount_required || !has_instr_count_access()) + if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; +} - const bool aperf_required = is_aperf_access_required(); +int added_perf_counters_init_(struct perf_counter_info *pinfo) +{ + size_t num_domains = 0; + unsigned int next_domain; + bool *domain_visited; + unsigned int perf_type, perf_config; + double perf_scale; + int fd_perf; - if (!aperf_required || !has_amperf_access()) { - bic_enabled &= ~BIC_Avg_MHz; - bic_enabled &= ~BIC_Busy; - bic_enabled &= ~BIC_Bzy_MHz; - bic_enabled &= ~BIC_IPC; + if (!pinfo) + return 0; + + const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1)); + + domain_visited = calloc(max_num_domains, sizeof(*domain_visited)); + + while (pinfo) { + switch (pinfo->scope) { + case SCOPE_CPU: + num_domains = topo.max_cpu_num + 1; + break; + + case SCOPE_CORE: + num_domains = topo.max_core_id + 1; + break; + + case SCOPE_PACKAGE: + num_domains = topo.max_package_id + 1; + break; + } + + /* Allocate buffer for file descriptor for each domain. */ + pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain)); + if (!pinfo->fd_perf_per_domain) + errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain"); + + for (size_t i = 0; i < num_domains; ++i) + pinfo->fd_perf_per_domain[i] = -1; + + pinfo->num_domains = num_domains; + pinfo->scale = 1.0; + + memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + next_domain = cpu_to_domain(pinfo, cpu); + + assert(next_domain < num_domains); + + if (cpu_is_not_allowed(cpu)) + continue; + + if (domain_visited[next_domain]) + continue; + + perf_type = read_perf_type(pinfo->device); + if (perf_type == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "type"); + continue; + } + + perf_config = read_perf_config(pinfo->device, pinfo->event); + if (perf_config == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "config"); + continue; + } + + /* Scale is not required, some counters just don't have it. */ + perf_scale = read_perf_scale(pinfo->device, pinfo->event); + if (perf_scale == 0.0) + perf_scale = 1.0; + + fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); + if (fd_perf == -1) { + warnx("%s: perf/%s/%s: failed to open counter on cpu%d", + __func__, pinfo->device, pinfo->event, cpu); + continue; + } + + domain_visited[next_domain] = 1; + pinfo->fd_perf_per_domain[next_domain] = fd_perf; + pinfo->scale = perf_scale; + + if (debug) + fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + } + + pinfo = pinfo->next; + } + + free(domain_visited); + + return 0; +} + +void added_perf_counters_init(void) +{ + if (added_perf_counters_init_(sys.perf_tp)) + errx(1, "%s: %s", __func__, "thread"); + + if (added_perf_counters_init_(sys.perf_cp)) + errx(1, "%s: %s", __func__, "core"); + + if (added_perf_counters_init_(sys.perf_pp)) + errx(1, "%s: %s", __func__, "package"); +} + +int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output) +{ + int fd_telem_info; + FILE *file_telem_info; + unsigned long value; + + fd_telem_info = openat(fd_dir, info_filename, O_RDONLY); + if (fd_telem_info == -1) + return -1; + + file_telem_info = fdopen(fd_telem_info, "r"); + if (file_telem_info == NULL) { + close(fd_telem_info); + return -1; + } + + if (fscanf(file_telem_info, format, &value) != 1) { + fclose(file_telem_info); + return -1; + } + + fclose(file_telem_info); + + *output = value; + + return 0; +} + +struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) +{ + DIR *dirp; + struct dirent *entry; + struct stat st; + unsigned int telem_idx; + int fd_telem_dir, fd_pmt; + unsigned long guid, size, offset; + size_t mmap_size; + void *mmio; + struct pmt_mmio *ret = NULL; + + if (stat(SYSFS_TELEM_PATH, &st) == -1) + return NULL; + + dirp = opendir(SYSFS_TELEM_PATH); + if (dirp == NULL) + return NULL; + + for (;;) { + entry = readdir(dirp); + + if (entry == NULL) + break; + + if (strcmp(entry->d_name, ".") == 0) + continue; + + if (strcmp(entry->d_name, "..") == 0) + continue; + + if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1) + continue; + + if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) { + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY); + if (fd_telem_dir == -1) { + break; + } + + if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { + close(fd_telem_dir); + break; + } + + if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) { + close(fd_telem_dir); + break; + } + + if (guid != target_guid) { + close(fd_telem_dir); + continue; + } + + if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) { + close(fd_telem_dir); + break; + } + + assert(offset == 0); + + fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY); + if (fd_pmt == -1) + goto loop_cleanup_and_break; + + mmap_size = (size + 0x1000UL) & (~0x1000UL); + mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); + if (mmio != MAP_FAILED) { + + if (debug) + fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio); + + ret = calloc(1, sizeof(*ret)); + + if (!ret) { + fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__); + exit(1); + } + + ret->guid = guid; + ret->mmio_base = mmio; + ret->pmt_offset = offset; + ret->size = size; + + ret->next = pmt_mmios; + pmt_mmios = ret; + } + +loop_cleanup_and_break: + close(fd_pmt); + close(fd_telem_dir); + break; + } + + closedir(dirp); + + return ret; +} + +struct pmt_mmio *pmt_mmio_find(unsigned int guid) +{ + struct pmt_mmio *pmmio = pmt_mmios; + + while (pmmio) { + if (pmmio->guid == guid) + return pmmio; + + pmmio = pmmio->next; + } + + return NULL; +} + +void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +{ + char *ret; + + /* Get base of mmaped PMT file. */ + ret = (char *)pmmio->mmio_base; + + /* + * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. + * It's not guaranteed that the mmaped memory begins with the telemetry data + * - we might have to apply the offset first. + */ + ret += pmmio->pmt_offset; + + /* Apply the counter offset to get the address to the mmaped counter. */ + ret += counter_offset; + + return ret; +} + +struct pmt_mmio *pmt_add_guid(unsigned int guid) +{ + struct pmt_mmio *ret; + + ret = pmt_mmio_find(guid); + if (!ret) + ret = pmt_mmio_open(guid); + + return ret; +} + +enum pmt_open_mode { + PMT_OPEN_TRY, /* Open failure is not an error. */ + PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */ +}; + +struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name) +{ + while (pcounter) { + if (strcmp(pcounter->name, name) == 0) + break; + + pcounter = pcounter->next; + } + + return pcounter; +} + +struct pmt_counter **pmt_get_scope_root(enum counter_scope scope) +{ + switch (scope) { + case SCOPE_CPU: + return &sys.pmt_tp; + case SCOPE_CORE: + return &sys.pmt_cp; + case SCOPE_PACKAGE: + return &sys.pmt_pp; + } + + __builtin_unreachable(); +} + +void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id) +{ + /* Make sure the new domain fits. */ + if (domain_id >= pcounter->num_domains) + pmt_counter_resize(pcounter, domain_id + 1); + + assert(pcounter->domains); + assert(domain_id < pcounter->num_domains); + + pcounter->domains[domain_id].pcounter = pmmio; +} + +int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) +{ + struct pmt_mmio *mmio; + struct pmt_counter *pcounter; + struct pmt_counter **const pmt_root = pmt_get_scope_root(scope); + bool new_counter = false; + int conflict = 0; + + if (lsb > msb) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name); + exit(1); + } + + if (msb >= 64) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name); + exit(1); + } + + mmio = pmt_add_guid(guid); + if (!mmio) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid); + exit(1); + } + + return 1; + } + + if (offset >= mmio->size) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size); + exit(1); + } + + return 1; + } + + pcounter = pmt_find_counter(*pmt_root, name); + if (!pcounter) { + pcounter = calloc(1, sizeof(*pcounter)); + new_counter = true; + } + + if (new_counter) { + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1); + pcounter->type = type; + pcounter->scope = scope; + pcounter->lsb = lsb; + pcounter->msb = msb; + pcounter->format = format; + } else { + conflict += pcounter->type != type; + conflict += pcounter->scope != scope; + conflict += pcounter->lsb != lsb; + conflict += pcounter->msb != msb; + conflict += pcounter->format != format; + } + + if (conflict) { + fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", + __func__, name); + exit(1); + } + + pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id); + + if (new_counter) { + pcounter->next = *pmt_root; + *pmt_root = pcounter; + } + + return 0; +} + +void pmt_init(void) +{ + if (BIC_IS_ENABLED(BIC_Diec6)) { + pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, + PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, + 0, PMT_OPEN_TRY); } } @@ -7923,16 +8975,18 @@ void turbostat_init() process_cpuid(); counter_info_init(); probe_pm_features(); - set_amperf_source(); + msr_perf_init(); linux_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); + pmt_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (DO_BIC(BIC_IPC)) - (void)get_instr_count_fd(base_cpu); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1) + BIC_PRESENT(BIC_IPC); /* * If TSC tweak is needed, but couldn't get it, @@ -8017,7 +9071,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.07.26 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 @@ -8049,7 +9103,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) for (mp = head; mp; mp = mp->next) { if (debug) - printf("%s: %s %s\n", __func__, name, mp->name); + fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name); if (!strncmp(name, mp->name, strlen(mp->name))) return mp; } @@ -8066,8 +9120,8 @@ int add_counter(unsigned int msr_num, char *path, char *name, errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); if (debug) - printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, - path, name, width, scope, type, format, flags, id); + fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", + __func__, msr_num, path, name, width, scope, type, format, flags, id); switch (scope) { @@ -8075,7 +9129,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.tp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { @@ -8087,7 +9141,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.cp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { @@ -8099,7 +9153,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.pp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { @@ -8116,6 +9170,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; strncpy(msrp->name, name, NAME_BYTES - 1); msrp->width = width; @@ -8156,11 +9211,106 @@ int add_counter(unsigned int msr_num, char *path, char *name, return 0; } -void parse_add_command(char *add_command) +/* + * Initialize the fields used for identifying and opening the counter. + * + * Defer the initialization of any runtime buffers for actually reading + * the counters for when we initialize all perf counters, so we can later + * easily call re_initialize(). + */ +struct perf_counter_info *make_perf_counter_info(const char *perf_device, + const char *perf_event, + const char *name, + unsigned int width, + enum counter_scope scope, + enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + pinfo = calloc(1, sizeof(*pinfo)); + if (!pinfo) + errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event); + + strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1); + strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1); + + strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1); + pinfo->width = width; + pinfo->scope = scope; + pinfo->type = type; + pinfo->format = format; + + return pinfo; +} + +int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width, + enum counter_scope scope, enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + switch (scope) { + case SCOPE_CPU: + if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_CORE: + if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_PACKAGE: + if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + } + + pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format); + + if (!pinfo) + return -1; + + switch (scope) { + case SCOPE_CPU: + pinfo->next = sys.perf_tp; + sys.perf_tp = pinfo; + ++sys.added_thread_perf_counters; + break; + + case SCOPE_CORE: + pinfo->next = sys.perf_cp; + sys.perf_cp = pinfo; + ++sys.added_core_perf_counters; + break; + + case SCOPE_PACKAGE: + pinfo->next = sys.perf_pp; + sys.perf_pp = pinfo; + ++sys.added_package_perf_counters; + break; + } + + // FIXME: we might not have debug here yet + if (debug) + fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + + return 0; +} + +void parse_add_command_msr(char *add_command) { int msr_num = 0; char *path = NULL; - char name_buffer[NAME_BYTES] = ""; + char perf_device[PERF_DEV_NAME_BYTES] = ""; + char perf_event[PERF_EVT_NAME_BYTES] = ""; + char name_buffer[PERF_NAME_BYTES] = ""; int width = 64; int fail = 0; enum counter_scope scope = SCOPE_CPU; @@ -8175,6 +9325,11 @@ void parse_add_command(char *add_command) if (sscanf(add_command, "msr%d", &msr_num) == 1) goto next; + BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31); + BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31); + if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2) + goto next; + if (*add_command == '/') { path = add_command; goto next; @@ -8222,7 +9377,8 @@ void parse_add_command(char *add_command) goto next; } - if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { /* 18 < NAME_BYTES */ + BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18); + if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { char *eos; eos = strchr(name_buffer, ','); @@ -8239,21 +9395,33 @@ next: } } - if ((msr_num == 0) && (path == NULL)) { - fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n"); + if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) { + fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n"); fail++; } + /* Test for non-empty perf_device and perf_event */ + const bool is_perf_counter = perf_device[0] && perf_event[0]; + /* generate default column header */ if (*name_buffer == '\0') { - if (width == 32) - sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); - else - sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + if (is_perf_counter) { + snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event); + } else { + if (width == 32) + sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + else + sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + } } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) - fail++; + if (is_perf_counter) { + if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format)) + fail++; + } else { + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) + fail++; + } if (fail) { help(); @@ -8261,6 +9429,195 @@ next: } } +bool starts_with(const char *str, const char *prefix) +{ + return strncmp(prefix, str, strlen(prefix)) == 0; +} + +void parse_add_command_pmt(char *add_command) +{ + char *name = NULL; + char *type_name = NULL; + char *format_name = NULL; + unsigned int offset; + unsigned int lsb; + unsigned int msb; + unsigned int guid; + unsigned int domain_id; + enum counter_scope scope = 0; + enum pmt_datatype type = PMT_TYPE_RAW; + enum counter_format format = FORMAT_RAW; + bool has_offset = false; + bool has_lsb = false; + bool has_msb = false; + bool has_format = true; /* Format has a default value. */ + bool has_guid = false; + bool has_scope = false; + bool has_type = true; /* Type has a default value. */ + + /* Consume the "pmt," prefix. */ + add_command = strchr(add_command, ','); + if (!add_command) { + help(); + exit(1); + } + ++add_command; + + while (add_command) { + if (starts_with(add_command, "name=")) { + name = add_command + strlen("name="); + goto next; + } + + if (starts_with(add_command, "type=")) { + type_name = add_command + strlen("type="); + goto next; + } + + if (starts_with(add_command, "domain=")) { + const size_t prefix_len = strlen("domain="); + + if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) { + scope = SCOPE_CPU; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) { + scope = SCOPE_CORE; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) { + scope = SCOPE_PACKAGE; + has_scope = true; + } + + if (!has_scope) { + printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", + __func__); + exit(1); + } + + goto next; + } + + if (starts_with(add_command, "format=")) { + format_name = add_command + strlen("format="); + goto next; + } + + if (sscanf(add_command, "offset=%u", &offset) == 1) { + has_offset = true; + goto next; + } + + if (sscanf(add_command, "lsb=%u", &lsb) == 1) { + has_lsb = true; + goto next; + } + + if (sscanf(add_command, "msb=%u", &msb) == 1) { + has_msb = true; + goto next; + } + + if (sscanf(add_command, "guid=%x", &guid) == 1) { + has_guid = true; + goto next; + } + +next: + add_command = strchr(add_command, ','); + if (add_command) { + *add_command = '\0'; + add_command++; + } + } + + if (!name) { + printf("%s: missing %s\n", __func__, "name"); + exit(1); + } + + if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { + printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES); + exit(1); + } + + if (format_name) { + has_format = false; + + if (strcmp("raw", format_name) == 0) { + format = FORMAT_RAW; + has_format = true; + } + + if (strcmp("delta", format_name) == 0) { + format = FORMAT_DELTA; + has_format = true; + } + + if (!has_format) { + fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name); + exit(1); + } + } + + if (type_name) { + has_type = false; + + if (strcmp("raw", type_name) == 0) { + type = PMT_TYPE_RAW; + has_type = true; + } + + if (strcmp("txtal_time", type_name) == 0) { + type = PMT_TYPE_XTAL_TIME; + has_type = true; + } + + if (!has_type) { + printf("%s: invalid %s: %s\n", __func__, "type", type_name); + exit(1); + } + } + + if (!has_offset) { + printf("%s : missing %s\n", __func__, "offset"); + exit(1); + } + + if (!has_lsb) { + printf("%s: missing %s\n", __func__, "lsb"); + exit(1); + } + + if (!has_msb) { + printf("%s: missing %s\n", __func__, "msb"); + exit(1); + } + + if (!has_guid) { + printf("%s: missing %s\n", __func__, "guid"); + exit(1); + } + + if (!has_scope) { + printf("%s: missing %s\n", __func__, "scope"); + exit(1); + } + + if (lsb > msb) { + printf("%s: lsb > msb doesn't make sense\n", __func__); + exit(1); + } + + pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); +} + +void parse_add_command(char *add_command) +{ + if (strncmp(add_command, "pmt", strlen("pmt")) == 0) + return parse_add_command_pmt(add_command); + return parse_add_command_msr(add_command); +} + int is_deferred_add(char *name) { int i; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index eaf091a3d331..129f179b0ac5 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -385,19 +385,21 @@ struct cxl_test_gen_media { struct cxl_test_gen_media gen_media = { .id = CXL_EVENT_GEN_MEDIA_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_gen_media), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_gen_media), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x2000), + .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, + .transaction_type = CXL_GMER_TRANS_HOST_WRITE, + /* .validity_flags = */ + .channel = 1, + .rank = 30, }, - .phys_addr = cpu_to_le64(0x2000), - .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, - .transaction_type = CXL_GMER_TRANS_HOST_WRITE, - /* .validity_flags = */ - .channel = 1, - .rank = 30 }, }; @@ -409,18 +411,20 @@ struct cxl_test_dram { struct cxl_test_dram dram = { .id = CXL_EVENT_DRAM_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_dram), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_dram), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x8000), + .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, + .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, + /* .validity_flags = */ + .channel = 1, }, - .phys_addr = cpu_to_le64(0x8000), - .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, - .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, - /* .validity_flags = */ - .channel = 1, .bank_group = 5, .bank = 2, .column = {0xDE, 0xAD}, @@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds, static void cxl_mock_add_event_logs(struct mock_event_store *mes) { put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK, - &gen_media.rec.validity_flags); + &gen_media.rec.media_hdr.validity_flags); put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN, - &dram.rec.validity_flags); + &dram.rec.media_hdr.validity_flags); mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed); mes_add_event(mes, CXL_EVENT_TYPE_INFO, @@ -1131,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds) return (count >= poison_inject_dev_max); } -static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) +static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) { + /* Return EBUSY to match the CXL driver handling */ if (mock_poison_dev_max_injected(cxlds)) { dev_dbg(cxlds->dev, "Device poison injection limit has been reached: %d\n", - MOCK_INJECT_DEV_MAX); - return false; + poison_inject_dev_max); + return -EBUSY; } for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) { if (!mock_poison_list[i].cxlds) { mock_poison_list[i].cxlds = cxlds; mock_poison_list[i].dpa = dpa; - return true; + return 0; } } dev_dbg(cxlds->dev, "Mock test poison injection limit has been reached: %d\n", MOCK_INJECT_TEST_MAX); - return false; + return -ENXIO; } static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa) @@ -1175,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds, dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa); return 0; } - if (!mock_poison_add(cxlds, dpa)) - return -ENXIO; - return 0; + return mock_poison_add(cxlds, dpa); } static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa) diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py new file mode 100755 index 000000000000..9ab4aaf45fb8 --- /dev/null +++ b/tools/testing/selftests/turbostat/added_perf_counters.py @@ -0,0 +1,178 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +class PerfCounterInfo: + def __init__(self, subsys, event): + self.subsys = subsys + self.event = event + + def get_perf_event_name(self): + return f'{self.subsys}/{self.event}/' + + def get_turbostat_perf_id(self, counter_scope, counter_type, column_name): + return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}' + +PERF_COUNTERS_CANDIDATES = [ + PerfCounterInfo('msr', 'mperf'), + PerfCounterInfo('msr', 'aperf'), + PerfCounterInfo('msr', 'tsc'), + PerfCounterInfo('cstate_core', 'c1-residency'), + PerfCounterInfo('cstate_core', 'c6-residency'), + PerfCounterInfo('cstate_core', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c2-residency'), + PerfCounterInfo('cstate_pkg', 'c3-residency'), + PerfCounterInfo('cstate_pkg', 'c6-residency'), + PerfCounterInfo('cstate_pkg', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c8-residency'), + PerfCounterInfo('cstate_pkg', 'c9-residency'), + PerfCounterInfo('cstate_pkg', 'c10-residency'), +] +present_perf_counters = [] + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + if b'' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + return True + + for counter in PERF_COUNTERS_CANDIDATES: + if has_perf_counter_access(counter.get_perf_event_name()): + present_perf_counters.append(counter) + + if len(present_perf_counters) == 0: + print('SKIP: Could not read any perf counter.') + return False + + if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES): + print(f'WARN: Could not access all of the counters - some will be left untested') + + return True + +if not check_perf_access(): + exit(0) + +turbostat_counter_source_opts = [''] + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC'] + +expected_columns = [b'CPU'] +counters_argv = [] +for counter in present_perf_counters: + if counter.subsys == 'cstate_core': + counter_scope = 'core' + elif counter.subsys == 'cstate_pkg': + counter_scope = 'package' + else: + counter_scope = 'cpu' + + counter_type = 'delta' + column_name = counter.event + + cparams = counter.get_turbostat_perf_id( + counter_scope = counter_scope, + counter_type = counter_type, + column_name = column_name + ) + expected_columns.append(column_name.encode()) + counters_argv.extend(['--add', cparams]) + +expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns + +def gen_user_friendly_cmdline(argv_): + argv = argv_[:] + ret = '' + + while len(argv) != 0: + arg = argv.pop(0) + arg_next = '' + + if arg in ('-i', '--show', '--add'): + arg_next = argv.pop(0) if len(argv) > 0 else '' + + ret += f'{arg} {arg_next} \\\n\t' + + # Remove the last separator and return + return ret[:-4] + +# +# Run turbostat for some time and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] +turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv + +def check_columns_or_fail(expected_columns: list, actual_columns: list): + if len(actual_columns) != len(expected_columns): + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + + failed = False + for expected_column in expected_columns: + if expected_column not in actual_columns: + print(f'turbostat column check failed: missing column {expected_column.decode()}') + failed = True + + if failed: + exit(1) + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns, actual_columns) +print('OK') + +# +# Same, but with --debug +# +# We explicitly specify '--show CPU' to make sure turbostat +# don't show a bunch of default counters instead. +# +turbostat_argv.append('--debug') + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns_debug, actual_columns) +print('OK') diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py new file mode 100755 index 000000000000..6289cc47d5f0 --- /dev/null +++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py @@ -0,0 +1,157 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +# CDLL calls dlopen underneath. +# Calling it with None (null), we get handle to the our own image (python interpreter). +# We hope to find sched_getcpu() inside ;] +# This is a bit ugly, but helps shipping working software, so.. +try: + import ctypes + + this_image = ctypes.CDLL(None) + BASE_CPU = this_image.sched_getcpu() +except: + BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline. + +MSR_IA32_MPERF = 0x000000e7 +MSR_IA32_APERF = 0x000000e8 + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + if b'' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + return True + + if not has_perf_counter_access('msr/mperf/'): + return False + if not has_perf_counter_access('msr/aperf/'): + return False + if not has_perf_counter_access('msr/smi/'): + return False + + return True + +def check_msr_access(): + try: + file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb') + except: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8: + return False + + return True + +has_perf_access = check_perf_access() +has_msr_access = check_msr_access() + +turbostat_counter_source_opts = [''] + +if has_msr_access: + turbostat_counter_source_opts.append('--no-perf') +else: + print('SKIP: doesn\'t have MSR access, skipping run with --no-perf') + +if has_perf_access: + turbostat_counter_source_opts.append('--no-msr') +else: + print('SKIP: doesn\'t have perf access, skipping run with --no-msr') + +if not has_msr_access and not has_perf_access: + print('SKIP: No MSR nor perf access detected. Skipping the tests entirely') + exit(0) + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC' + +SMI_APERF_MPERF_DEPENDENT_BICS = [ + 'SMI', + 'Avg_MHz', + 'Busy%', + 'Bzy_MHz', +] +if has_perf_access: + SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC') + +for bic in SMI_APERF_MPERF_DEPENDENT_BICS: + for counter_source_opt in turbostat_counter_source_opts: + + # Ugly special case, but it is what it is.. + if counter_source_opt == '--no-perf' and bic == 'IPC': + continue + + expected_columns = bic.encode() + expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode() + + # + # Run turbostat for some time and send SIGINT + # + timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] + turbostat_argv = [turbostat, '-i', '0.50', '--show', bic] + + if counter_source_opt: + turbostat_argv.append(counter_source_opt) + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + print('OK') + + # + # Same, but with --debug + # + turbostat_argv.append('--debug') + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) + print('OK')