8cb2595f93
GKI (arm64) relevant 87 out of 414 changes, affecting 112 files +738/-352bdb71ee651configfs: Do not override creating attribute file failure in populate_attrs() [1 file, +1/-1]ba789be63dio_uring: account drain memory to cgroup [1 file, +1/-1]c58b577cf7io_uring/kbuf: account ring io_buffer_list memory [1 file, +1/-1]f78b38af35jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata() [1 file, +3/-2]2429bb9fadmedia: v4l2-dev: fix error handling in __video_register_device() [1 file, +7/-7]5d8b057ed7media: videobuf2: use sgtable-based scatterlist wrappers [1 file, +2/-2]b52dc88361media: uvcvideo: Return the number of processed controls [1 file, +10/-1]6d2b12e7c5media: uvcvideo: Send control events for partial succeeds [1 file, +9/-3]aac91ae06cmedia: uvcvideo: Fix deferred probing error [1 file, +19/-8]86d9837e46arm64/mm: Close theoretical race where stale TLB entry remains valid [1 file, +5/-4]5538af3843block: use plug request list tail for one-shot backmerge attempt [1 file, +13/-13]943801c380block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion [1 file, +1/-0]1c71f3cf5fcgroup,freezer: fix incomplete freezing when attaching tasks [1 file, +1/-2]a0890b7805bus: firewall: Fix missing static inline annotations for stubs [1 file, +9/-6]5766da2237ext4: inline: fix len overflow in ext4_prepare_inline_data [1 file, +1/-1]796632e6f8ext4: fix calculation of credits for extent tree modification [1 file, +6/-5]4b36399711ext4: ensure i_size is smaller than maxbytes [1 file, +2/-1]be5f3061a6ext4: only dirty folios when data journaling regular files [1 file, +6/-1]a0b1c91adaInput: gpio-keys - fix possible concurrent access in gpio_keys_irq_timer() [1 file, +2/-0]fed611bd8cf2fs: fix to do sanity check on ino and xnid [1 file, +6/-0]aaa644e7fff2fs: prevent kernel warning due to negative i_nlink from corrupted image [1 file, +9/-0]ee1b421c46f2fs: fix to do sanity check on sit_bitmap_size [1 file, +8/-0]f16a797dcewatchdog: fix watchdog may detect false positive of 
softlockup [1 file, +27/-14]02137179ffmm: fix ratelimit_pages update error in dirty_ratio_handler() [1 file, +1/-1]462eee6d42firmware: arm_scmi: Ensure that the message-id supports fastchannel [2 files, +45/-33]e3cf1ef571dm-verity: fix a memory leak if some arguments are specified multiple times [3 files, +24/-5]f2986bccf2dm: lock limits when reading them [1 file, +7/-1]ec5f0b4412ovl: Fix nested backing file paths [1 file, +2/-2]92776ca0ccremoteproc: core: Cleanup acquired resources when rproc_handle_resources() fails in rproc_attach() [1 file, +2/-3]f4ef928ca5remoteproc: core: Release rproc->clean_table after rproc_attach() fails [1 file, +1/-0]68e58f5791PCI: dwc: ep: Correct PBA offset in .set_msix() callback [1 file, +3/-2]b20701d594PCI: Add ACS quirk for Loongson PCIe [1 file, +23/-0]be0cf75cbdPCI: Fix lock symmetry in pci_slot_unlock() [1 file, +2/-1]7b45d2401dclocksource: Fix the CPUs' choice in the watchdog per CPU verification [1 file, +1/-1]c05aba32a9ACPICA: Avoid sequence overread in call to strncmp() [1 file, +1/-1]66613b13cdACPI: Add missing prototype for non CONFIG_SUSPEND/CONFIG_X86 case [1 file, +8/-1]33cd650d38pmdomain: core: Reset genpd->states to avoid freeing invalid data [1 file, +3/-1]f34e0c1556platform-msi: Add msi_remove_device_irq_domain() in platform_device_msi_free_irqs_all() [1 file, +1/-0]c519f81e9cgpiolib: of: Add polarity quirk for s5m8767 [1 file, +9/-0]1f152ae557PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn() [1 file, +1/-1]6c1151d53ctipc: use kfree_sensitive() for aead cleanup [1 file, +1/-1]b0e647442cf2fs: use vmalloc instead of kvmalloc in .init_{,de}compress_ctx [2 files, +15/-13]2d834477bbbpf: Check rcu_read_lock_trace_held() in bpf_map_lookup_percpu_elem() [1 file, +2/-1]77ff6aec7ccpufreq: scmi: Skip SCMI devices that aren't used by the CPUs [1 file, +35/-1]0a8446058ctcp: always seek for minimal rtt in tcp_rcv_rtt_update() [1 file, +8/-14]f97085d365tcp: remove zero TCP TS samples for autotuning [1 file, 
+5/-5]89b20c406etcp: fix initial tp->rcvq_space.space value for passive TS enabled flows [1 file, +3/-3]84c156a351tcp: add receive queue awareness in tcp_rcv_space_adjust() [2 files, +5/-3]3a9e74d158ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT [1 file, +4/-0]5eb9c50e0cnet: page_pool: Don't recycle into cache on PREEMPT_RT [1 file, +4/-0]8b0741b167xfrm: validate assignment of maximal possible SEQ number [1 file, +42/-10]8fdf2f79ebbpf: Pass the same orig_call value to trampoline functions [1 file, +1/-1]f0023d7a2af2fs: fix to bail out in get_new_segment() [2 files, +6/-1]448dc45eeabpf: Use proper type to calculate bpf_raw_tp_null_args.mask index [1 file, +2/-2]78f768e36cnet: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions [1 file, +69/-8]4b3383110bsoftware node: Correct a OOB check in software_node_get_reference_args() [1 file, +1/-1]b7129ef57dsock: Correct error checking condition for (assign|release)_proto_idx() [1 file, +2/-2]a58f0a0e99f2fs: fix to set atomic write status more clear [3 files, +12/-2]b8b4b8bb34bpf, sockmap: Fix data lost during EAGAIN retries [1 file, +2/-1]7c41f73b64fs/xattr.c: fix simple_xattr_list() [1 file, +1/-0]2e10dc9c2aio_uring/kbuf: don't truncate end buffer for multiple buffer peeks [1 file, +4/-1]1a4254ab06io_uring: fix task leak issue in io_wq_create() [1 file, +3/-1]4220cc0b98nvme: always punt polled uring_cmd end_io work to task_work [1 file, +7/-14]f9b97d466enet_sched: sch_sfq: reject invalid perturb period [1 file, +8/-2]2a3ad42a57net: clear the dst when changing skb protocol [1 file, +13/-6]510a29d776mm: close theoretical race where stale TLB entries could linger [1 file, +2/-0]57ec081869sched_ext, sched/core: Don't call scx_group_set_weight() prematurely from sched_create_group() [3 files, +9/-2]3d828519bdatm: Revert atm_account_tx() if copy_from_iter_full() fails. 
[3 files, +8/-1]47f34289d1arm64: Restrict pagetable teardown to avoid false warning [1 file, +2/-1]9cf5b2a3b7mm/hugetlb: unshare page tables during VMA split, not before [5 files, +57/-16]dc5f0aef9enet: Fix checksum update for ILA adj-transport [4 files, +7/-7]2516299184bpf: Fix L4 csum update on IPv6 in CHECKSUM_COMPLETE [3 files, +7/-2]50189d9c5eerofs: remove unused trace event erofs_destroy_inode [1 file, +0/-18]348e541fefipv6: remove leftover ip6 cookie initializer [1 file, +0/-2]3c44ebad5aipv6: replace ipcm6_init calls with ipcm6_init_sk [4 files, +3/-29]6b358b3adfio_uring/sqpoll: don't put task_struct on tctx setup failure [1 file, +1/-4]8873080b88workqueue: Initialize wq_isolated_cpumask in workqueue_init_early() [1 file, +2/-1]ac462a75fdnet: netmem: fix skb_ensure_writable with unreadable skbs [1 file, +0/-3]61b39e189dptp: allow reading of currently dialed frequency to succeed on free-running clocks [1 file, +2/-1]397c1faf8ftcp: fix tcp_packet_delayed() for tcp_is_non_sack_preventing_reopen() behavior [1 file, +25/-12]0d3d91c350tipc: fix null-ptr-deref when acquiring remote ip of ethernet bearer [1 file, +2/-2]31d50dfe9ctcp: fix passive TFO socket having invalid NAPI ID [1 file, +3/-0]0f8df5d6f2ublk: santizize the arguments from userspace when adding a device [1 file, +3/-0]456019adaaperf: Fix sample vs do_exit() [2 files, +16/-8]7335c33d62perf: Fix cgroup state vs ERROR [1 file, +30/-21]fd199366bfperf/core: Fix WARN in perf_cgroup_switch() [1 file, +20/-2]22f935bc86arm64/ptrace: Fix stack-out-of-bounds read in regs_get_kernel_stack_nth() [1 file, +1/-1] Changes in 6.12.35 configfs: Do not override creating attribute file failure in populate_attrs() crypto: marvell/cesa - Do not chain submitted requests gfs2: move msleep to sleepable context crypto: qat - add shutdown handler to qat_c3xxx crypto: qat - add shutdown handler to qat_420xx crypto: qat - add shutdown handler to qat_4xxx crypto: qat - add shutdown handler to qat_c62x crypto: qat - add shutdown 
handler to qat_dh895xcc ASoC: qcom: sdm845: Add error handling in sdm845_slim_snd_hw_params() ASoC: meson: meson-card-utils: use of_property_present() for DT parsing ASoC: amd: sof_amd_sdw: Fix unlikely uninitialized variable use in create_sdw_dailinks() io_uring: account drain memory to cgroup io_uring/kbuf: account ring io_buffer_list memory powerpc/pseries/msi: Avoid reading PCI device registers in reduced power states s390/pci: Remove redundant bus removal and disable from zpci_release_device() s390/pci: Prevent self deletion in disable_slot() s390/pci: Allow re-add of a reserved but not yet removed device s390/pci: Serialize device addition and removal regulator: max20086: Fix MAX200086 chip id regulator: max20086: Change enable gpio to optional net/mlx5_core: Add error handling inmlx5_query_nic_vport_qkey_viol_cntr() net/mlx5: Add error handling in mlx5_query_nic_vport_node_guid() wifi: p54: prevent buffer-overflow in p54_rx_eeprom_readback() wifi: mt76: mt7925: fix host interrupt register initialization wifi: ath11k: fix rx completion meta data corruption wifi: rtw88: usb: Upload the firmware in bigger chunks wifi: ath11k: fix ring-buffer corruption NFSD: unregister filesystem in case genl_register_family() fails NFSD: fix race between nfsd registration and exports_proc NFSD: Implement FATTR4_CLONE_BLKSIZE attribute nfsd: nfsd4_spo_must_allow() must check this is a v4 compound request nfsd: Initialize ssc before laundromat_work to prevent NULL dereference SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls NFSv4: Don't check for OPEN feature support in v4.1 fs/nfs/read: fix double-unlock bug in nfs_return_empty_folio() wifi: ath12k: fix ring-buffer corruption jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata() svcrdma: Unregister the device if svc_rdma_accept() fails wifi: rtw88: usb: Reduce control message timeout to 500 ms wifi: rtlwifi: disable ASPM for RTL8723BE with subsystem ID 11ad:1723 media: ov8856: suppress probe deferral 
errors media: ov5675: suppress probe deferral errors media: imx335: Use correct register width for HNUM media: nxp: imx8-isi: better handle the m2m usage_count media: i2c: ds90ub913: Fix returned fmt from .set_fmt() media: ccs-pll: Start VT pre-PLL multiplier search from correct value media: ov2740: Move pm-runtime cleanup on probe-errors to proper place media: ccs-pll: Start OP pre-PLL multiplier search from correct value media: ccs-pll: Correct the upper limit of maximum op_pre_pll_clk_div media: ccs-pll: Check for too high VT PLL multiplier in dual PLL case media: cxusb: no longer judge rbuf when the write fails media: davinci: vpif: Fix memory leak in probe error path media: gspca: Add error handling for stv06xx_read_sensor() media: i2c: imx335: Fix frame size enumeration media: imagination: fix a potential memory leak in e5010_probe() media: intel/ipu6: Fix dma mask for non-secure mode media: ipu6: Remove workaround for Meteor Lake ES2 media: mediatek: vcodec: Correct vsi_core framebuffer size media: omap3isp: use sgtable-based scatterlist wrappers media: v4l2-dev: fix error handling in __video_register_device() media: venus: Fix probe error handling media: videobuf2: use sgtable-based scatterlist wrappers media: vidtv: Terminating the subsequent process of initialization failure media: vivid: Change the siize of the composing media: imx-jpeg: Drop the first error frames media: imx-jpeg: Move mxc_jpeg_free_slot_data() ahead media: imx-jpeg: Reset slot data pointers when freed media: imx-jpeg: Cleanup after an allocation error media: uvcvideo: Return the number of processed controls media: uvcvideo: Send control events for partial succeeds media: uvcvideo: Fix deferred probing error arm64/mm: Close theoretical race where stale TLB entry remains valid ARM: 9447/1: arm/memremap: fix arch_memremap_can_ram_remap() ARM: omap: pmic-cpcap: do not mess around without CPCAP or OMAP4 ASoC: codecs: wcd9375: Fix double free of regulator supplies ASoC: codecs: wcd937x: Drop 
unused buck_supply block: use plug request list tail for one-shot backmerge attempt block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion bus: mhi: ep: Update read pointer only after buffer is written bus: mhi: host: Fix conflict between power_up and SYSERR can: kvaser_pciefd: refine error prone echo_skb_max handling logic can: tcan4x5x: fix power regulator retrieval during probe ceph: avoid kernel BUG for encrypted inode with unaligned file size ceph: set superblock s_magic for IMA fsmagic matching cgroup,freezer: fix incomplete freezing when attaching tasks bus: firewall: Fix missing static inline annotations for stubs ata: pata_via: Force PIO for ATAPI devices on VT6415/VT6330 ata: ahci: Disallow LPM for ASUSPRO-D840SA motherboard ata: ahci: Disallow LPM for Asus B550-F motherboard bus: fsl-mc: do not add a device-link for the UAPI used DPMCP device bus: fsl-mc: fix GET/SET_TAILDROP command ids ext4: inline: fix len overflow in ext4_prepare_inline_data ext4: fix calculation of credits for extent tree modification ext4: factor out ext4_get_maxbytes() ext4: ensure i_size is smaller than maxbytes ext4: only dirty folios when data journaling regular files Input: ims-pcu - check record size in ims_pcu_flash_firmware() Input: gpio-keys - fix possible concurrent access in gpio_keys_irq_timer() f2fs: fix to do sanity check on ino and xnid f2fs: prevent kernel warning due to negative i_nlink from corrupted image f2fs: fix to do sanity check on sit_bitmap_size hwmon: (ftsteutates) Fix TOCTOU race in fts_read() NFC: nci: uart: Set tty->disc_data only in success path net/sched: fix use-after-free in taprio_dev_notifier net: ftgmac100: select FIXED_PHY iommu/vt-d: Restore context entry setup order for aliased devices fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in fb_videomode_to_var EDAC/altera: Use correct write width with the INTTEST register fbdev: Fix fb_set_var to prevent null-ptr-deref in fb_videomode_to_var parisc/unaligned: Fix hex output to 
show 8 hex chars vgacon: Add check for vc_origin address range in vgacon_scroll() parisc: fix building with gcc-15 clk: meson-g12a: add missing fclk_div2 to spicc ipc: fix to protect IPCS lookups using RCU watchdog: fix watchdog may detect false positive of softlockup RDMA/iwcm: Fix use-after-free of work objects after cm_id destruction mm: fix ratelimit_pages update error in dirty_ratio_handler() soc: qcom: pmic_glink_altmode: fix spurious DP hotplug events configfs-tsm-report: Fix NULL dereference of tsm_ops firmware: arm_scmi: Ensure that the message-id supports fastchannel mtd: rawnand: sunxi: Add randomizer configuration in sunxi_nfc_hw_ecc_write_chunk mtd: nand: sunxi: Add randomizer configuration before randomizer enable KVM: SVM: Clear current_vmcb during vCPU free for all *possible* CPUs KVM: VMX: Flush shadow VMCS on emergency reboot dm-mirror: fix a tiny race condition dm-verity: fix a memory leak if some arguments are specified multiple times mtd: rawnand: qcom: Fix read len for onfi param page ftrace: Fix UAF when lookup kallsym after ftrace disabled dm: lock limits when reading them phy: fsl-imx8mq-usb: fix phy_tx_vboost_level_from_property() net: ch9200: fix uninitialised access during mii_nway_restart KVM: s390: rename PROT_NONE to PROT_TYPE_DUMMY sysfb: Fix screen_info type check for VGA video: screen_info: Relocate framebuffers behind PCI bridges pwm: axi-pwmgen: fix missing separate external clock staging: iio: ad5933: Correct settling cycles encoding per datasheet mips: Add -std= flag specified in KBUILD_CFLAGS to vdso CFLAGS ovl: Fix nested backing file paths regulator: max14577: Add error check for max14577_read_reg() remoteproc: core: Cleanup acquired resources when rproc_handle_resources() fails in rproc_attach() remoteproc: core: Release rproc->clean_table after rproc_attach() fails remoteproc: k3-m4: Don't assert reset in detach routine cifs: reset connections for all channels when reconnect requested cifs: update dstaddr whenever channel 
iface is updated cifs: dns resolution is needed only for primary channel smb: client: add NULL check in automount_fullpath Drivers: hv: Allocate interrupt and monitor pages aligned to system page boundary uio_hv_generic: Use correct size for interrupt and monitor pages uio_hv_generic: Align ring size to system page PCI: cadence-ep: Correct PBA offset in .set_msix() callback PCI: dwc: ep: Correct PBA offset in .set_msix() callback PCI: Add ACS quirk for Loongson PCIe PCI: Fix lock symmetry in pci_slot_unlock() PCI: dw-rockchip: Remove PCIE_L0S_ENTRY check from rockchip_pcie_link_up() PCI: dw-rockchip: Fix PHY function call sequence in rockchip_pcie_phy_deinit() iio: accel: fxls8962af: Fix temperature scan element sign accel/ivpu: Improve buffer object logging accel/ivpu: Use firmware names from upstream repo accel/ivpu: Use dma_resv_lock() instead of a custom mutex accel/ivpu: Fix warning in ivpu_gem_bo_free() dummycon: Trigger redraw when switching consoles with deferred takeover mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race iio: imu: inv_icm42600: Fix temperature calculation iio: adc: ad7944: mask high bits on direct read iio: adc: ti-ads1298: Kconfig: add kfifo dependency to fix module build iio: adc: ad7606_spi: fix reg write value mask ACPICA: fix acpi operand cache leak in dswstate.c ASoC: amd: yc: Add quirk for Lenovo Yoga Pro 7 14ASP9 clocksource: Fix the CPUs' choice in the watchdog per CPU verification power: supply: collie: Fix wakeup source leaks on device unbind mmc: Add quirk to disable DDR50 tuning ACPICA: Avoid sequence overread in call to strncmp() ASoC: tas2770: Power cycle amp on ISENSE/VSENSE change ASoC: intel/sdw_utils: Assign initial value in asoc_sdw_rt_amp_spk_rtd_init() ACPI: bus: Bail out if acpi_kobj registration fails ACPI: Add missing prototype for non CONFIG_SUSPEND/CONFIG_X86 case ACPICA: fix acpi parse and parseext cache leaks ACPICA: Apply pack(1) to union aml_resource ALSA: hda: cs35l41: Fix swapped l/r audio channels for 
Acer Helios laptops power: supply: bq27xxx: Retrieve again when busy pmdomain: core: Reset genpd->states to avoid freeing invalid data ACPICA: utilities: Fix overflow check in vsnprintf() platform-msi: Add msi_remove_device_irq_domain() in platform_device_msi_free_irqs_all() ASoC: tegra210_ahub: Add check to of_device_get_match_data() Make 'cc-option' work correctly for the -Wno-xyzzy pattern gpiolib: of: Add polarity quirk for s5m8767 PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn() power: supply: max17040: adjust thermal channel scaling ACPI: battery: negate current when discharging net: macb: Check return value of dma_set_mask_and_coherent() net: lan743x: Modify the EEPROM and OTP size for PCI1xxxx devices tipc: use kfree_sensitive() for aead cleanup f2fs: use vmalloc instead of kvmalloc in .init_{,de}compress_ctx bpf: Check rcu_read_lock_trace_held() in bpf_map_lookup_percpu_elem() Bluetooth: btusb: Add new VID/PID 13d3/3584 for MT7922 i2c: designware: Invoke runtime suspend on quick slave re-registration wifi: mt76: mt7996: drop fragments with multicast or broadcast RA emulex/benet: correct command version selection in be_cmd_get_stats() Bluetooth: btusb: Add new VID/PID 13d3/3630 for MT7925 wifi: mt76: mt76x2: Add support for LiteOn WN4516R,WN4519R wifi: mt76: mt7921: add 160 MHz AP for mt7922 device wifi: mt76: mt7925: introduce thermal protection wifi: mac80211: validate SCAN_FLAG_AP in scan request during MLO sctp: Do not wake readers in __sctp_write_space() libbpf/btf: Fix string handling to support multi-split BTF cpufreq: scmi: Skip SCMI devices that aren't used by the CPUs i2c: tegra: check msg length in SMBUS block read i2c: npcm: Add clock toggle recovery clk: qcom: gcc-x1e80100: Set FORCE MEM CORE for UFS clocks net: dlink: add synchronization for stats update wifi: ath12k: fix macro definition HAL_RX_MSDU_PKT_LENGTH_GET wifi: ath12k: fix a possible dead lock caused by ab->base_lock wifi: ath11k: Fix QMI memory reuse logic 
iommu/amd: Allow matching ACPI HID devices without matching UIDs wifi: rtw89: leave idle mode when setting WEP encryption for AP mode tcp: always seek for minimal rtt in tcp_rcv_rtt_update() tcp: remove zero TCP TS samples for autotuning tcp: fix initial tp->rcvq_space.space value for passive TS enabled flows tcp: add receive queue awareness in tcp_rcv_space_adjust() x86/sgx: Prevent attempts to reclaim poisoned pages ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT net: page_pool: Don't recycle into cache on PREEMPT_RT xfrm: validate assignment of maximal possible SEQ number net: atlantic: generate software timestamp just before the doorbell pinctrl: armada-37xx: propagate error from armada_37xx_pmx_set_by_name() pinctrl: armada-37xx: propagate error from armada_37xx_gpio_get_direction() bpf: Pass the same orig_call value to trampoline functions net: stmmac: generate software timestamp just before the doorbell pinctrl: armada-37xx: propagate error from armada_37xx_pmx_gpio_set_direction() libbpf: Check bpf_map_skeleton link for NULL pinctrl: armada-37xx: propagate error from armada_37xx_gpio_get() net: mlx4: add SOF_TIMESTAMPING_TX_SOFTWARE flag when getting ts info net: vertexcom: mse102x: Return code for mse102x_rx_pkt_spi wireless: purelifi: plfxlc: fix memory leak in plfxlc_usb_wreq_asyn() wifi: mac80211: do not offer a mesh path if forwarding is disabled clk: rockchip: rk3036: mark ddrphy as critical hid-asus: check ROG Ally MCU version and warn wifi: iwlwifi: mvm: fix beacon CCK flag f2fs: fix to bail out in get_new_segment() netfilter: nft_set_pipapo: clamp maximum map bucket size to INT_MAX libbpf: Add identical pointer detection to btf_dedup_is_equiv() scsi: lpfc: Fix lpfc_check_sli_ndlp() handling for GEN_REQUEST64 commands scsi: smartpqi: Add new PCI IDs iommu/amd: Ensure GA log notifier callbacks finish running before module unload wifi: iwlwifi: pcie: make sure to lock rxq->read wifi: rtw89: 8922a: fix TX fail with wrong VCO setting wifi: 
mac80211_hwsim: Prevent tsf from setting if beacon is disabled netdevsim: Mark NAPI ID on skb in nsim_rcv net/mlx5: HWS, Fix IP version decision bpf: Use proper type to calculate bpf_raw_tp_null_args.mask index wifi: mac80211: VLAN traffic in multicast path Revert "mac80211: Dynamically set CoDel parameters per station" wifi: iwlwifi: Add missing MODULE_FIRMWARE for Qu-c0-jf-b0 net: bridge: mcast: update multicast contex when vlan state is changed net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions vxlan: Do not treat dst cache initialization errors as fatal bnxt_en: Remove unused field "ref_count" in struct bnxt_ulp wifi: ath12k: using msdu end descriptor to check for rx multicast packets net: ethernet: ti: am65-cpsw: handle -EPROBE_DEFER software node: Correct a OOB check in software_node_get_reference_args() isofs: fix Y2038 and Y2156 issues in Rock Ridge TF entry pinctrl: mcp23s08: Reset all pins to input at probe wifi: ath12k: fix failed to set mhi state error during reboot with hardware grouping scsi: lpfc: Use memcpy() for BIOS version sock: Correct error checking condition for (assign|release)_proto_idx() i40e: fix MMIO write access to an invalid page in i40e_clear_hw ixgbe: Fix unreachable retry logic in combined and byte I2C write functions RDMA/hns: initialize db in update_srq_db() ice: fix check for existing switch rule usbnet: asix AX88772: leave the carrier control to phylink f2fs: fix to set atomic write status more clear bpf, sockmap: Fix data lost during EAGAIN retries net: ethernet: cortina: Use TOE/TSO on all TCP octeontx2-pf: Add error log forcn10k_map_unmap_rq_policer() wifi: ath11k: determine PM policy based on machine model wifi: ath12k: fix link valid field initialization in the monitor Rx wifi: ath12k: fix incorrect CE addresses wifi: ath12k: Pass correct values of center freq1 and center freq2 for 160 MHz net/mlx5: HWS, Harden IP version definer checks fbcon: Make sure modelist not set on unregistered console 
watchdog: da9052_wdt: respect TWDMIN bus: fsl-mc: increase MC_CMD_COMPLETION_TIMEOUT_MS value ARM: OMAP2+: Fix l4ls clk domain handling in STANDBY tee: Prevent size calculation wraparound on 32-bit kernels Revert "bus: ti-sysc: Probe for l4_wkup and l4_cfg interconnect devices first" fs/xattr.c: fix simple_xattr_list() platform/x86/amd: pmc: Clear metrics table at start of cycle platform/x86/amd: pmf: Prevent amd_pmf_tee_deinit() from running twice platform/x86: dell_rbu: Fix list usage platform/x86: dell_rbu: Stop overwriting data buffer powerpc/vdso: Fix build of VDSO32 with pcrel powerpc/eeh: Fix missing PE bridge reconfiguration during VFIO EEH recovery io_uring/kbuf: don't truncate end buffer for multiple buffer peeks io_uring: fix task leak issue in io_wq_create() drivers/rapidio/rio_cm.c: prevent possible heap overwrite platform/loongarch: laptop: Get brightness setting from EC on probe platform/loongarch: laptop: Unregister generic_sub_drivers on exit platform/loongarch: laptop: Add backlight power control support LoongArch: vDSO: Correctly use asm parameters in syscall wrappers LoongArch: Avoid using $r0/$r1 as "mask" for csrxchg LoongArch: Fix panic caused by NULL-PMD in huge_pte_offset() jffs2: check that raw node were preallocated before writing summary jffs2: check jffs2_prealloc_raw_node_refs() result in few other places cifs: deal with the channel loading lag while picking channels cifs: serialize other channels when query server interfaces is pending cifs: do not disable interface polling on failure smb: improve directory cache reuse for readdir operations scsi: storvsc: Increase the timeouts to storvsc_timeout scsi: s390: zfcp: Ensure synchronous unit_add nvme: always punt polled uring_cmd end_io work to task_work net_sched: sch_sfq: reject invalid perturb period net: clear the dst when changing skb protocol mm: close theoretical race where stale TLB entries could linger udmabuf: use sgtable-based scatterlist wrappers x86/virt/tdx: Avoid indirect 
calls to TDX assembly functions selftests/x86: Add a test to detect infinite SIGTRAP handler loop ksmbd: fix null pointer dereference in destroy_previous_session platform/x86: ideapad-laptop: use usleep_range() for EC polling selinux: fix selinux_xfrm_alloc_user() to set correct ctx_len platform/x86/intel-uncore-freq: Fail module load when plat_info is NULL sched_ext, sched/core: Don't call scx_group_set_weight() prematurely from sched_create_group() atm: Revert atm_account_tx() if copy_from_iter_full() fails. wifi: rtw89: phy: add dummy C2H event handler for report of TAS power cpufreq/amd-pstate: Add missing NULL ptr check in amd_pstate_update Input: sparcspkr - avoid unannotated fall-through wifi: ath12k: Clear affinity hint before calling ath12k_pci_free_irq() in error path wifi: cfg80211: init wiphy_work before allocating rfkill fails arm64: Restrict pagetable teardown to avoid false warning ALSA: usb-audio: Rename ALSA kcontrol PCM and PCM1 for the KTMicro sound card ALSA: hda/intel: Add Thinkpad E15 to PM deny list ALSA: hda/realtek - Add mute LED support for HP Victus 16-s1xxx and HP Victus 15-fa1xxx ALSA: hda/realtek: enable headset mic on Latitude 5420 Rugged ALSA: hda/realtek: Fix built-in mic on ASUS VivoBook X513EA ALSA: hda/realtek: Add quirk for Asus GU605C iio: accel: fxls8962af: Fix temperature calculation mm/hugetlb: unshare page tables during VMA split, not before drm/amdgpu: read back register after written for VCN v4.0.5 kbuild: rust: add rustc-min-version support function rust: compile libcore with edition 2024 for 1.87+ net: Fix checksum update for ILA adj-transport bpf: Fix L4 csum update on IPv6 in CHECKSUM_COMPLETE erofs: remove unused trace event erofs_destroy_inode nfsd: use threads array as-is in netlink interface sunrpc: handle SVC_GARBAGE during svc auth processing as auth error drm/v3d: Avoid NULL pointer dereference in `v3d_job_update_stats()` Kunit to check the longest symbol length x86/tools: Drop duplicate unlikely() definition 
in insn_decoder_test.c ipv6: remove leftover ip6 cookie initializer ipv6: replace ipcm6_init calls with ipcm6_init_sk smb: fix secondary channel creation issue with kerberos by populating hostname when adding channels drm/msm/disp: Correct porch timing for SDM845 drm/msm/dsi/dsi_phy_10nm: Fix missing initial VCO rate drm/msm: Fix CP_RESET_CONTEXT_STATE bitfield names drm/msm/a7xx: Call CP_RESET_CONTEXT_STATE drm/ssd130x: fix ssd132x_clear_screen() columns ionic: Prevent driver/fw getting out of sync on devcmd(s) drm/nouveau/bl: increase buffer size to avoid truncate warning drm/i915/pmu: Fix build error with GCOV and AutoFDO enabled hwmon: (occ) Rework attribute registration for stack usage hwmon: (occ) fix unaligned accesses hwmon: (ltc4282) avoid repeated register write pldmfw: Select CRC32 when PLDMFW is selected aoe: clean device rq_list in aoedev_downdev() io_uring/sqpoll: don't put task_struct on tctx setup failure net: ice: Perform accurate aRFS flow match ice: fix eswitch code memory leak in reset scenario e1000e: set fixed clock frequency indication for Nahum 11 and Nahum 13 workqueue: Initialize wq_isolated_cpumask in workqueue_init_early() ksmbd: add free_transport ops in ksmbd connection net: netmem: fix skb_ensure_writable with unreadable skbs bnxt_en: Fix double invocation of bnxt_ulp_stop()/bnxt_ulp_start() eth: bnxt: fix out-of-range access of vnic_info array bnxt_en: Add a helper function to configure MRU and RSS bnxt_en: Update MRU and RSS table of RSS contexts on queue reset ptp: fix breakage after ptp_vclock_in_use() rework ptp: allow reading of currently dialed frequency to succeed on free-running clocks wifi: carl9170: do not ping device which has failed to load firmware mpls: Use rcu_dereference_rtnl() in mpls_route_input_rcu(). atm: atmtcp: Free invalid length skb in atmtcp_c_send(). 
tcp: fix tcp_packet_delayed() for tcp_is_non_sack_preventing_reopen() behavior tipc: fix null-ptr-deref when acquiring remote ip of ethernet bearer tcp: fix passive TFO socket having invalid NAPI ID eth: fbnic: avoid double free when failing to DMA-map FW msg net: lan743x: fix potential out-of-bounds write in lan743x_ptp_io_event_clock_get() ublk: santizize the arguments from userspace when adding a device drm/xe: Wire up device shutdown handler drm/xe/gt: Update handling of xe_force_wake_get return drm/xe/bmg: Update Wa_16023588340 calipso: Fix null-ptr-deref in calipso_req_{set,del}attr(). mlxbf_gige: return EPROBE_DEFER if PHY IRQ is not available net: atm: add lec_mutex net: atm: fix /proc/net/atm/lec handling EDAC/amd64: Correct number of UMCs for family 19h models 70h-7fh dt-bindings: i2c: nvidia,tegra20-i2c: Specify the required properties smb: Log an error when close_all_cached_dirs fails serial: sh-sci: Clean sci_ports[0] after at earlycon exit serial: sh-sci: Increment the runtime usage counter for the earlycon device smb: client: fix first command failure during re-negotiation smb: client: fix max_sge overflow in smb_extract_folioq_to_rdma() s390/pci: Fix __pcilg_mio_inuser() inline assembly perf: Fix sample vs do_exit() perf: Fix cgroup state vs ERROR perf/core: Fix WARN in perf_cgroup_switch() arm64/ptrace: Fix stack-out-of-bounds read in regs_get_kernel_stack_nth() scsi: elx: efct: Fix memory leak in efct_hw_parse_filter() RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs gpio: pca953x: fix wrong error probe return value perf evsel: Missed close() when probing hybrid core PMUs perf test: Directory file descriptor leak gpio: mlxbf3: only get IRQ for device instance 0 cifs: Remove duplicate fattr->cf_dtype assignment from wsl_to_fattr() function bpftool: Fix cgroup command to only show cgroup bpf programs Linux 6.12.35 Change-Id: Ida57d269272a624bedb979bfad0b3c5e7df7e846 Signed-off-by: Greg 
Kroah-Hartman <gregkh@google.com>
1164 lines
32 KiB
C
1164 lines
32 KiB
C
/* SPDX-License-Identifier: GPL-2.0
|
|
*
|
|
* page_pool.c
|
|
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
|
|
* Copyright (C) 2016 Red Hat, Inc.
|
|
*/
|
|
|
|
#include <linux/error-injection.h>
|
|
#include <linux/types.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/device.h>
|
|
|
|
#include <net/netdev_rx_queue.h>
|
|
#include <net/page_pool/helpers.h>
|
|
#include <net/xdp.h>
|
|
|
|
#include <linux/dma-direction.h>
|
|
#include <linux/dma-mapping.h>
|
|
#include <linux/page-flags.h>
|
|
#include <linux/mm.h> /* for put_page() */
|
|
#include <linux/poison.h>
|
|
#include <linux/ethtool.h>
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <trace/events/page_pool.h>
|
|
|
|
#include "mp_dmabuf_devmem.h"
|
|
#include "netmem_priv.h"
|
|
#include "page_pool_priv.h"
|
|
|
|
DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
|
|
|
|
#define DEFER_TIME (msecs_to_jiffies(1000))
|
|
#define DEFER_WARN_INTERVAL (60 * HZ)
|
|
|
|
#define BIAS_MAX (LONG_MAX >> 1)
|
|
|
|
#ifdef CONFIG_PAGE_POOL_STATS
|
|
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
|
|
|
|
/* alloc_stat_inc is intended to be used in softirq context.
 * Bumps pool->alloc_stats.<__stat> with a plain (non-atomic) increment.
 * @pool may be any pointer expression; it is parenthesised before use.
 */
#define alloc_stat_inc(pool, __stat)	((pool)->alloc_stats.__stat++)

/* recycle_stat_inc is safe to use when preemption is possible.
 * Increments this CPU's copy of the per-cpu recycle counter <__stat>.
 */
#define recycle_stat_inc(pool, __stat) \
	do { \
		struct page_pool_recycle_stats __percpu *s = \
			(pool)->recycle_stats; \
		this_cpu_inc(s->__stat); \
	} while (0)

/* recycle_stat_add adds @val to this CPU's copy of the per-cpu recycle
 * counter <__stat>; same structure as recycle_stat_inc.
 */
#define recycle_stat_add(pool, __stat, val) \
	do { \
		struct page_pool_recycle_stats __percpu *s = \
			(pool)->recycle_stats; \
		this_cpu_add(s->__stat, (val)); \
	} while (0)
|
|
|
|
/* ethtool string names for the page pool counters.
 *
 * The entries are listed in the same order in which
 * page_pool_ethtool_stats_get() writes the values: the six allocation
 * counters first, followed by the five recycle counters.
 */
static const char pp_stats[][ETH_GSTRING_LEN] = {
	/* allocation side */
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	/* recycle side */
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};
|
|
|
|
/**
|
|
* page_pool_get_stats() - fetch page pool stats
|
|
* @pool: pool from which page was allocated
|
|
* @stats: struct page_pool_stats to fill in
|
|
*
|
|
* Retrieve statistics about the page_pool. This API is only available
|
|
* if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
|
|
* A pointer to a caller allocated struct page_pool_stats structure
|
|
* is passed to this API which is filled in. The caller can then report
|
|
* those stats to the user (perhaps via ethtool, debugfs, etc.).
|
|
*/
|
|
bool page_pool_get_stats(const struct page_pool *pool,
|
|
struct page_pool_stats *stats)
|
|
{
|
|
int cpu = 0;
|
|
|
|
if (!stats)
|
|
return false;
|
|
|
|
/* The caller is responsible to initialize stats. */
|
|
stats->alloc_stats.fast += pool->alloc_stats.fast;
|
|
stats->alloc_stats.slow += pool->alloc_stats.slow;
|
|
stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
|
|
stats->alloc_stats.empty += pool->alloc_stats.empty;
|
|
stats->alloc_stats.refill += pool->alloc_stats.refill;
|
|
stats->alloc_stats.waive += pool->alloc_stats.waive;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
const struct page_pool_recycle_stats *pcpu =
|
|
per_cpu_ptr(pool->recycle_stats, cpu);
|
|
|
|
stats->recycle_stats.cached += pcpu->cached;
|
|
stats->recycle_stats.cache_full += pcpu->cache_full;
|
|
stats->recycle_stats.ring += pcpu->ring;
|
|
stats->recycle_stats.ring_full += pcpu->ring_full;
|
|
stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
EXPORT_SYMBOL(page_pool_get_stats);
|
|
|
|
/* Copy every pp_stats[] name into @data as consecutive ETH_GSTRING_LEN
 * sized slots.
 *
 * Return: pointer just past the last slot written.
 */
u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	u8 *slot = data;
	int idx = 0;

	while (idx < ARRAY_SIZE(pp_stats)) {
		memcpy(slot, pp_stats[idx], ETH_GSTRING_LEN);
		slot += ETH_GSTRING_LEN;
		idx++;
	}

	return slot;
}
|
|
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
|
|
|
|
int page_pool_ethtool_stats_get_count(void)
|
|
{
|
|
return ARRAY_SIZE(pp_stats);
|
|
}
|
|
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
|
|
|
|
/* Write the counters held in @stats (a struct page_pool_stats) into the
 * u64 array @data, in the same order as the names in pp_stats[].
 *
 * Return: pointer just past the last value written.
 */
u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
	const struct page_pool_stats *pool_stats = stats;
	unsigned int n = 0;

	/* allocation counters */
	data[n++] = pool_stats->alloc_stats.fast;
	data[n++] = pool_stats->alloc_stats.slow;
	data[n++] = pool_stats->alloc_stats.slow_high_order;
	data[n++] = pool_stats->alloc_stats.empty;
	data[n++] = pool_stats->alloc_stats.refill;
	data[n++] = pool_stats->alloc_stats.waive;

	/* recycle counters */
	data[n++] = pool_stats->recycle_stats.cached;
	data[n++] = pool_stats->recycle_stats.cache_full;
	data[n++] = pool_stats->recycle_stats.ring;
	data[n++] = pool_stats->recycle_stats.ring_full;
	data[n++] = pool_stats->recycle_stats.released_refcnt;

	return data + n;
}
|
|
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
|
|
|
|
#else
|
|
/* Statistics compiled out: the helpers expand to empty statements and
 * never evaluate their arguments.
 */
#define alloc_stat_inc(...)	do { } while (0)
#define recycle_stat_inc(...)	do { } while (0)
#define recycle_stat_add(...)	do { } while (0)
|
|
#endif
|
|
|
|
static bool page_pool_producer_lock(struct page_pool *pool)
|
|
__acquires(&pool->ring.producer_lock)
|
|
{
|
|
bool in_softirq = in_softirq();
|
|
|
|
if (in_softirq)
|
|
spin_lock(&pool->ring.producer_lock);
|
|
else
|
|
spin_lock_bh(&pool->ring.producer_lock);
|
|
|
|
return in_softirq;
|
|
}
|
|
|
|
static void page_pool_producer_unlock(struct page_pool *pool,
|
|
bool in_softirq)
|
|
__releases(&pool->ring.producer_lock)
|
|
{
|
|
if (in_softirq)
|
|
spin_unlock(&pool->ring.producer_lock);
|
|
else
|
|
spin_unlock_bh(&pool->ring.producer_lock);
|
|
}
|
|
|
|
static void page_pool_struct_check(void)
|
|
{
|
|
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
|
|
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
|
|
CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
|
|
CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
|
|
PAGE_POOL_FRAG_GROUP_ALIGN);
|
|
}
|
|
|
|
static int page_pool_init(struct page_pool *pool,
|
|
const struct page_pool_params *params,
|
|
int cpuid)
|
|
{
|
|
unsigned int ring_qsize = 1024; /* Default */
|
|
struct netdev_rx_queue *rxq;
|
|
int err;
|
|
|
|
page_pool_struct_check();
|
|
|
|
memcpy(&pool->p, ¶ms->fast, sizeof(pool->p));
|
|
memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow));
|
|
|
|
pool->cpuid = cpuid;
|
|
|
|
/* Validate only known flags were used */
|
|
if (pool->slow.flags & ~PP_FLAG_ALL)
|
|
return -EINVAL;
|
|
|
|
if (pool->p.pool_size)
|
|
ring_qsize = pool->p.pool_size;
|
|
|
|
/* Sanity limit mem that can be pinned down */
|
|
if (ring_qsize > 32768)
|
|
return -E2BIG;
|
|
|
|
/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
|
|
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
|
|
* which is the XDP_TX use-case.
|
|
*/
|
|
if (pool->slow.flags & PP_FLAG_DMA_MAP) {
|
|
if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
|
|
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
|
|
return -EINVAL;
|
|
|
|
pool->dma_map = true;
|
|
}
|
|
|
|
if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
|
|
/* In order to request DMA-sync-for-device the page
|
|
* needs to be mapped
|
|
*/
|
|
if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
|
|
return -EINVAL;
|
|
|
|
if (!pool->p.max_len)
|
|
return -EINVAL;
|
|
|
|
pool->dma_sync = true;
|
|
|
|
/* pool->p.offset has to be set according to the address
|
|
* offset used by the DMA engine to start copying rx data
|
|
*/
|
|
}
|
|
|
|
pool->has_init_callback = !!pool->slow.init_callback;
|
|
|
|
#ifdef CONFIG_PAGE_POOL_STATS
|
|
if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
|
|
pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
|
|
if (!pool->recycle_stats)
|
|
return -ENOMEM;
|
|
} else {
|
|
/* For system page pool instance we use a singular stats object
|
|
* instead of allocating a separate percpu variable for each
|
|
* (also percpu) page pool instance.
|
|
*/
|
|
pool->recycle_stats = &pp_system_recycle_stats;
|
|
pool->system = true;
|
|
}
|
|
#endif
|
|
|
|
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
|
|
#ifdef CONFIG_PAGE_POOL_STATS
|
|
if (!pool->system)
|
|
free_percpu(pool->recycle_stats);
|
|
#endif
|
|
return -ENOMEM;
|
|
}
|
|
|
|
atomic_set(&pool->pages_state_release_cnt, 0);
|
|
|
|
/* Driver calling page_pool_create() also call page_pool_destroy() */
|
|
refcount_set(&pool->user_cnt, 1);
|
|
|
|
if (pool->dma_map)
|
|
get_device(pool->p.dev);
|
|
|
|
if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
|
|
/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
|
|
* configuration doesn't change while we're initializing
|
|
* the page_pool.
|
|
*/
|
|
ASSERT_RTNL();
|
|
rxq = __netif_get_rx_queue(pool->slow.netdev,
|
|
pool->slow.queue_idx);
|
|
pool->mp_priv = rxq->mp_params.mp_priv;
|
|
}
|
|
|
|
if (pool->mp_priv) {
|
|
err = mp_dmabuf_devmem_init(pool);
|
|
if (err) {
|
|
pr_warn("%s() mem-provider init failed %d\n", __func__,
|
|
err);
|
|
goto free_ptr_ring;
|
|
}
|
|
|
|
static_branch_inc(&page_pool_mem_providers);
|
|
}
|
|
|
|
return 0;
|
|
|
|
free_ptr_ring:
|
|
ptr_ring_cleanup(&pool->ring, NULL);
|
|
#ifdef CONFIG_PAGE_POOL_STATS
|
|
if (!pool->system)
|
|
free_percpu(pool->recycle_stats);
|
|
#endif
|
|
return err;
|
|
}
|
|
|
|
static void page_pool_uninit(struct page_pool *pool)
|
|
{
|
|
ptr_ring_cleanup(&pool->ring, NULL);
|
|
|
|
if (pool->dma_map)
|
|
put_device(pool->p.dev);
|
|
|
|
#ifdef CONFIG_PAGE_POOL_STATS
|
|
if (!pool->system)
|
|
free_percpu(pool->recycle_stats);
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* page_pool_create_percpu() - create a page pool for a given cpu.
|
|
* @params: parameters, see struct page_pool_params
|
|
* @cpuid: cpu identifier
|
|
*/
|
|
struct page_pool *
|
|
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
|
|
{
|
|
struct page_pool *pool;
|
|
int err;
|
|
|
|
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
|
|
if (!pool)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
err = page_pool_init(pool, params, cpuid);
|
|
if (err < 0)
|
|
goto err_free;
|
|
|
|
err = page_pool_list(pool);
|
|
if (err)
|
|
goto err_uninit;
|
|
|
|
return pool;
|
|
|
|
err_uninit:
|
|
page_pool_uninit(pool);
|
|
err_free:
|
|
pr_warn("%s() gave up with errno %d\n", __func__, err);
|
|
kfree(pool);
|
|
return ERR_PTR(err);
|
|
}
|
|
EXPORT_SYMBOL(page_pool_create_percpu);
|
|
|
|
/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 *
 * Same as page_pool_create_percpu() with a cpuid of -1.
 *
 * Return: whatever page_pool_create_percpu() returns.
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool = page_pool_create_percpu(params, -1);

	return pool;
}
|
|
EXPORT_SYMBOL(page_pool_create);
|
|
|
|
static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
|
|
|
|
/* Move entries from the ptr_ring into the alloc cache and hand out one.
 *
 * Entries are consumed until the ring runs dry, the cache holds
 * PP_ALLOC_CACHE_REFILL entries, or an entry fails the preferred-node
 * check; such an entry is returned to the page allocator and the refill
 * stops.
 *
 * Return: a netmem taken from the alloc cache, or 0 when none is
 * available.
 */
static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *ring = &pool->ring;
	netmem_ref netmem;
	int want_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(ring)) {
		alloc_stat_inc(pool, empty);
		return 0;
	}

	/* Softirq guarantee CPU and thus NUMA node is stable. This,
	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	want_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	want_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	for (;;) {
		netmem = (__force netmem_ref)__ptr_ring_consume(ring);
		if (unlikely(!netmem))
			break;

		if (unlikely(!netmem_is_pref_nid(netmem, want_nid))) {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, netmem);
			alloc_stat_inc(pool, waive);
			netmem = 0;
			break;
		}

		pool->alloc.cache[pool->alloc.count++] = netmem;
		if (pool->alloc.count >= PP_ALLOC_CACHE_REFILL)
			break;
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		netmem = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return netmem;
}
|
|
|
|
/* fast path */
|
|
static netmem_ref __page_pool_get_cached(struct page_pool *pool)
|
|
{
|
|
netmem_ref netmem;
|
|
|
|
/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
|
|
if (likely(pool->alloc.count)) {
|
|
/* Fast-path */
|
|
netmem = pool->alloc.cache[--pool->alloc.count];
|
|
alloc_stat_inc(pool, fast);
|
|
} else {
|
|
netmem = page_pool_refill_alloc_cache(pool);
|
|
}
|
|
|
|
return netmem;
|
|
}
|
|
|
|
static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
|
|
netmem_ref netmem,
|
|
u32 dma_sync_size)
|
|
{
|
|
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
|
|
dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
|
|
|
|
dma_sync_size = min(dma_sync_size, pool->p.max_len);
|
|
__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
|
|
dma_sync_size, pool->p.dma_dir);
|
|
#endif
|
|
}
|
|
|
|
static __always_inline void
|
|
page_pool_dma_sync_for_device(const struct page_pool *pool,
|
|
netmem_ref netmem,
|
|
u32 dma_sync_size)
|
|
{
|
|
if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
|
|
__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
|
|
}
|
|
|
|
/* DMA-map @netmem for the pool's device and record the address in it.
 *
 * Return: true on success; false when the mapping fails or when the
 * resulting address cannot be stored (the mapping is undone again).
 */
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
	dma_addr_t addr;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for lifetime of page, until leaving pool.
	 */
	addr = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
				  (PAGE_SIZE << pool->p.order),
				  pool->p.dma_dir,
				  DMA_ATTR_SKIP_CPU_SYNC |
				  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, addr))
		return false;

	if (!page_pool_set_dma_addr_netmem(netmem, addr)) {
		page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
		return true;
	}

	/* The address could not be stored: give the mapping back. */
	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, addr,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}
|
|
|
|
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
|
|
gfp_t gfp)
|
|
{
|
|
struct page *page;
|
|
|
|
gfp |= __GFP_COMP;
|
|
page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
|
|
if (unlikely(!page))
|
|
return NULL;
|
|
|
|
if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
|
|
put_page(page);
|
|
return NULL;
|
|
}
|
|
|
|
alloc_stat_inc(pool, slow_high_order);
|
|
page_pool_set_pp_info(pool, page_to_netmem(page));
|
|
|
|
/* Track how many pages are held 'in-flight' */
|
|
pool->pages_state_hold_cnt++;
|
|
trace_page_pool_state_hold(pool, page_to_netmem(page),
|
|
pool->pages_state_hold_cnt);
|
|
return page;
|
|
}
|
|
|
|
/* slow path */
|
|
static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
|
|
gfp_t gfp)
|
|
{
|
|
const int bulk = PP_ALLOC_CACHE_REFILL;
|
|
unsigned int pp_order = pool->p.order;
|
|
bool dma_map = pool->dma_map;
|
|
netmem_ref netmem;
|
|
int i, nr_pages;
|
|
|
|
/* Don't support bulk alloc for high-order pages */
|
|
if (unlikely(pp_order))
|
|
return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
|
|
|
|
/* Unnecessary as alloc cache is empty, but guarantees zero count */
|
|
if (unlikely(pool->alloc.count > 0))
|
|
return pool->alloc.cache[--pool->alloc.count];
|
|
|
|
/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
|
|
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
|
|
|
|
nr_pages = alloc_pages_bulk_array_node(gfp,
|
|
pool->p.nid, bulk,
|
|
(struct page **)pool->alloc.cache);
|
|
if (unlikely(!nr_pages))
|
|
return 0;
|
|
|
|
/* Pages have been filled into alloc.cache array, but count is zero and
|
|
* page element have not been (possibly) DMA mapped.
|
|
*/
|
|
for (i = 0; i < nr_pages; i++) {
|
|
netmem = pool->alloc.cache[i];
|
|
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
|
|
put_page(netmem_to_page(netmem));
|
|
continue;
|
|
}
|
|
|
|
page_pool_set_pp_info(pool, netmem);
|
|
pool->alloc.cache[pool->alloc.count++] = netmem;
|
|
/* Track how many pages are held 'in-flight' */
|
|
pool->pages_state_hold_cnt++;
|
|
trace_page_pool_state_hold(pool, netmem,
|
|
pool->pages_state_hold_cnt);
|
|
}
|
|
|
|
/* Return last page */
|
|
if (likely(pool->alloc.count > 0)) {
|
|
netmem = pool->alloc.cache[--pool->alloc.count];
|
|
alloc_stat_inc(pool, slow);
|
|
} else {
|
|
netmem = 0;
|
|
}
|
|
|
|
/* When page just alloc'ed is should/must have refcnt 1. */
|
|
return netmem;
|
|
}
|
|
|
|
/* For using page_pool replace: alloc_pages() API calls, but provide
|
|
* synchronization guarantee for allocation side.
|
|
*/
|
|
netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
|
|
{
|
|
netmem_ref netmem;
|
|
|
|
/* Fast-path: Get a page from cache */
|
|
netmem = __page_pool_get_cached(pool);
|
|
if (netmem)
|
|
return netmem;
|
|
|
|
/* Slow-path: cache empty, do real allocation */
|
|
if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
|
|
netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
|
|
else
|
|
netmem = __page_pool_alloc_pages_slow(pool, gfp);
|
|
return netmem;
|
|
}
|
|
EXPORT_SYMBOL(page_pool_alloc_netmem);
|
|
|
|
/* struct page flavour of page_pool_alloc_netmem(): the result of that
 * call is converted with netmem_to_page().
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	netmem_ref netmem = page_pool_alloc_netmem(pool, gfp);

	return netmem_to_page(netmem);
}
|
|
EXPORT_SYMBOL(page_pool_alloc_pages);
|
|
ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
|
|
|
|
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 *
 * The whole expansion is parenthesised so the result can be used as an
 * operand of any operator (including sizeof and postfix operators)
 * without the cast binding to the wrong sub-expression.
 */
#define _distance(a, b)	((s32)((a) - (b)))
|
|
|
|
/* Number of pages currently held outside the pool, computed as the
 * wrapping distance between the hold and release counters.
 *
 * @strict: when true, emit the release tracepoint, WARN on a negative
 *          result and return it unmodified; when false, clamp negative
 *          results to 0.
 */
s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 released = atomic_read(&pool->pages_state_release_cnt);
	u32 held = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight = _distance(held, released);

	if (!strict)
		return max(0, inflight);

	trace_page_pool_release(pool, inflight, held, released);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages",
	     inflight);

	return inflight;
}
|
|
|
|
/* Mark @netmem as belonging to @pool: store the pool pointer, OR in the
 * PP_SIGNATURE magic, set the fragment count to 1 and run the pool's
 * init callback when one was supplied.
 */
void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
	netmem_set_pp(netmem, pool);
	netmem_or_pp_magic(netmem, PP_SIGNATURE);

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_netmem(netmem, 1);

	if (!pool->has_init_callback)
		return;

	pool->slow.init_callback(netmem, pool->slow.init_arg);
}
|
|
|
|
/* Undo page_pool_set_pp_info(): clear the pp magic, then reset the
 * owning pool pointer of @netmem to NULL.
 */
void page_pool_clear_pp_info(netmem_ref netmem)
{
	netmem_clear_pp_magic(netmem);
	netmem_set_pp(netmem, NULL);
}
|
|
|
|
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
|
|
netmem_ref netmem)
|
|
{
|
|
dma_addr_t dma;
|
|
|
|
if (!pool->dma_map)
|
|
/* Always account for inflight pages, even if we didn't
|
|
* map them
|
|
*/
|
|
return;
|
|
|
|
dma = page_pool_get_dma_addr_netmem(netmem);
|
|
|
|
/* When page is unmapped, it cannot be returned to our pool */
|
|
dma_unmap_page_attrs(pool->p.dev, dma,
|
|
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
|
|
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
|
|
page_pool_set_dma_addr_netmem(netmem, 0);
|
|
}
|
|
|
|
/* Disconnects a page (from a page_pool). API users can have a need
|
|
* to disconnect a page (from a page_pool), to allow it to be used as
|
|
* a regular page (that will eventually be returned to the normal
|
|
* page-allocator via put_page).
|
|
*/
|
|
void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
|
|
{
|
|
int count;
|
|
bool put;
|
|
|
|
put = true;
|
|
if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
|
|
put = mp_dmabuf_devmem_release_page(pool, netmem);
|
|
else
|
|
__page_pool_release_page_dma(pool, netmem);
|
|
|
|
/* This may be the last page returned, releasing the pool, so
|
|
* it is not safe to reference pool afterwards.
|
|
*/
|
|
count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
|
|
trace_page_pool_state_release(pool, netmem, count);
|
|
|
|
if (put) {
|
|
page_pool_clear_pp_info(netmem);
|
|
put_page(netmem_to_page(netmem));
|
|
}
|
|
/* An optimization would be to call __free_pages(page, pool->p.order)
|
|
* knowing page is not part of page-cache (thus avoiding a
|
|
* __page_cache_release() call).
|
|
*/
|
|
}
|
|
|
|
/* Try to queue @netmem on the pool's ptr_ring under the producer lock.
 * The 'ring' recycle counter is bumped while the lock is still held.
 *
 * Return: true when the entry was queued, false when the ring refused it.
 */
static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
	bool bh_ctx, queued;

	/* BH protection not needed if current is softirq */
	bh_ctx = page_pool_producer_lock(pool);

	queued = (__ptr_ring_produce(&pool->ring, (__force void *)netmem) == 0);
	if (queued)
		recycle_stat_inc(pool, ring);

	page_pool_producer_unlock(pool, bh_ctx);

	return queued;
}
|
|
|
|
/* Only allow direct recycling in special circumstances, into the
|
|
* alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
|
|
*
|
|
* Caller must provide appropriate safe context.
|
|
*/
|
|
static bool page_pool_recycle_in_cache(netmem_ref netmem,
|
|
struct page_pool *pool)
|
|
{
|
|
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
|
|
recycle_stat_inc(pool, cache_full);
|
|
return false;
|
|
}
|
|
|
|
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
|
|
pool->alloc.cache[pool->alloc.count++] = netmem;
|
|
recycle_stat_inc(pool, cached);
|
|
return true;
|
|
}
|
|
|
|
/* A net_iov is always recyclable. A real page is recyclable only when
 * its refcount is exactly 1 and it is not a pfmemalloc page.
 */
static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
	if (netmem_is_net_iov(netmem))
		return true;

	if (page_ref_count(netmem_to_page(netmem)) != 1)
		return false;

	return !page_is_pfmemalloc(netmem_to_page(netmem));
}
|
|
|
|
/* If the page refcnt == 1, this will try to recycle the page.
|
|
* If pool->dma_sync is set, we'll try to sync the DMA area for
|
|
* the configured size min(dma_sync_size, pool->max_len).
|
|
* If the page refcnt != 1, then the page will be returned to memory
|
|
* subsystem.
|
|
*/
|
|
static __always_inline netmem_ref
|
|
__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
|
|
unsigned int dma_sync_size, bool allow_direct)
|
|
{
|
|
lockdep_assert_no_hardirq();
|
|
|
|
/* This allocator is optimized for the XDP mode that uses
|
|
* one-frame-per-page, but have fallbacks that act like the
|
|
* regular page allocator APIs.
|
|
*
|
|
* refcnt == 1 means page_pool owns page, and can recycle it.
|
|
*
|
|
* page is NOT reusable when allocated when system is under
|
|
* some pressure. (page_is_pfmemalloc)
|
|
*/
|
|
if (likely(__page_pool_page_can_be_recycled(netmem))) {
|
|
/* Read barrier done in page_ref_count / READ_ONCE */
|
|
|
|
page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
|
|
|
|
if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
|
|
return 0;
|
|
|
|
/* Page found as candidate for recycling */
|
|
return netmem;
|
|
}
|
|
|
|
/* Fallback/non-XDP mode: API user have elevated refcnt.
|
|
*
|
|
* Many drivers split up the page into fragments, and some
|
|
* want to keep doing this to save memory and do refcnt based
|
|
* recycling. Support this use case too, to ease drivers
|
|
* switching between XDP/non-XDP.
|
|
*
|
|
* In-case page_pool maintains the DMA mapping, API user must
|
|
* call page_pool_put_page once. In this elevated refcnt
|
|
* case, the DMA is unmapped/released, as driver is likely
|
|
* doing refcnt based recycle tricks, meaning another process
|
|
* will be invoking put_page.
|
|
*/
|
|
recycle_stat_inc(pool, released_refcnt);
|
|
page_pool_return_page(pool, netmem);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static bool page_pool_napi_local(const struct page_pool *pool)
|
|
{
|
|
const struct napi_struct *napi;
|
|
u32 cpuid;
|
|
|
|
/* On PREEMPT_RT the softirq can be preempted by the consumer */
|
|
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
return false;
|
|
|
|
if (unlikely(!in_softirq()))
|
|
return false;
|
|
|
|
/* Allow direct recycle if we have reasons to believe that we are
|
|
* in the same context as the consumer would run, so there's
|
|
* no possible race.
|
|
* __page_pool_put_page() makes sure we're not in hardirq context
|
|
* and interrupts are enabled prior to accessing the cache.
|
|
*/
|
|
cpuid = smp_processor_id();
|
|
if (READ_ONCE(pool->cpuid) == cpuid)
|
|
return true;
|
|
|
|
napi = READ_ONCE(pool->p.napi);
|
|
|
|
return napi && READ_ONCE(napi->list_owner) == cpuid;
|
|
}
|
|
|
|
/* Give @netmem back to @pool.
 *
 * Direct (alloc cache) recycling is used when the caller allows it or
 * when page_pool_napi_local() deems the context safe. Otherwise the
 * entry goes to the ptr_ring, and is returned when the ring refuses it.
 */
void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
				  unsigned int dma_sync_size, bool allow_direct)
{
	allow_direct = allow_direct || page_pool_napi_local(pool);

	netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
				      allow_direct);
	if (!netmem)
		return;

	if (page_pool_recycle_in_ring(pool, netmem))
		return;

	/* Cache full, fallback to free pages */
	recycle_stat_inc(pool, ring_full);
	page_pool_return_page(pool, netmem);
}
|
|
EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
|
|
|
|
/* struct page flavour of page_pool_put_unrefed_netmem(): @page is
 * converted with page_to_netmem() and all other arguments are passed on.
 */
void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	netmem_ref netmem = page_to_netmem(page);

	page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct);
}
|
|
EXPORT_SYMBOL(page_pool_put_unrefed_page);
|
|
|
|
/**
|
|
* page_pool_put_page_bulk() - release references on multiple pages
|
|
* @pool: pool from which pages were allocated
|
|
* @data: array holding page pointers
|
|
* @count: number of pages in @data
|
|
*
|
|
* Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
|
|
* producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
|
|
* will release leftover pages to the page allocator.
|
|
* page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
|
|
* completion loop for the XDP_REDIRECT use case.
|
|
*
|
|
* Please note the caller must not use data area after running
|
|
* page_pool_put_page_bulk(), as this function overwrites it.
|
|
*/
|
|
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
|
|
int count)
|
|
{
|
|
int i, bulk_len = 0;
|
|
bool allow_direct;
|
|
bool in_softirq;
|
|
|
|
allow_direct = page_pool_napi_local(pool);
|
|
|
|
for (i = 0; i < count; i++) {
|
|
netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
|
|
|
|
/* It is not the last user for the page frag case */
|
|
if (!page_pool_is_last_ref(netmem))
|
|
continue;
|
|
|
|
netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
|
|
/* Approved for bulk recycling in ptr_ring cache */
|
|
if (netmem)
|
|
data[bulk_len++] = (__force void *)netmem;
|
|
}
|
|
|
|
if (!bulk_len)
|
|
return;
|
|
|
|
/* Bulk producer into ptr_ring page_pool cache */
|
|
in_softirq = page_pool_producer_lock(pool);
|
|
for (i = 0; i < bulk_len; i++) {
|
|
if (__ptr_ring_produce(&pool->ring, data[i])) {
|
|
/* ring full */
|
|
recycle_stat_inc(pool, ring_full);
|
|
break;
|
|
}
|
|
}
|
|
recycle_stat_add(pool, ring, i);
|
|
page_pool_producer_unlock(pool, in_softirq);
|
|
|
|
/* Hopefully all pages was return into ptr_ring */
|
|
if (likely(i == bulk_len))
|
|
return;
|
|
|
|
/* ptr_ring cache full, free remaining pages outside producer lock
|
|
* since put_page() with refcnt == 1 can be an expensive operation
|
|
*/
|
|
for (; i < bulk_len; i++)
|
|
page_pool_return_page(pool, (__force netmem_ref)data[i]);
|
|
}
|
|
EXPORT_SYMBOL(page_pool_put_page_bulk);
|
|
|
|
/* Drop the unused part of the fragment bias (BIAS_MAX minus the number
 * of fragments handed out) from @netmem.
 *
 * Return: @netmem when no fragment user remains and it can be recycled
 * (after a DMA sync for the device); 0 when users remain, or when the
 * page was returned because it cannot be recycled.
 */
static netmem_ref page_pool_drain_frag(struct page_pool *pool,
				       netmem_ref netmem)
{
	long unused_bias = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_netmem(netmem, unused_bias)))
		return 0;

	if (!__page_pool_page_can_be_recycled(netmem)) {
		page_pool_return_page(pool, netmem);
		return 0;
	}

	page_pool_dma_sync_for_device(pool, netmem, -1);
	return netmem;
}
|
|
|
|
static void page_pool_free_frag(struct page_pool *pool)
|
|
{
|
|
long drain_count = BIAS_MAX - pool->frag_users;
|
|
netmem_ref netmem = pool->frag_page;
|
|
|
|
pool->frag_page = 0;
|
|
|
|
if (!netmem || page_pool_unref_netmem(netmem, drain_count))
|
|
return;
|
|
|
|
page_pool_return_page(pool, netmem);
|
|
}
|
|
|
|
netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
|
|
unsigned int *offset, unsigned int size,
|
|
gfp_t gfp)
|
|
{
|
|
unsigned int max_size = PAGE_SIZE << pool->p.order;
|
|
netmem_ref netmem = pool->frag_page;
|
|
|
|
if (WARN_ON(size > max_size))
|
|
return 0;
|
|
|
|
size = ALIGN(size, dma_get_cache_alignment());
|
|
*offset = pool->frag_offset;
|
|
|
|
if (netmem && *offset + size > max_size) {
|
|
netmem = page_pool_drain_frag(pool, netmem);
|
|
if (netmem) {
|
|
alloc_stat_inc(pool, fast);
|
|
goto frag_reset;
|
|
}
|
|
}
|
|
|
|
if (!netmem) {
|
|
netmem = page_pool_alloc_netmem(pool, gfp);
|
|
if (unlikely(!netmem)) {
|
|
pool->frag_page = 0;
|
|
return 0;
|
|
}
|
|
|
|
pool->frag_page = netmem;
|
|
|
|
frag_reset:
|
|
pool->frag_users = 1;
|
|
*offset = 0;
|
|
pool->frag_offset = size;
|
|
page_pool_fragment_netmem(netmem, BIAS_MAX);
|
|
return netmem;
|
|
}
|
|
|
|
pool->frag_users++;
|
|
pool->frag_offset = *offset + size;
|
|
alloc_stat_inc(pool, fast);
|
|
return netmem;
|
|
}
|
|
EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
|
|
|
|
/* struct page flavour of page_pool_alloc_frag_netmem(): the result of
 * that call is converted with netmem_to_page().
 */
struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	netmem_ref netmem;

	netmem = page_pool_alloc_frag_netmem(pool, offset, size, gfp);

	return netmem_to_page(netmem);
}
|
|
EXPORT_SYMBOL(page_pool_alloc_frag);
|
|
|
|
static void page_pool_empty_ring(struct page_pool *pool)
|
|
{
|
|
netmem_ref netmem;
|
|
|
|
/* Empty recycle ring */
|
|
while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
|
|
/* Verify the refcnt invariant of cached pages */
|
|
if (!(netmem_ref_count(netmem) == 1))
|
|
pr_crit("%s() page_pool refcnt %d violation\n",
|
|
__func__, netmem_ref_count(netmem));
|
|
|
|
page_pool_return_page(pool, netmem);
|
|
}
|
|
}
|
|
|
|
static void __page_pool_destroy(struct page_pool *pool)
|
|
{
|
|
if (pool->disconnect)
|
|
pool->disconnect(pool);
|
|
|
|
page_pool_unlist(pool);
|
|
page_pool_uninit(pool);
|
|
|
|
if (pool->mp_priv) {
|
|
mp_dmabuf_devmem_destroy(pool);
|
|
static_branch_dec(&page_pool_mem_providers);
|
|
}
|
|
|
|
kfree(pool);
|
|
}
|
|
|
|
static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
|
|
{
|
|
netmem_ref netmem;
|
|
|
|
if (pool->destroy_cnt)
|
|
return;
|
|
|
|
/* Empty alloc cache, assume caller made sure this is
|
|
* no-longer in use, and page_pool_alloc_pages() cannot be
|
|
* call concurrently.
|
|
*/
|
|
while (pool->alloc.count) {
|
|
netmem = pool->alloc.cache[--pool->alloc.count];
|
|
page_pool_return_page(pool, netmem);
|
|
}
|
|
}
|
|
|
|
static void page_pool_scrub(struct page_pool *pool)
|
|
{
|
|
page_pool_empty_alloc_cache_once(pool);
|
|
pool->destroy_cnt++;
|
|
|
|
/* No more consumers should exist, but producers could still
|
|
* be in-flight.
|
|
*/
|
|
page_pool_empty_ring(pool);
|
|
}
|
|
|
|
static int page_pool_release(struct page_pool *pool)
|
|
{
|
|
bool in_softirq;
|
|
int inflight;
|
|
|
|
page_pool_scrub(pool);
|
|
inflight = page_pool_inflight(pool, true);
|
|
/* Acquire producer lock to make sure producers have exited. */
|
|
in_softirq = page_pool_producer_lock(pool);
|
|
page_pool_producer_unlock(pool, in_softirq);
|
|
if (!inflight)
|
|
__page_pool_destroy(pool);
|
|
|
|
return inflight;
|
|
}
|
|
|
|
static void page_pool_release_retry(struct work_struct *wq)
|
|
{
|
|
struct delayed_work *dwq = to_delayed_work(wq);
|
|
struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
|
|
void *netdev;
|
|
int inflight;
|
|
|
|
inflight = page_pool_release(pool);
|
|
/* In rare cases, a driver bug may cause inflight to go negative.
|
|
* Don't reschedule release if inflight is 0 or negative.
|
|
* - If 0, the page_pool has been destroyed
|
|
* - if negative, we will never recover
|
|
* in both cases no reschedule is necessary.
|
|
*/
|
|
if (inflight <= 0)
|
|
return;
|
|
|
|
/* Periodic warning for page pools the user can't see */
|
|
netdev = READ_ONCE(pool->slow.netdev);
|
|
if (time_after_eq(jiffies, pool->defer_warn) &&
|
|
(!netdev || netdev == NET_PTR_POISON)) {
|
|
int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
|
|
|
|
pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
|
|
__func__, pool->user.id, inflight, sec);
|
|
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
|
|
}
|
|
|
|
/* Still not ready to be disconnected, retry later */
|
|
schedule_delayed_work(&pool->release_dw, DEFER_TIME);
|
|
}
|
|
|
|
/* Register @disconnect and the id of @mem with @pool.
 *
 * @pool:       pool to update
 * @disconnect: callback stored in pool->disconnect; __page_pool_destroy()
 *              invokes it (when non-NULL) with the pool as argument
 * @mem:        XDP memory info whose id is copied into pool->xdp_mem_id
 *
 * Also takes one extra count on pool->user_cnt.
 * NOTE(review): presumably that count is dropped again through
 * page_pool_put() in page_pool_destroy() -- confirm against the callers.
 */
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
|
|
const struct xdp_mem_info *mem)
|
|
{
|
|
refcount_inc(&pool->user_cnt);
|
|
pool->disconnect = disconnect;
|
|
pool->xdp_mem_id = mem->id;
|
|
}
|
|
|
|
/* Stop direct recycling for @pool.
 *
 * Sets pool->cpuid to -1 and, when a NAPI instance is linked to the pool,
 * clears pool->p.napi after sanity-checking that instance's state.
 */
void page_pool_disable_direct_recycling(struct page_pool *pool)
|
|
{
|
|
/* Disable direct recycling based on pool->cpuid.
 * Paired with READ_ONCE() in page_pool_napi_local().
 */
|
|
WRITE_ONCE(pool->cpuid, -1);
|
|
|
|
/* No NAPI instance linked: nothing further to unlink. */
if (!pool->p.napi)
|
|
return;
|
|
|
|
/* To avoid races with recycling and additional barriers make sure
 * pool and NAPI are unlinked when NAPI is disabled.
 */
|
|
/* Warn if NAPI_STATE_SCHED is not set on the linked NAPI instance. */
WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
|
|
/* Warn if the NAPI instance's list_owner is anything other than -1. */
WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
|
|
|
|
WRITE_ONCE(pool->p.napi, NULL);
|
|
}
|
|
EXPORT_SYMBOL(page_pool_disable_direct_recycling);
|
|
|
|
void page_pool_destroy(struct page_pool *pool)
|
|
{
|
|
if (!pool)
|
|
return;
|
|
|
|
if (!page_pool_put(pool))
|
|
return;
|
|
|
|
page_pool_disable_direct_recycling(pool);
|
|
page_pool_free_frag(pool);
|
|
|
|
if (!page_pool_release(pool))
|
|
return;
|
|
|
|
page_pool_detached(pool);
|
|
pool->defer_start = jiffies;
|
|
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
|
|
|
|
INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
|
|
schedule_delayed_work(&pool->release_dw, DEFER_TIME);
|
|
}
|
|
EXPORT_SYMBOL(page_pool_destroy);
|
|
|
|
/* Caller must provide appropriate safe context, e.g. NAPI. */
|
|
void page_pool_update_nid(struct page_pool *pool, int new_nid)
|
|
{
|
|
netmem_ref netmem;
|
|
|
|
trace_page_pool_update_nid(pool, new_nid);
|
|
pool->p.nid = new_nid;
|
|
|
|
/* Flush pool alloc cache, as refill will check NUMA node */
|
|
while (pool->alloc.count) {
|
|
netmem = pool->alloc.cache[--pool->alloc.count];
|
|
page_pool_return_page(pool, netmem);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(page_pool_update_nid);
|