142 files changed, 2912 insertions, 1157 deletions
@@ -549,6 +549,8 @@ Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org> Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org> Nicolas Saenz Julienne <nsaenz@kernel.org> <nsaenzjulienne@suse.de> Nicolas Saenz Julienne <nsaenz@kernel.org> <nsaenzjulienne@suse.com> +Nicolas Schier <nicolas.schier@linux.dev> <n.schier@avm.de> +Nicolas Schier <nicolas.schier@linux.dev> <nicolas@fjasle.eu> Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se> Nikolay Aleksandrov <razor@blackwall.org> <naleksan@redhat.com> Nikolay Aleksandrov <razor@blackwall.org> <nikolay@redhat.com> diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 402af4b2b905..a02707cb7cbc 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -177,6 +177,12 @@ Description: The cache write policy: 0 for write-back, 1 for write-through, other or unknown. +What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/address_mode +Date: March 2025 +Contact: Dave Jiang <dave.jiang@intel.com> +Description: + The address mode: 0 for reserved, 1 for extended-linear. + What: /sys/devices/system/node/nodeX/x86/sgx_total_bytes Date: November 2021 Contact: Jarkko Sakkinen <jarkko@kernel.org> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 3f5627a1210a..99bb3faf7a0e 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -1,5 +1,5 @@ What: /sys/bus/cxl/flush -Date: Januarry, 2022 +Date: January, 2022 KernelVersion: v5.18 Contact: linux-cxl@vger.kernel.org Description: @@ -18,6 +18,24 @@ Description: specification. +What: /sys/bus/cxl/devices/memX/payload_max +Date: December, 2020 +KernelVersion: v5.12 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) Maximum size (in bytes) of the mailbox command payload + registers. Linux caps this at 1MB if the device reports a + larger size. + + +What: /sys/bus/cxl/devices/memX/label_storage_size +Date: May, 2021 +KernelVersion: v5.13 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) Size (in bytes) of the Label Storage Area (LSA). + + What: /sys/bus/cxl/devices/memX/ram/size Date: December, 2020 KernelVersion: v5.12 @@ -33,7 +51,7 @@ Date: May, 2023 KernelVersion: v6.8 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" + (RO) For CXL host platforms that support "QoS Telemetry" this attribute conveys a comma delimited list of platform specific cookies that identifies a QoS performance class for the volatile partition of the CXL mem device. These @@ -60,7 +78,7 @@ Date: May, 2023 KernelVersion: v6.8 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" + (RO) For CXL host platforms that support "QoS Telemetry" this attribute conveys a comma delimited list of platform specific cookies that identifies a QoS performance class for the persistent partition of the CXL mem device. These @@ -321,14 +339,13 @@ KernelVersion: v6.0 Contact: linux-cxl@vger.kernel.org Description: (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it - translates from a host physical address range, to a device local - address range. Device-local address ranges are further split - into a 'ram' (volatile memory) range and 'pmem' (persistent - memory) range. The 'mode' attribute emits one of 'ram', 'pmem', - 'mixed', or 'none'. 
The 'mixed' indication is for error cases - when a decoder straddles the volatile/persistent partition - boundary, and 'none' indicates the decoder is not actively - decoding, or no DPA allocation policy has been set. + translates from a host physical address range, to a device + local address range. Device-local address ranges are further + split into a 'ram' (volatile memory) range and 'pmem' + (persistent memory) range. The 'mode' attribute emits one of + 'ram', 'pmem', or 'none'. The 'none' indicates the decoder is + not actively decoding, or no DPA allocation policy has been + set. 'mode' can be written, when the decoder is in the 'disabled' state, with either 'ram' or 'pmem' to set the boundaries for the @@ -423,7 +440,7 @@ Date: May, 2023 KernelVersion: v6.5 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" this + (RO) For CXL host platforms that support "QoS Telemetry" this root-decoder-only attribute conveys a platform specific cookie that identifies a QoS performance class for the CXL Window. This class-id can be compared against a similar "qos_class" @@ -586,3 +603,15 @@ Description: See Documentation/ABI/stable/sysfs-devices-node. access0 provides the number to the closest initiator and access1 provides the number to the closest CPU. + + +What: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown +Date: Feb, 2025 +KernelVersion: v6.15 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The device dirty shutdown count value, which is the number + of times the device could have incurred in potential data loss. + The count is persistent across power loss and wraps back to 0 + upon overflow. If this file is not present, the device does not + have the necessary support for dirty tracking. diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index 9f8139ff97d6..4467f6d4b632 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -146,6 +146,11 @@ integrity:<bytes>:<type> integrity for the encrypted device. The additional space is then used for storing authentication tag (and persistent IV if needed). +integrity_key_size:<bytes> + Optionally set the integrity key size if it differs from the digest size. + It allows the use of wrapped key algorithms where the key size is + independent of the cryptographic key size. + sector_size:<bytes> Use <bytes> as the encryption unit instead of 512 bytes sectors. This option can be in range 512 - 4096 bytes and must be power of two. diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index d8a5f14d0e3c..c2e18ecc065c 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -92,6 +92,11 @@ Target arguments: allowed. This mode is useful for data recovery if the device cannot be activated in any of the other standard modes. + I - inline mode - in this mode, dm-integrity will store integrity + data directly in the underlying device sectors. + The underlying device must have an integrity profile that + allows storing user integrity data and provides enough + space for the selected integrity tag. 5. 
the number of additional arguments diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index a65c1602cb23..8c3f1f967a3c 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -87,6 +87,15 @@ panic_on_corruption Panic the device when a corrupted block is discovered. This option is not compatible with ignore_corruption and restart_on_corruption. +restart_on_error + Restart the system when an I/O error is detected. + This option can be combined with the restart_on_corruption option. + +panic_on_error + Panic the device when an I/O error is detected. This option is + not compatible with the restart_on_error option but can be combined + with the panic_on_corruption option. + ignore_zero_blocks Do not verify blocks that are expected to contain zeroes and always return zeroes instead. This may be useful if the partition contains unused blocks @@ -142,8 +151,15 @@ root_hash_sig_key_desc <key_description> already in the secondary trusted keyring. try_verify_in_tasklet - If verity hashes are in cache, verify data blocks in kernel tasklet instead - of workqueue. This option can reduce IO latency. + If verity hashes are in cache and the IO size does not exceed the limit, + verify data blocks in bottom half instead of workqueue. This option can + reduce IO latency. The size limits can be configured via + /sys/module/dm_verity/parameters/use_bh_bytes. The four parameters + correspond to limits for IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE in turn. + For example: + <none>,<rt>,<be>,<idle> + 4096,4096,4096,4096 Theory of operation =================== diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst index df8e2ac2a320..a2288f9df658 100644 --- a/Documentation/driver-api/cxl/maturity-map.rst +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -130,7 +130,7 @@ Mailbox commands * [0] Switch CCI * [3] Timestamp * [1] PMEM labels -* [0] PMEM GPF / Dirty Shutdown +* [3] PMEM GPF / Dirty Shutdown * [0] Scan Media PMU diff --git a/Documentation/features/core/mseal_sys_mappings/arch-support.txt b/Documentation/features/core/mseal_sys_mappings/arch-support.txt new file mode 100644 index 000000000000..c6cab9760d57 --- /dev/null +++ b/Documentation/features/core/mseal_sys_mappings/arch-support.txt @@ -0,0 +1,30 @@ +# +# Feature name: mseal-system-mappings +# Kconfig: ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS +# description: arch supports mseal system mappings +# + ----------------------- + | arch |status| + ----------------------- + | alpha: | TODO | + | arc: | N/A | + | arm: | N/A | + | arm64: | ok | + | csky: | N/A | + | hexagon: | N/A | + | loongarch: | TODO | + | m68k: | N/A | + | microblaze: | N/A | + | mips: | TODO | + | nios2: | N/A | + | openrisc: | N/A | + | parisc: | TODO | + | powerpc: | TODO | + | riscv: | TODO | + | s390: | ok | + | sh: | N/A | + | sparc: | TODO | + | um: | TODO | + | x86: | ok | + | xtensa: | N/A | + ----------------------- diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst index 41102f74c5e2..1dabfc29be0d 100644 --- a/Documentation/userspace-api/mseal.rst +++ b/Documentation/userspace-api/mseal.rst @@ -130,6 +130,27 @@ Use cases - Chrome browser: protect some security sensitive data structures. 
+- System mappings: + The system mappings are created by the kernel and includes vdso, vvar, + vvar_vclock, vectors (arm compat-mode), sigpage (arm compat-mode), uprobes. + + Those system mappings are readonly only or execute only, memory sealing can + protect them from ever changing to writable or unmmap/remapped as different + attributes. This is useful to mitigate memory corruption issues where a + corrupted pointer is passed to a memory management system. + + If supported by an architecture (CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS), + the CONFIG_MSEAL_SYSTEM_MAPPINGS seals all system mappings of this + architecture. + + The following architectures currently support this feature: x86-64, arm64, + and s390. + + WARNING: This feature breaks programs which rely on relocating + or unmapping system mappings. Known broken software at the time + of writing includes CHECKPOINT_RESTORE, UML, gVisor, rr. Therefore + this config can't be enabled universally. + When not to use mseal ===================== Applications can apply sealing to any virtual memory region from userspace, diff --git a/MAINTAINERS b/MAINTAINERS index d32ce85c5c66..a7a1d121a83e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15487,6 +15487,45 @@ F: tools/mm/ F: tools/testing/selftests/mm/ N: include/linux/page[-_]* +MEMORY MANAGEMENT - EXECMEM +M: Andrew Morton <akpm@linux-foundation.org> +M: Mike Rapoport <rppt@kernel.org> +L: linux-mm@kvack.org +S: Maintained +F: include/linux/execmem.h +F: mm/execmem.c + +MEMORY MANAGEMENT - NUMA MEMBLOCKS AND NUMA EMULATION +M: Andrew Morton <akpm@linux-foundation.org> +M: Mike Rapoport <rppt@kernel.org> +L: linux-mm@kvack.org +S: Maintained +F: include/linux/numa_memblks.h +F: mm/numa.c +F: mm/numa_emulation.c +F: mm/numa_memblks.c + +MEMORY MANAGEMENT - SECRETMEM +M: Andrew Morton <akpm@linux-foundation.org> +M: Mike Rapoport <rppt@kernel.org> +L: linux-mm@kvack.org +S: Maintained +F: include/linux/secretmem.h +F: mm/secretmem.c + +MEMORY MANAGEMENT - USERFAULTFD +M: Andrew Morton <akpm@linux-foundation.org> +R: Peter Xu <peterx@redhat.com> +L: linux-mm@kvack.org +S: Maintained +F: Documentation/admin-guide/mm/userfaultfd.rst +F: fs/userfaultfd.c +F: include/asm-generic/pgtable_uffd.h +F: include/linux/userfaultfd_k.h +F: include/uapi/linux/userfaultfd.h +F: mm/userfaultfd.c +F: tools/testing/selftests/mm/uffd-*.[ch] + MEMORY MAPPING M: Andrew Morton <akpm@linux-foundation.org> M: Liam R. 
Howlett <Liam.Howlett@oracle.com> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 748c34dc953c..a182295e6f08 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -38,6 +38,7 @@ config ARM64 select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 887ac0b05961..78ddf6bdecad 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -130,7 +130,8 @@ static int __setup_additional_pages(enum vdso_abi abi, mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -256,7 +257,8 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC, + VM_MAYREAD | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); @@ -279,7 +281,8 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYWRITE | VM_MAYEXEC, + VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index bf8400c28b5a..11055c574968 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -61,11 +61,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return ret; } -#define __pte_free_tlb(tlb, pte, address) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ -} while (0) +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) extern void pagetable_init(void); extern void mmu_init(unsigned long min_pfn, unsigned long max_pfn); diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 1ee5f5f157ca..937a11ef4c33 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -87,10 +87,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, max_kernel_seg = pmdindex; } -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pagetable_dtor((page_ptdesc(pte))); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ -} while (0) +#define __pte_free_tlb(tlb, pte, addr) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #endif diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 7211dff8c969..b58f587f0f0a 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -55,11 +55,8 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) return pte; } -#define __pte_free_tlb(tlb, pte, address) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ -} while (0) +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #ifndef __PAGETABLE_PMD_FOLDED diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 80afc3a18724..1e21c758b774 100644 --- 
a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -17,11 +17,8 @@ extern const char bad_pmd_string[]; -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ -} while (0) +#define __pte_free_tlb(tlb, pte, addr) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 65f0d1fb8a2a..31d475cdb1c5 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -118,7 +118,7 @@ int page_is_ram(unsigned long pfn) /* * Check for command-line options that affect what MMU_init will do. */ -static void mm_cmdline_setup(void) +static void __init mm_cmdline_setup(void) { unsigned long maxmem = 0; char *p = cmd_line; diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 26c7a6ede983..bbca420c96d3 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -48,11 +48,8 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern void pgd_init(void *addr); extern pgd_t *pgd_alloc(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, address) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ -} while (0) +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #ifndef __PAGETABLE_PMD_FOLDED diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 12a536b7bfbd..db122b093a8b 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -28,10 +28,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, extern pgd_t *pgd_alloc(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ - } while (0) +#define __pte_free_tlb(tlb, pte, addr) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #endif /* _ASM_NIOS2_PGALLOC_H */ diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 3372f4e6ab4b..3f110931d8f6 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -64,10 +64,7 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ -} while (0) +#define __pte_free_tlb(tlb, pte, addr) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #endif diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 3e2aebea6312..770ce18a7328 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -15,24 +15,6 @@ #define __HAVE_ARCH_PUD_FREE #include <asm-generic/pgalloc.h> -/* - * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to - * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use - * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this - * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the - * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. 
- */ -static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) -{ - if (riscv_use_sbi_for_rfence()) { - tlb_remove_ptdesc(tlb, pt); - } else { - pagetable_dtor(pt); - tlb_remove_page_ptdesc(tlb, pt); - } -} - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -108,14 +90,14 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) { if (pgtable_l4_enabled) - riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr) { if (pgtable_l5_enabled) - riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -143,7 +125,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { - riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -151,7 +133,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte)); + tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #endif /* CONFIG_MMU */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c809c486d136..b8fa367c1fc9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -137,6 +137,7 @@ config S390 select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 70c8f9ad13cd..430feb1a5013 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -80,7 +80,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|VM_SEALED_SYSMAP| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_mapping); if (IS_ERR(vma)) { diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 96d938fdf224..6fe7123d38fa 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -32,10 +32,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ -} while (0) +#define __pte_free_tlb(tlb, pte, addr) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #endif /* __ASM_SH_PGALLOC_H */ diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index f0af23c3aeb2..826ec44b58cd 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,27 +25,18 @@ */ extern pgd_t *pgd_alloc(struct mm_struct *); -#define __pte_free_tlb(tlb, pte, address) \ -do { \ - pagetable_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ -} while (0) +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) #if CONFIG_PGTABLE_LEVELS > 2 
-#define __pmd_free_tlb(tlb, pmd, address) \ -do { \ - pagetable_dtor(virt_to_ptdesc(pmd)); \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ -} while (0) +#define __pmd_free_tlb(tlb, pmd, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pmd)) #if CONFIG_PGTABLE_LEVELS > 3 -#define __pud_free_tlb(tlb, pud, address) \ -do { \ - pagetable_dtor(virt_to_ptdesc(pud)); \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ -} while (0) +#define __pud_free_tlb(tlb, pud, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pud)) #endif #endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2caf4c0f789e..85ba2e187571 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86_64 # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_PTDUMP + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 9518bf1ddf35..adb299d3b6a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -162,7 +162,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) text_start, image->size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, &vdso_mapping); if (IS_ERR(vma)) { @@ -181,7 +182,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) VDSO_VCLOCK_PAGES_START(addr), VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, + VM_PFNMAP|VM_SEALED_SYSMAP, &vvar_vclock_mapping); if (IS_ERR(vma)) { diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 72405d315b41..def3d9284254 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2274,6 +2274,7 @@ int set_mce_nospec(unsigned long pfn) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. 
*/ int clear_mce_nospec(unsigned long pfn) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index cec321fb74f2..a05fcddfc811 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -20,7 +20,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -34,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_ptdesc(pud)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b72772494655..289e365f84b2 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -674,6 +674,105 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } +/* Room for 8 entries */ +#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8 +static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data, + CXL_CPER_PROT_ERR_FIFO_DEPTH); + +/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */ +static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock); +struct work_struct *cxl_cper_prot_err_work; + +static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ +#ifdef CONFIG_ACPI_APEI_PCIEAER + struct cxl_cper_prot_err_work_data wd; + u8 *dvsec_start, *cap_start; + + if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { + pr_err_ratelimited("CXL CPER invalid agent type\n"); + return; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { + pr_err_ratelimited("CXL CPER invalid protocol error log\n"); + return; + } + + if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { + pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", + prot_err->err_len); + return; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) + pr_warn(FW_WARN "CXL CPER no device serial number\n"); + + guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock); + + if (!cxl_cper_prot_err_work) + return; + + switch (prot_err->agent_type) { + case RCD: + case DEVICE: + case LD: + case FMLD: + case RP: + case DSP: + case USP: + memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err)); + + dvsec_start = (u8 *)(prot_err + 1); + cap_start = dvsec_start + prot_err->dvsec_len; + + memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap)); + wd.severity = cper_severity_to_aer(severity); + break; + default: + pr_err_ratelimited("CXL CPER invalid agent type: %d\n", + prot_err->agent_type); + return; + } + + if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) { + pr_err_ratelimited("CXL CPER kfifo overflow\n"); + return; + } + + schedule_work(cxl_cper_prot_err_work); +#endif +} + +int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work) + return -EINVAL; + + 
guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = work; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL"); + +int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work != work) + return -EINVAL; + + guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = NULL; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL"); + +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return kfifo_get(&cxl_cper_prot_err_fifo, wd); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL"); + /* Room for 8 entries for each of the 4 event log queues */ #define CXL_CPER_FIFO_DEPTH 32 DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH); @@ -777,6 +876,10 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev, sync); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { + struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); + + cxl_cper_post_prot_err(prot_err, gdata->error_severity); } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index a5d47819b3a4..ae035b93da08 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -485,7 +485,7 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, cmd_mask = nd_desc->cmd_mask; if (cmd == ND_CMD_CALL && call_pkg->nd_family) { family = call_pkg->nd_family; - if (family > NVDIMM_BUS_FAMILY_MAX || + if (call_pkg->nd_family > NVDIMM_BUS_FAMILY_MAX || !test_bit(family, &nd_desc->bus_family_mask)) return -EINVAL; family = array_index_nospec(family, diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index bfbb08b1e6af..9d9052258e92 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -108,6 +108,45 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm) return NULL; } +/** + * hmat_get_extended_linear_cache_size - Retrieve the extended linear cache size + * @backing_res: resource from the backing media + * @nid: node id for the memory region + * @cache_size: (Output) size of extended linear cache. + * + * Return: 0 on success. Errno on failure. 
+ * + */ +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *cache_size) +{ + unsigned int pxm = node_to_pxm(nid); + struct memory_target *target; + struct target_cache *tcache; + struct resource *res; + + target = find_mem_target(pxm); + if (!target) + return -ENOENT; + + list_for_each_entry(tcache, &target->caches, node) { + if (tcache->cache_attrs.address_mode != + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR) + continue; + + res = &target->memregions; + if (!resource_contains(res, backing_res)) + continue; + + *cache_size = tcache->cache_attrs.size; + return 0; + } + + *cache_size = 0; + return 0; +} +EXPORT_SYMBOL_NS_GPL(hmat_get_extended_linear_cache_size, "CXL"); + static struct memory_target *acpi_find_genport_target(u32 uid) { struct memory_target *target; @@ -506,6 +545,11 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header, switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) { case ACPI_HMAT_CA_DIRECT_MAPPED: tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP; + /* Extended Linear mode is only valid if cache is direct mapped */ + if (cache->address_mode == ACPI_HMAT_CACHE_MODE_EXTENDED_LINEAR) { + tcache->cache_attrs.address_mode = + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR; + } break; case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING: tcache->cache_attrs.indexing = NODE_CACHE_INDEXED; diff --git a/drivers/base/node.c b/drivers/base/node.c index 0ea653fa3433..cd13ef287011 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -244,12 +244,14 @@ CACHE_ATTR(size, "%llu") CACHE_ATTR(line_size, "%u") CACHE_ATTR(indexing, "%u") CACHE_ATTR(write_policy, "%u") +CACHE_ATTR(address_mode, "%#x") static struct attribute *cache_attrs[] = { &dev_attr_indexing.attr, &dev_attr_size.attr, &dev_attr_line_size.attr, &dev_attr_write_policy.attr, + &dev_attr_address_mode.attr, NULL, }; ATTRIBUTE_GROUPS(cache); diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 8ac1e9d70eeb..cf1ba673b8c2 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -158,4 +158,8 @@ config CXL_REGION_INVALIDATION_TEST If unsure, or if this kernel is meant for production environments, say N. +config CXL_MCE + def_bool y + depends on X86_MCE && MEMORY_FAILURE + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index b0bfbd9eac9b..086df97a0fcf 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,6 +14,9 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o +cxl_core-y += ras.o +cxl_core-y += acpi.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o +cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o diff --git a/drivers/cxl/core/acpi.c b/drivers/cxl/core/acpi.c new file mode 100644 index 000000000000..f13b4dae6ac5 --- /dev/null +++ b/drivers/cxl/core/acpi.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation. All rights reserved. 
*/ +#include <linux/acpi.h> +#include "cxl.h" +#include "core.h" + +int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return hmat_get_extended_linear_cache_size(backing_res, nid, size); +} diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 8153f8d83a16..edb4f41eeacc 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -258,27 +258,29 @@ static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, struct xarray *dsmas_xa) { - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); struct device *dev = cxlds->dev; - struct range pmem_range = { - .start = cxlds->pmem_res.start, - .end = cxlds->pmem_res.end, - }; - struct range ram_range = { - .start = cxlds->ram_res.start, - .end = cxlds->ram_res.end, - }; struct dsmas_entry *dent; unsigned long index; xa_for_each(dsmas_xa, index, dent) { - if (resource_size(&cxlds->ram_res) && - range_contains(&ram_range, &dent->dpa_range)) - update_perf_entry(dev, dent, &mds->ram_perf); - else if (resource_size(&cxlds->pmem_res) && - range_contains(&pmem_range, &dent->dpa_range)) - update_perf_entry(dev, dent, &mds->pmem_perf); - else + bool found = false; + + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct resource *res = &cxlds->part[i].res; + struct range range = { + .start = res->start, + .end = res->end, + }; + + if (range_contains(&range, &dent->dpa_range)) { + update_perf_entry(dev, dent, + &cxlds->part[i].perf); + found = true; + break; + } + } + + if (!found) dev_dbg(dev, "no partition for dsmas dpa: %pra\n", &dent->dpa_range); } @@ -343,36 +345,46 @@ static int match_cxlrd_hb(struct device *dev, void *data) return 0; } -static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) +static void cxl_qos_class_verify(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); struct cxl_port *root_port; - int rc; struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(cxlmd->endpoint); + /* + * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to + * succeed when called in the cxl_endpoint_port_probe() path. + */ if (!cxl_root) - return -ENODEV; + return; root_port = &cxl_root->port; - /* Check that the QTG IDs are all sane between end device and root decoders */ - if (!cxl_qos_match(root_port, &mds->ram_perf)) - reset_dpa_perf(&mds->ram_perf); - if (!cxl_qos_match(root_port, &mds->pmem_perf)) - reset_dpa_perf(&mds->pmem_perf); - - /* Check to make sure that the device's host bridge is under a root decoder */ - rc = device_for_each_child(&root_port->dev, - cxlmd->endpoint->host_bridge, match_cxlrd_hb); - if (!rc) { - reset_dpa_perf(&mds->ram_perf); - reset_dpa_perf(&mds->pmem_perf); + /* + * Save userspace from needing to check if a qos class has any matches + * by hiding qos class info if the memdev is not mapped by a root + * decoder, or the partition class does not match any root decoder + * class. 
+ */ + if (!device_for_each_child(&root_port->dev, + cxlmd->endpoint->host_bridge, + match_cxlrd_hb)) { + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + + reset_dpa_perf(perf); + } + return; } - return rc; + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + + if (!cxl_qos_match(root_port, perf)) + reset_dpa_perf(perf); + } } static void discard_dsmas(struct xarray *xa) @@ -570,23 +582,18 @@ static bool dpa_perf_contains(struct cxl_dpa_perf *perf, return range_contains(&perf->dpa_range, &dpa); } -static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode) +static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_dpa_perf *perf; - switch (mode) { - case CXL_DECODER_RAM: - perf = &mds->ram_perf; - break; - case CXL_DECODER_PMEM: - perf = &mds->pmem_perf; - break; - default: + if (cxled->part < 0) + return ERR_PTR(-EINVAL); + perf = &cxlds->part[cxled->part].perf; + + if (!perf) return ERR_PTR(-EINVAL); - } if (!dpa_perf_contains(perf, cxled->dpa_res)) return ERR_PTR(-EINVAL); @@ -647,11 +654,10 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr, if (cxlds->rcd) return -ENODEV; - perf = cxled_get_dpa_perf(cxled, cxlr->mode); + perf = cxled_get_dpa_perf(cxled); if (IS_ERR(perf)) return PTR_ERR(perf); - gp_port = to_cxl_port(parent_port->dev.parent); *gp_is_root = is_cxl_root(gp_port); /* @@ -1053,7 +1059,7 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, lockdep_assert_held(&cxl_dpa_rwsem); - perf = cxled_get_dpa_perf(cxled, cxlr->mode); + perf = cxled_get_dpa_perf(cxled); if (IS_ERR(perf)) return; diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 17e99a25c29a..15699299dc11 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -74,8 +74,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, resource_size_t length); struct dentry *cxl_debugfs_create_dir(const char *dir); -int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode); +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode); int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); @@ -117,6 +117,12 @@ bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +int cxl_ras_init(void); +void cxl_ras_exit(void); +int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port); +int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size); + #ifdef CONFIG_CXL_FEATURES size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, enum cxl_get_feat_selection selection, diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 50e6a45b30ba..70cae4ebf8a4 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -213,16 +213,46 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds) { struct resource *p1, *p2; - down_read(&cxl_dpa_rwsem); + guard(rwsem_read)(&cxl_dpa_rwsem); for (p1 = 
cxlds->dpa_res.child; p1; p1 = p1->sibling) { __cxl_dpa_debug(file, p1, 0); for (p2 = p1->child; p2; p2 = p2->sibling) __cxl_dpa_debug(file, p2, 1); } - up_read(&cxl_dpa_rwsem); } EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL"); +/* See request_skip() kernel-doc */ +static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds, + const resource_size_t skip_base, + const resource_size_t skip_len, + const char *requester) +{ + const resource_size_t skip_end = skip_base + skip_len - 1; + + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *part_res = &cxlds->part[i].res; + resource_size_t adjust_start, adjust_end, size; + + adjust_start = max(skip_base, part_res->start); + adjust_end = min(skip_end, part_res->end); + + if (adjust_end < adjust_start) + continue; + + size = adjust_end - adjust_start + 1; + + if (!requester) + __release_region(&cxlds->dpa_res, adjust_start, size); + else if (!__request_region(&cxlds->dpa_res, adjust_start, size, + requester, 0)) + return adjust_start - skip_base; + } + + return skip_len; +} +#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL) + /* * Must be called in a context that synchronizes against this decoder's * port ->remove() callback (like an endpoint decoder sysfs attribute) @@ -241,7 +271,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) skip_start = res->start - cxled->skip; __release_region(&cxlds->dpa_res, res->start, resource_size(res)); if (cxled->skip) - __release_region(&cxlds->dpa_res, skip_start, cxled->skip); + release_skip(cxlds, skip_start, cxled->skip); cxled->skip = 0; cxled->dpa_res = NULL; put_device(&cxled->cxld.dev); @@ -250,9 +280,8 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) static void cxl_dpa_release(void *cxled) { - down_write(&cxl_dpa_rwsem); + guard(rwsem_write)(&cxl_dpa_rwsem); __cxl_dpa_release(cxled); - up_write(&cxl_dpa_rwsem); } /* @@ -268,6 +297,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) __cxl_dpa_release(cxled); } +/** + * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree + * @cxlds: CXL.mem device context that parents @cxled + * @cxled: Endpoint decoder establishing new allocation that skips lower DPA + * @skip_base: DPA < start of new DPA allocation (DPAnew) + * @skip_len: @skip_base + @skip_len == DPAnew + * + * DPA 'skip' arises from out-of-sequence DPA allocation events relative + * to free capacity across multiple partitions. It is a wasteful event + * as usable DPA gets thrown away, but if a deployment has, for example, + * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM + * DPA, the free RAM DPA must be sacrificed to start allocating PMEM. + * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder + * Protection" for more details. + * + * A 'skip' always covers the last allocated DPA in a previous partition + * to the start of the current partition to allocate. Allocations never + * start in the middle of a partition, and allocations are always + * de-allocated in reverse order (see cxl_dpa_free(), or natural devm + * unwind order from forced in-order allocation). + * + * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip' + * would always be contained to a single partition. Given + * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip' + * might span "tail capacity of partition[0], all of partition[1], ..., + * all of partition[N-1]" to support allocating from partition[N]. 
That + * in turn interacts with the partition 'struct resource' boundaries + * within @cxlds->dpa_res whereby 'skip' requests need to be divided by + * partition. I.e. this is a quirk of using a 'struct resource' tree to + * detect range conflicts while also tracking partition boundaries in + * @cxlds->dpa_res. + */ +static int request_skip(struct cxl_dev_state *cxlds, + struct cxl_endpoint_decoder *cxled, + const resource_size_t skip_base, + const resource_size_t skip_len) +{ + resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len, + dev_name(&cxled->cxld.dev)); + + if (skipped == skip_len) + return 0; + + dev_dbg(cxlds->dev, + "%s: failed to reserve skipped space (%pa %pa %pa)\n", + dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped); + + release_skip(cxlds, skip_base, skipped); + + return -EBUSY; +} + static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped) @@ -277,6 +358,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &port->dev; struct resource *res; + int rc; lockdep_assert_held_write(&cxl_dpa_rwsem); @@ -305,14 +387,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, } if (skipped) { - res = __request_region(&cxlds->dpa_res, base - skipped, skipped, - dev_name(&cxled->cxld.dev), 0); - if (!res) { - dev_dbg(dev, - "decoder%d.%d: failed to reserve skipped space\n", - port->id, cxled->cxld.id); - return -EBUSY; - } + rc = request_skip(cxlds, cxled, base - skipped, skipped); + if (rc) + return rc; } res = __request_region(&cxlds->dpa_res, base, len, dev_name(&cxled->cxld.dev), 0); @@ -320,28 +397,117 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", port->id, cxled->cxld.id); if (skipped) - __release_region(&cxlds->dpa_res, base - skipped, - skipped); + release_skip(cxlds, base - skipped, skipped); return -EBUSY; } cxled->dpa_res = res; cxled->skip = skipped; - if (resource_contains(&cxlds->pmem_res, res)) - cxled->mode = CXL_DECODER_PMEM; - else if (resource_contains(&cxlds->ram_res, res)) - cxled->mode = CXL_DECODER_RAM; - else { - dev_warn(dev, "decoder%d.%d: %pr mixed mode not supported\n", - port->id, cxled->cxld.id, cxled->dpa_res); - cxled->mode = CXL_DECODER_MIXED; - } + /* + * When allocating new capacity, ->part is already set, when + * discovering decoder settings at initial enumeration, ->part + * is not set. 
+ */ + if (cxled->part < 0) + for (int i = 0; cxlds->nr_partitions; i++) + if (resource_contains(&cxlds->part[i].res, res)) { + cxled->part = i; + break; + } + + if (cxled->part < 0) + dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", + port->id, cxled->cxld.id, res); port->hdm_end++; get_device(&cxled->cxld.dev); return 0; } +static int add_dpa_res(struct device *dev, struct resource *parent, + struct resource *res, resource_size_t start, + resource_size_t size, const char *type) +{ + int rc; + + *res = (struct resource) { + .name = type, + .start = start, + .end = start + size - 1, + .flags = IORESOURCE_MEM, + }; + if (resource_size(res) == 0) { + dev_dbg(dev, "DPA(%s): no capacity\n", res->name); + return 0; + } + rc = request_resource(parent, res); + if (rc) { + dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, + res, rc); + return rc; + } + + dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); + + return 0; +} + +static const char *cxl_mode_name(enum cxl_partition_mode mode) +{ + switch (mode) { + case CXL_PARTMODE_RAM: + return "ram"; + case CXL_PARTMODE_PMEM: + return "pmem"; + default: + return ""; + }; +} + +/* if this fails the caller must destroy @cxlds, there is no recovery */ +int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info) +{ + struct device *dev = cxlds->dev; + + guard(rwsem_write)(&cxl_dpa_rwsem); + + if (cxlds->nr_partitions) + return -EBUSY; + + if (!info->size || !info->nr_partitions) { + cxlds->dpa_res = DEFINE_RES_MEM(0, 0); + cxlds->nr_partitions = 0; + return 0; + } + + cxlds->dpa_res = DEFINE_RES_MEM(0, info->size); + + for (int i = 0; i < info->nr_partitions; i++) { + const struct cxl_dpa_part_info *part = &info->part[i]; + int rc; + + cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID; + cxlds->part[i].mode = part->mode; + + /* Require ordered + contiguous partitions */ + if (i) { + const struct cxl_dpa_part_info *prev = &info->part[i - 1]; + + if (prev->range.end + 1 != part->range.start) + return -EINVAL; + } + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res, + part->range.start, range_len(&part->range), + cxl_mode_name(part->mode)); + if (rc) + return rc; + cxlds->nr_partitions++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxl_dpa_setup); + int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped) @@ -362,14 +528,11 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL"); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) { - resource_size_t size = 0; - - down_read(&cxl_dpa_rwsem); + guard(rwsem_read)(&cxl_dpa_rwsem); if (cxled->dpa_res) - size = resource_size(cxled->dpa_res); - up_read(&cxl_dpa_rwsem); + return resource_size(cxled->dpa_res); - return size; + return 0; } resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) @@ -387,151 +550,136 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) { struct cxl_port *port = cxled_to_port(cxled); struct device *dev = &cxled->cxld.dev; - int rc; - down_write(&cxl_dpa_rwsem); - if (!cxled->dpa_res) { - rc = 0; - goto out; - } + guard(rwsem_write)(&cxl_dpa_rwsem); + if (!cxled->dpa_res) + return 0; if (cxled->cxld.region) { dev_dbg(dev, "decoder assigned to: %s\n", dev_name(&cxled->cxld.region->dev)); - rc = -EBUSY; - goto out; + return -EBUSY; } if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { dev_dbg(dev, "decoder enabled\n"); - rc = -EBUSY; - goto out; + return -EBUSY; } if (cxled->cxld.id != port->hdm_end) { dev_dbg(dev, "expected 
decoder%d.%d\n", port->id, port->hdm_end); - rc = -EBUSY; - goto out; + return -EBUSY; } + devm_cxl_dpa_release(cxled); - rc = 0; -out: - up_write(&cxl_dpa_rwsem); - return rc; + return 0; } -int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode) +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &cxled->cxld.dev; - - switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: - break; - default: - dev_dbg(dev, "unsupported mode: %d\n", mode); - return -EINVAL; - } + int part; guard(rwsem_write)(&cxl_dpa_rwsem); if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) return -EBUSY; - /* - * Only allow modes that are supported by the current partition - * configuration - */ - if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) { - dev_dbg(dev, "no available pmem capacity\n"); - return -ENXIO; + for (part = 0; part < cxlds->nr_partitions; part++) + if (cxlds->part[part].mode == mode) + break; + + if (part >= cxlds->nr_partitions) { + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; } - if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) { - dev_dbg(dev, "no available ram capacity\n"); + + if (!resource_size(&cxlds->part[part].res)) { + dev_dbg(dev, "no available capacity for mode: %d\n", mode); return -ENXIO; } - cxled->mode = mode; + cxled->part = part; return 0; } -int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) +static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - resource_size_t free_ram_start, free_pmem_start; - struct cxl_port *port = cxled_to_port(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &cxled->cxld.dev; - resource_size_t start, avail, skip; + struct resource *res, *prev = NULL; + resource_size_t start, avail, skip, skip_start; struct resource *p, *last; - int rc; + int part; - down_write(&cxl_dpa_rwsem); + guard(rwsem_write)(&cxl_dpa_rwsem); if (cxled->cxld.region) { dev_dbg(dev, "decoder attached to %s\n", dev_name(&cxled->cxld.region->dev)); - rc = -EBUSY; - goto out; + return -EBUSY; } if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { dev_dbg(dev, "decoder enabled\n"); - rc = -EBUSY; - goto out; + return -EBUSY; } - for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling) - last = p; - if (last) - free_ram_start = last->end + 1; - else - free_ram_start = cxlds->ram_res.start; + part = cxled->part; + if (part < 0) { + dev_dbg(dev, "partition not set\n"); + return -EBUSY; + } - for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling) + res = &cxlds->part[part].res; + for (p = res->child, last = NULL; p; p = p->sibling) last = p; if (last) - free_pmem_start = last->end + 1; + start = last->end + 1; else - free_pmem_start = cxlds->pmem_res.start; - - if (cxled->mode == CXL_DECODER_RAM) { - start = free_ram_start; - avail = cxlds->ram_res.end - start + 1; - skip = 0; - } else if (cxled->mode == CXL_DECODER_PMEM) { - resource_size_t skip_start, skip_end; - - start = free_pmem_start; - avail = cxlds->pmem_res.end - start + 1; - skip_start = free_ram_start; + start = res->start; - /* - * If some pmem is already allocated, then that allocation - * already handled the skip. 
- */ - if (cxlds->pmem_res.child && - skip_start == cxlds->pmem_res.child->start) - skip_end = skip_start - 1; - else - skip_end = start - 1; - skip = skip_end - skip_start + 1; - } else { - dev_dbg(dev, "mode not set\n"); - rc = -EINVAL; - goto out; + /* + * To allocate at partition N, a skip needs to be calculated for all + * unallocated space at lower partitions indices. + * + * If a partition has any allocations, the search can end because a + * previous cxl_dpa_alloc() invocation is assumed to have accounted for + * all previous partitions. + */ + skip_start = CXL_RESOURCE_NONE; + for (int i = part; i; i--) { + prev = &cxlds->part[i - 1].res; + for (p = prev->child, last = NULL; p; p = p->sibling) + last = p; + if (last) { + skip_start = last->end + 1; + break; + } + skip_start = prev->start; } + avail = res->end - start + 1; + if (skip_start == CXL_RESOURCE_NONE) + skip = 0; + else + skip = res->start - skip_start; + if (size > avail) { dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, - cxl_decoder_mode_name(cxled->mode), &avail); - rc = -ENOSPC; - goto out; + res->name, &avail); + return -ENOSPC; } - rc = __cxl_dpa_reserve(cxled, start, size, skip); -out: - up_write(&cxl_dpa_rwsem); + return __cxl_dpa_reserve(cxled, start, size, skip); +} + +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) +{ + struct cxl_port *port = cxled_to_port(cxled); + int rc; + rc = __cxl_dpa_alloc(cxled, size); if (rc) return rc; diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 78c5346e3e89..d72764056ce6 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -11,6 +11,7 @@ #include "core.h" #include "trace.h" +#include "mce.h" static bool cxl_raw_allow_all; @@ -900,7 +901,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, } if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) { - u64 dpa, hpa = ULLONG_MAX; + u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX; struct cxl_region *cxlr; /* @@ -913,14 +914,20 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); - if (cxlr) + if (cxlr) { + u64 cache_size = cxlr->params.cache_size; + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); + if (cache_size) + hpa_alias = hpa - cache_size; + } if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, cxlr, hpa, - &evt->gen_media); + hpa_alias, &evt->gen_media); else if (event_type == CXL_CPER_EVENT_DRAM) - trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram); + trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias, + &evt->dram); } } EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL"); @@ -1126,10 +1133,6 @@ static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER; mds->active_persistent_bytes = le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER; - mds->next_volatile_bytes = - le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER; - mds->next_persistent_bytes = - le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER; return 0; } @@ -1251,74 +1254,54 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) { struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); struct cxl_port *endpoint; - int rc; /* synchronize with cxl_mem_probe() and decoder write operations */ guard(device)(&cxlmd->dev); endpoint = cxlmd->endpoint; - down_read(&cxl_region_rwsem); + guard(rwsem_read)(&cxl_region_rwsem); /* * 
Require an endpoint to be safe otherwise the driver can not * be sure that the device is unmapped. */ if (endpoint && cxl_num_decoders_committed(endpoint) == 0) - rc = __cxl_mem_sanitize(mds, cmd); - else - rc = -EBUSY; - up_read(&cxl_region_rwsem); + return __cxl_mem_sanitize(mds, cmd); - return rc; + return -EBUSY; } -static int add_dpa_res(struct device *dev, struct resource *parent, - struct resource *res, resource_size_t start, - resource_size_t size, const char *type) +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) { - int rc; - - res->name = type; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_MEM; - if (resource_size(res) == 0) { - dev_dbg(dev, "DPA(%s): no capacity\n", res->name); - return 0; - } - rc = request_resource(parent, res); - if (rc) { - dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, - res, rc); - return rc; - } + int i = info->nr_partitions; - dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); + if (size == 0) + return; - return 0; + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; } -int cxl_mem_create_range_info(struct cxl_memdev_state *mds) +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) { struct cxl_dev_state *cxlds = &mds->cxlds; struct device *dev = cxlds->dev; int rc; if (!cxlds->media_ready) { - cxlds->dpa_res = DEFINE_RES_MEM(0, 0); - cxlds->ram_res = DEFINE_RES_MEM(0, 0); - cxlds->pmem_res = DEFINE_RES_MEM(0, 0); + info->size = 0; return 0; } - cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); + info->size = mds->total_bytes; if (mds->partition_align_bytes == 0) { - rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, - mds->volatile_only_bytes, "ram"); - if (rc) - return rc; - return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, - mds->volatile_only_bytes, - mds->persistent_only_bytes, "pmem"); + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; } rc = cxl_mem_get_partition_info(mds); @@ -1327,15 +1310,52 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) return rc; } - rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, - mds->active_volatile_bytes, "ram"); - if (rc) - return rc; - return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, - mds->active_volatile_bytes, - mds->active_persistent_bytes, "pmem"); + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); + +int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_health_info_out hi; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_HEALTH_INFO, + .size_out = sizeof(hi), + .payload_out = &hi, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (!rc) + *count = le32_to_cpu(hi.dirty_shutdown_cnt); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL"); + +int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_cmd mbox_cmd; + struct cxl_mbox_set_shutdown_state_in in = { + .state = 1 + }; + + mbox_cmd = (struct 
cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE, + .size_in = sizeof(in), + .payload_in = &in, + }; + + return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); } -EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL"); +EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL"); int cxl_set_timestamp(struct cxl_memdev_state *mds) { @@ -1467,6 +1487,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) { struct cxl_memdev_state *mds; + int rc; mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); if (!mds) { @@ -1480,8 +1501,12 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.cxl_mbox.host = dev; mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; - mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID; - mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID; + + rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); + if (rc == -EOPNOTSUPP) + dev_warn(dev, "CXL MCE unsupported\n"); + else if (rc) + return ERR_PTR(rc); return mds; } diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c new file mode 100644 index 000000000000..ff8d078c6ca1 --- /dev/null +++ b/drivers/cxl/core/mce.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */ +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/set_memory.h> +#include <asm/mce.h> +#include <cxlmem.h> +#include "mce.h" + +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state, + mce_notifier); + struct cxl_memdev *cxlmd = mds->cxlds.cxlmd; + struct cxl_port *endpoint = cxlmd->endpoint; + struct mce *mce = data; + u64 spa, spa_alias; + unsigned long pfn; + + if (!mce || !mce_usable_address(mce)) + return NOTIFY_DONE; + + if (!endpoint) + return NOTIFY_DONE; + + spa = mce->addr & MCI_ADDR_PHYSADDR; + + pfn = spa >> PAGE_SHIFT; + if (!pfn_valid(pfn)) + return NOTIFY_DONE; + + spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa); + if (spa_alias == ~0ULL) + return NOTIFY_DONE; + + pfn = spa_alias >> PAGE_SHIFT; + + /* + * Take down the aliased memory page. The original memory page flagged + * by the MCE will be taken care of by the standard MCE handler. + */ + dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address0: %#llx\n", + spa_alias); + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); + + return NOTIFY_OK; +} + +static void cxl_unregister_mce_notifier(void *mce_notifier) +{ + mce_unregister_decode_chain(mce_notifier); +} + +int devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier) +{ + mce_notifier->notifier_call = cxl_handle_mce; + mce_notifier->priority = MCE_PRIO_UC; + mce_register_decode_chain(mce_notifier); + + return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier, + mce_notifier); +} diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h new file mode 100644 index 000000000000..ace73424eeb6 --- /dev/null +++ b/drivers/cxl/core/mce.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2024 Intel Corporation. All rights reserved. 
*/ +#ifndef _CXL_CORE_MCE_H_ +#define _CXL_CORE_MCE_H_ + +#include <linux/notifier.h> + +#ifdef CONFIG_CXL_MCE +int devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier); +#else +static inline int +devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 2e2e035abdaa..a16a5886d40a 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -75,12 +75,20 @@ static ssize_t label_storage_size_show(struct device *dev, } static DEVICE_ATTR_RO(label_storage_size); +static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) +{ + /* Static RAM is only expected at partition 0. */ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return 0; + return resource_size(&cxlds->part[0].res); +} + static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = resource_size(&cxlds->ram_res); + unsigned long long len = cxl_ram_size(cxlds); return sysfs_emit(buf, "%#llx\n", len); } @@ -93,7 +101,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = resource_size(&cxlds->pmem_res); + unsigned long long len = cxl_pmem_size(cxlds); return sysfs_emit(buf, "%#llx\n", len); } @@ -198,22 +206,17 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd) int rc = 0; /* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */ - if (resource_size(&cxlds->pmem_res)) { - offset = cxlds->pmem_res.start; - length = resource_size(&cxlds->pmem_res); - rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc) - return rc; - } - if (resource_size(&cxlds->ram_res)) { - offset = cxlds->ram_res.start; - length = resource_size(&cxlds->ram_res); + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *res = &cxlds->part[i].res; + + offset = res->start; + length = resource_size(res); rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); /* * Invalid Physical Address is not an error for * volatile addresses. Device support is optional. 
*/ - if (rc == -EFAULT) + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) rc = 0; } return rc; @@ -404,14 +407,21 @@ static struct attribute *cxl_memdev_attributes[] = { NULL, }; +static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) +{ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return &cxlds->part[i].perf; + return NULL; +} + static ssize_t pmem_qos_class_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class); + return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class); } static struct device_attribute dev_attr_pmem_qos_class = @@ -423,14 +433,20 @@ static struct attribute *cxl_memdev_pmem_attributes[] = { NULL, }; +static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) +{ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return NULL; + return &cxlds->part[0].perf; +} + static ssize_t ram_qos_class_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class); + return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class); } static struct device_attribute dev_attr_ram_qos_class = @@ -466,11 +482,11 @@ static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds); - if (a == &dev_attr_ram_qos_class.attr) - if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; + if (a == &dev_attr_ram_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; return a->mode; } @@ -485,11 +501,11 @@ static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds); - if (a == &dev_attr_pmem_qos_class.attr) - if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; + if (a == &dev_attr_pmem_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; return a->mode; } @@ -566,10 +582,9 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); bitmap_or(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds, cmds, CXL_MEM_COMMAND_ID_MAX); - up_write(&cxl_memdev_rwsem); } EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL"); @@ -583,10 +598,9 @@ void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); bitmap_andnot(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds, cmds, CXL_MEM_COMMAND_ID_MAX); - up_write(&cxl_memdev_rwsem); } EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL"); @@ -594,9 +608,8 @@ static void cxl_memdev_shutdown(struct device *dev) { struct cxl_memdev *cxlmd = 
to_cxl_memdev(dev); - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); cxlmd->cxlds = NULL; - up_write(&cxl_memdev_rwsem); } static void cxl_memdev_unregister(void *_cxlmd) @@ -678,15 +691,13 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd, { struct cxl_memdev *cxlmd = file->private_data; struct cxl_dev_state *cxlds; - int rc = -ENXIO; - down_read(&cxl_memdev_rwsem); + guard(rwsem_read)(&cxl_memdev_rwsem); cxlds = cxlmd->cxlds; if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM) - rc = __cxl_memdev_ioctl(cxlmd, cmd, arg); - up_read(&cxl_memdev_rwsem); + return __cxl_memdev_ioctl(cxlmd, cmd, arg); - return rc; + return -ENXIO; } static int cxl_memdev_open(struct inode *inode, struct file *file) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 013b869b66cb..96fecb799cbc 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1054,3 +1054,100 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) return 0; } + +/* + * Set max timeout such that platforms will optimize GPF flow to avoid + * the implied worst-case scenario delays. On a sane platform, all + * devices should always complete GPF within the energy budget of + * the GPF flow. The kernel does not have enough information to pick + * anything better than "maximize timeouts and hope it works". + * + * A misbehaving device could block forward progress of GPF for all + * the other devices, exhausting the energy budget of the platform. + * However, the spec seems to assume that moving on from slow-to-respond + * devices is a virtue. It is not possible to know that, in actuality, + * the slow-to-respond device is *the* most critical device in the + * system to wait for. + */ +#define GPF_TIMEOUT_BASE_MAX 2 +#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */ + +u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port) +{ + u16 dvsec; + + if (!dev_is_pci(dev)) + return 0; + + dvsec = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL, + is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + if (!dvsec) + dev_warn(dev, "%s GPF DVSEC not present\n", + is_port ? 
"Port" : "Device"); + return dvsec; +} +EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL"); + +static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) +{ + u64 base, scale; + int rc, offset; + u16 ctrl; + + switch (phase) { + case 1: + offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + break; + case 2: + offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + break; + default: + return -EINVAL; + } + + rc = pci_read_config_word(pdev, dvsec + offset, &ctrl); + if (rc) + return rc; + + if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX && + FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX) + return 0; + + ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX); + ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX); + + rc = pci_write_config_word(pdev, dvsec + offset, ctrl); + if (!rc) + pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n", + phase, GPF_TIMEOUT_BASE_MAX); + + return rc; +} + +int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port) +{ + struct pci_dev *pdev; + + if (!port) + return -EINVAL; + + if (!port->gpf_dvsec) { + int dvsec; + + dvsec = cxl_gpf_get_dvsec(dport_dev, true); + if (!dvsec) + return -EINVAL; + + port->gpf_dvsec = dvsec; + } + + pdev = to_pci_dev(dport_dev); + update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1); + update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2); + + return 0; +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 78a5c2c25982..0fd6646c1a2e 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -194,25 +194,35 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + /* without @cxl_dpa_rwsem, make sure @part is not reloaded */ + int part = READ_ONCE(cxled->part); + const char *desc; + + if (part < 0) + desc = "none"; + else + desc = cxlds->part[part].res.name; - return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxled->mode)); + return sysfs_emit(buf, "%s\n", desc); } static ssize_t mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); - enum cxl_decoder_mode mode; + enum cxl_partition_mode mode; ssize_t rc; if (sysfs_streq(buf, "pmem")) - mode = CXL_DECODER_PMEM; + mode = CXL_PARTMODE_PMEM; else if (sysfs_streq(buf, "ram")) - mode = CXL_DECODER_RAM; + mode = CXL_PARTMODE_RAM; else return -EINVAL; - rc = cxl_dpa_set_mode(cxled, mode); + rc = cxl_dpa_set_part(cxled, mode); if (rc) return rc; @@ -549,13 +559,9 @@ static ssize_t decoders_committed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_port *port = to_cxl_port(dev); - int rc; - - down_read(&cxl_region_rwsem); - rc = sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port)); - up_read(&cxl_region_rwsem); - return rc; + guard(rwsem_read)(&cxl_region_rwsem); + return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port)); } static DEVICE_ATTR_RO(decoders_committed); @@ -1672,6 +1678,8 @@ retry: if (rc && rc != -EBUSY) return rc; + cxl_gpf_port_setup(dport_dev, port); + /* Any more ports to add between this one and the root? 
*/ if (!dev_is_cxl_root_child(&port->dev)) continue; @@ -1899,6 +1907,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) return ERR_PTR(-ENOMEM); cxled->pos = -1; + cxled->part = -1; cxld = &cxled->cxld; rc = cxl_decoder_init(port, cxld); if (rc) { @@ -2339,8 +2348,14 @@ static __init int cxl_core_init(void) if (rc) goto err_region; + rc = cxl_ras_init(); + if (rc) + goto err_ras; + return 0; +err_ras: + cxl_region_exit(); err_region: bus_unregister(&cxl_bus_type); err_bus: @@ -2352,6 +2367,7 @@ err_wq: static void cxl_core_exit(void) { + cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); destroy_workqueue(cxl_bus_wq); diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c new file mode 100644 index 000000000000..485a831695c7 --- /dev/null +++ b/drivers/cxl/core/ras.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include <linux/pci.h> +#include <linux/aer.h> +#include <cxl/event.h> +#include <cxlmem.h> +#include "trace.h" + +static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + + trace_cxl_port_aer_correctable_error(&pdev->dev, status); +} + +static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + u32 fe; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, + ras_cap.header_log); +} + +static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + struct cxl_dev_state *cxlds; + + cxlds = pci_get_drvdata(pdev); + if (!cxlds) + return; + + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); +} + +static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + struct cxl_dev_state *cxlds; + u32 fe; + + cxlds = pci_get_drvdata(pdev); + if (!cxlds) + return; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, + ras_cap.header_log); +} + +static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) +{ + unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, + data->prot_err.agent_addr.function); + struct pci_dev *pdev __free(pci_dev_put) = + pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, + data->prot_err.agent_addr.bus, + devfn); + int port_type; + + if (!pdev) + return; + + guard(device)(&pdev->dev); + + port_type = pci_pcie_type(pdev); + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_DOWNSTREAM || + port_type == PCI_EXP_TYPE_UPSTREAM) { + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); + else + cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); + + return; + } + + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_prot_err(pdev, data->ras_cap); + else + cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap); +} + +static void cxl_cper_prot_err_work_fn(struct work_struct *work) +{ + struct cxl_cper_prot_err_work_data 
wd; + + while (cxl_cper_prot_err_kfifo_get(&wd)) + cxl_cper_handle_prot_err(&wd); +} +static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn); + +int cxl_ras_init(void) +{ + return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work); +} + +void cxl_ras_exit(void) +{ + cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); + cancel_work_sync(&cxl_cper_prot_err_work); +} diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e8d11a988fd9..c3f4dc244df7 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -144,7 +144,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, rc = down_read_interruptible(&cxl_region_rwsem); if (rc) return rc; - if (cxlr->mode != CXL_DECODER_PMEM) + if (cxlr->mode != CXL_PARTMODE_PMEM) rc = sysfs_emit(buf, "\n"); else rc = sysfs_emit(buf, "%pUb\n", &p->uuid); @@ -441,7 +441,7 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, * Support tooling that expects to find a 'uuid' attribute for all * regions regardless of mode. */ - if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) return 0444; return a->mode; } @@ -603,8 +603,16 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_region *cxlr = to_cxl_region(dev); + const char *desc; - return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxlr->mode)); + if (cxlr->mode == CXL_PARTMODE_RAM) + desc = "ram"; + else if (cxlr->mode == CXL_PARTMODE_PMEM) + desc = "pmem"; + else + desc = ""; + + return sysfs_emit(buf, "%s\n", desc); } static DEVICE_ATTR_RO(mode); @@ -630,7 +638,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) /* ways, granularity and uuid (if PMEM) need to be set before HPA */ if (!p->interleave_ways || !p->interleave_granularity || - (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid))) return -ENXIO; div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); @@ -824,6 +832,21 @@ static int match_free_decoder(struct device *dev, const void *data) return 1; } +static bool region_res_match_cxl_range(const struct cxl_region_params *p, + struct range *range) +{ + if (!p->res) + return false; + + /* + * If an extended linear cache region then the CXL range is assumed + * to be fronted by the DRAM range in current known implementation. + * This assumption will be made until a variant implementation exists. 
+ */ + return p->res->start + p->cache_size == range->start && + p->res->end == range->end; +} + static int match_auto_decoder(struct device *dev, const void *data) { const struct cxl_region_params *p = data; @@ -836,7 +859,7 @@ static int match_auto_decoder(struct device *dev, const void *data) cxld = to_cxl_decoder(dev); r = &cxld->hpa_range; - if (p->res && p->res->start == r->start && p->res->end == r->end) + if (region_res_match_cxl_range(p, r)) return 1; return 0; @@ -1424,8 +1447,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || cxld->interleave_granularity != ig || - cxld->hpa_range.start != p->res->start || - cxld->hpa_range.end != p->res->end || + !region_res_match_cxl_range(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, "%s:%s %s expected iw: %d ig: %d %pr\n", @@ -1888,6 +1910,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_region_params *p = &cxlr->params; struct cxl_port *ep_port, *root_port; struct cxl_dport *dport; @@ -1902,17 +1925,17 @@ static int cxl_region_attach(struct cxl_region *cxlr, return rc; } - if (cxled->mode != cxlr->mode) { - dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n", - dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode); - return -EINVAL; - } - - if (cxled->mode == CXL_DECODER_DEAD) { + if (cxled->part < 0) { dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); return -ENODEV; } + if (cxlds->part[cxled->part].mode != cxlr->mode) { + dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n", + dev_name(&cxled->cxld.dev), cxlr->mode); + return -EINVAL; + } + /* all full of members, or interleave config not established? 
*/ if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_dbg(&cxlr->dev, "region already active\n"); @@ -1951,13 +1974,13 @@ static int cxl_region_attach(struct cxl_region *cxlr, return -ENXIO; } - if (resource_size(cxled->dpa_res) * p->interleave_ways != + if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size != resource_size(p->res)) { dev_dbg(&cxlr->dev, - "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n", + "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), (u64)resource_size(cxled->dpa_res), p->interleave_ways, - (u64)resource_size(p->res)); + (u64)p->cache_size, (u64)resource_size(p->res)); return -EINVAL; } @@ -2115,7 +2138,7 @@ out: void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) { down_write(&cxl_region_rwsem); - cxled->mode = CXL_DECODER_DEAD; + cxled->part = -1; cxl_region_detach(cxled); up_write(&cxl_region_rwsem); } @@ -2471,7 +2494,7 @@ static int cxl_region_calculate_adistance(struct notifier_block *nb, */ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, int id, - enum cxl_decoder_mode mode, + enum cxl_partition_mode mode, enum cxl_decoder_type type) { struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); @@ -2525,13 +2548,13 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_decoder_mode mode, int id) + enum cxl_partition_mode mode, int id) { int rc; switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: + case CXL_PARTMODE_RAM: + case CXL_PARTMODE_PMEM: break; default: dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); @@ -2551,7 +2574,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, } static ssize_t create_region_store(struct device *dev, const char *buf, - size_t len, enum cxl_decoder_mode mode) + size_t len, enum cxl_partition_mode mode) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); struct cxl_region *cxlr; @@ -2572,7 +2595,7 @@ static ssize_t create_pmem_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return create_region_store(dev, buf, len, CXL_DECODER_PMEM); + return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM); } DEVICE_ATTR_RW(create_pmem_region); @@ -2580,7 +2603,7 @@ static ssize_t create_ram_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return create_region_store(dev, buf, len, CXL_DECODER_RAM); + return create_region_store(dev, buf, len, CXL_PARTMODE_RAM); } DEVICE_ATTR_RW(create_ram_region); @@ -2678,7 +2701,7 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); struct cxl_poison_context { struct cxl_port *port; - enum cxl_decoder_mode mode; + int part; u64 offset; }; @@ -2686,47 +2709,45 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, struct cxl_poison_context *ctx) { struct cxl_dev_state *cxlds = cxlmd->cxlds; + const struct resource *res; + struct resource *p, *last; u64 offset, length; int rc = 0; + if (ctx->part < 0) + return 0; + /* - * Collect poison for the remaining unmapped resources - * after poison is collected by committed endpoints. 
- * - * Knowing that PMEM must always follow RAM, get poison - * for unmapped resources based on the last decoder's mode: - * ram: scan remains of ram range, then any pmem range - * pmem: scan remains of pmem range + * Collect poison for the remaining unmapped resources after + * poison is collected by committed endpoints decoders. */ - - if (ctx->mode == CXL_DECODER_RAM) { - offset = ctx->offset; - length = resource_size(&cxlds->ram_res) - offset; + for (int i = ctx->part; i < cxlds->nr_partitions; i++) { + res = &cxlds->part[i].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + offset = last->end + 1; + else + offset = res->start; + length = res->end - offset + 1; + if (!length) + break; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT) - rc = 0; + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) + continue; if (rc) - return rc; - } - if (ctx->mode == CXL_DECODER_PMEM) { - offset = ctx->offset; - length = resource_size(&cxlds->dpa_res) - offset; - if (!length) - return 0; - } else if (resource_size(&cxlds->pmem_res)) { - offset = cxlds->pmem_res.start; - length = resource_size(&cxlds->pmem_res); - } else { - return 0; + break; } - return cxl_mem_get_poison(cxlmd, offset, length, NULL); + return rc; } static int poison_by_decoder(struct device *dev, void *arg) { struct cxl_poison_context *ctx = arg; struct cxl_endpoint_decoder *cxled; + enum cxl_partition_mode mode; + struct cxl_dev_state *cxlds; struct cxl_memdev *cxlmd; u64 offset, length; int rc = 0; @@ -2735,27 +2756,18 @@ static int poison_by_decoder(struct device *dev, void *arg) return rc; cxled = to_cxl_endpoint_decoder(dev); - if (!cxled->dpa_res || !resource_size(cxled->dpa_res)) - return rc; - - /* - * Regions are only created with single mode decoders: pmem or ram. - * Linux does not support mixed mode decoders. This means that - * reading poison per endpoint decoder adheres to the requirement - * that poison reads of pmem and ram must be separated. 
- * CXL 3.0 Spec 8.2.9.8.4.1 - */ - if (cxled->mode == CXL_DECODER_MIXED) { - dev_dbg(dev, "poison list read unsupported in mixed mode\n"); + if (!cxled->dpa_res) return rc; - } cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + mode = cxlds->part[cxled->part].mode; + if (cxled->skip) { offset = cxled->dpa_res->start - cxled->skip; length = cxled->skip; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2764,7 +2776,7 @@ static int poison_by_decoder(struct device *dev, void *arg) offset = cxled->dpa_res->start; length = cxled->dpa_res->end - offset + 1; rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2772,7 +2784,7 @@ static int poison_by_decoder(struct device *dev, void *arg) /* Iterate until commit_end is reached */ if (cxled->cxld.id == ctx->port->commit_end) { ctx->offset = cxled->dpa_res->end + 1; - ctx->mode = cxled->mode; + ctx->part = cxled->part; return 1; } @@ -2785,7 +2797,8 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port) int rc = 0; ctx = (struct cxl_poison_context) { - .port = port + .port = port, + .part = -1, }; rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder); @@ -2921,7 +2934,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); /* Apply the hpa_offset to the region base address */ - hpa = hpa_offset + p->res->start; + hpa = hpa_offset + p->res->start + p->cache_size; /* Root decoder translation overrides typical modulo decode */ if (cxlrd->hpa_to_spa) @@ -3038,17 +3051,13 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) struct cxl_dax_region *cxlr_dax; struct device *dev; - down_read(&cxl_region_rwsem); - if (p->state != CXL_CONFIG_COMMIT) { - cxlr_dax = ERR_PTR(-ENXIO); - goto out; - } + guard(rwsem_read)(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) + return ERR_PTR(-ENXIO); cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL); - if (!cxlr_dax) { - cxlr_dax = ERR_PTR(-ENOMEM); - goto out; - } + if (!cxlr_dax) + return ERR_PTR(-ENOMEM); cxlr_dax->hpa_range.start = p->res->start; cxlr_dax->hpa_range.end = p->res->end; @@ -3061,8 +3070,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) dev->parent = &cxlr->dev; dev->bus = &cxl_bus_type; dev->type = &cxl_dax_region_type; -out: - up_read(&cxl_region_rwsem); return cxlr_dax; } @@ -3208,7 +3215,6 @@ static int match_region_by_range(struct device *dev, const void *data) struct cxl_region_params *p; struct cxl_region *cxlr; const struct range *r = data; - int rc = 0; if (!is_cxl_region(dev)) return 0; @@ -3216,60 +3222,96 @@ static int match_region_by_range(struct device *dev, const void *data) cxlr = to_cxl_region(dev); p = &cxlr->params; - down_read(&cxl_region_rwsem); + guard(rwsem_read)(&cxl_region_rwsem); if (p->res && p->res->start == r->start && p->res->end == r->end) - rc = 1; - up_read(&cxl_region_rwsem); + return 1; - return rc; + return 0; } -/* Establish an empty region covering the given HPA range */ -static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, - struct cxl_endpoint_decoder *cxled) +static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, + struct resource *res) +{ + struct cxl_root_decoder *cxlrd = 
to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + int nid = phys_to_target_node(res->start); + resource_size_t size = resource_size(res); + resource_size_t cache_size, start; + int rc; + + rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size); + if (rc) + return rc; + + if (!cache_size) + return 0; + + if (size != cache_size) { + dev_warn(&cxlr->dev, + "Extended Linear Cache size %pa != CXL size %pa. No Support!", + &cache_size, &size); + return -ENXIO; + } + + /* + * Move the start of the range to where the cache range starts. The + * implementation assumes that the cache range is in front of the + * CXL range. This is not dictated by the HMAT spec but is how the + * current known implementation is configured. + * + * The cache range is expected to be within the CFMWS. The adjusted + * res->start should not be less than cxlrd->res->start. + */ + start = res->start - cache_size; + if (start < cxlrd->res->start) + return -ENXIO; + + res->start = start; + p->cache_size = cache_size; + + return 0; +} + +static int __construct_region(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct range *hpa = &cxled->cxld.hpa_range; struct cxl_region_params *p; - struct cxl_region *cxlr; struct resource *res; int rc; - do { - cxlr = __create_region(cxlrd, cxled->mode, - atomic_read(&cxlrd->region_id)); - } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); - - if (IS_ERR(cxlr)) { - dev_err(cxlmd->dev.parent, - "%s:%s: %s failed assign region: %ld\n", - dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), - __func__, PTR_ERR(cxlr)); - return cxlr; - } - - down_write(&cxl_region_rwsem); + guard(rwsem_write)(&cxl_region_rwsem); p = &cxlr->params; if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_err(cxlmd->dev.parent, "%s:%s: %s autodiscovery interrupted\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__); - rc = -EBUSY; - goto err; + return -EBUSY; } set_bit(CXL_REGION_F_AUTO, &cxlr->flags); res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) { - rc = -ENOMEM; - goto err; - } + if (!res) + return -ENOMEM; *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), dev_name(&cxlr->dev)); + + rc = cxl_extended_linear_cache_resize(cxlr, res); + if (rc && rc != -EOPNOTSUPP) { + /* + * Failing to support extended linear cache region resize does not + * prevent the region from functioning. It only causes the cxl list + * output to show an incorrect region size. 
+ */ + dev_warn(cxlmd->dev.parent, + "Extended linear cache calculation failed rc:%d\n", rc); + } + rc = insert_resource(cxlrd->res, res); if (rc) { /* @@ -3289,7 +3331,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); if (rc) - goto err; + return rc; dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__, @@ -3298,14 +3340,40 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, /* ...to match put_device() in cxl_add_to_region() */ get_device(&cxlr->dev); - up_write(&cxl_region_rwsem); - return cxlr; + return 0; +} -err: - up_write(&cxl_region_rwsem); - devm_release_action(port->uport_dev, unregister_region, cxlr); - return ERR_PTR(rc); +/* Establish an empty region covering the given HPA range */ +static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + int rc, part = READ_ONCE(cxled->part); + struct cxl_region *cxlr; + + do { + cxlr = __create_region(cxlrd, cxlds->part[part].mode, + atomic_read(&cxlrd->region_id)); + } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); + + if (IS_ERR(cxlr)) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s failed assign region: %ld\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__, PTR_ERR(cxlr)); + return cxlr; + } + + rc = __construct_region(cxlr, cxlrd, cxled); + if (rc) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return ERR_PTR(rc); + } + + return cxlr; } int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled) @@ -3375,6 +3443,34 @@ out: } EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL"); +u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa) +{ + struct cxl_region_ref *iter; + unsigned long index; + + if (!endpoint) + return ~0ULL; + + guard(rwsem_write)(&cxl_region_rwsem); + + xa_for_each(&endpoint->regions, index, iter) { + struct cxl_region_params *p = &iter->region->params; + + if (p->res->start <= spa && spa <= p->res->end) { + if (!p->cache_size) + return ~0ULL; + + if (spa >= p->res->start + p->cache_size) + return spa - p->cache_size; + + return spa + p->cache_size; + } + } + + return ~0ULL; +} +EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL"); + static int is_system_ram(struct resource *res, void *arg) { struct cxl_region *cxlr = arg; @@ -3440,9 +3536,9 @@ out: return rc; switch (cxlr->mode) { - case CXL_DECODER_PMEM: + case CXL_PARTMODE_PMEM: return devm_cxl_add_pmem_region(cxlr); - case CXL_DECODER_RAM: + case CXL_PARTMODE_RAM: /* * The region can not be manged by CXL if any portion of * it is already online as 'System RAM' diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index cea706b683b5..25ebfbc1616c 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -48,6 +48,34 @@ { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ ) +TRACE_EVENT(cxl_port_aer_uncorrectable_error, + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(device); + __assign_str(host); + 
__entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", + __get_str(device), __get_str(host), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + TRACE_EVENT(cxl_aer_uncorrectable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), TP_ARGS(cxlmd, status, fe, hl), @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ ) +TRACE_EVENT(cxl_port_aer_correctable_error, + TP_PROTO(struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(device); + __assign_str(host); + __entry->status = status; + ), + TP_printk("device=%s host=%s status='%s'", + __get_str(device), __get_str(host), + show_ce_errs(__entry->status) + ) +); + TRACE_EVENT(cxl_aer_correctable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), TP_ARGS(cxlmd, status), @@ -392,9 +439,10 @@ TRACE_EVENT(cxl_generic_event, TRACE_EVENT(cxl_general_media, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec), + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_gen_media *rec), - TP_ARGS(cxlmd, log, cxlr, hpa, rec), + TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -408,6 +456,7 @@ TRACE_EVENT(cxl_general_media, __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) /* Following are out of order to pack trace record */ __field(u64, hpa) + __field(u64, hpa_alias0) __field_struct(uuid_t, region_uuid) __field(u16, validity_flags) __field(u8, rank) @@ -438,6 +487,7 @@ TRACE_EVENT(cxl_general_media, CXL_EVENT_GEN_MED_COMP_ID_SIZE); __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; if (cxlr) { __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); @@ -455,7 +505,7 @@ TRACE_EVENT(cxl_general_media, "device=%x validity_flags='%s' " \ "comp_id=%s comp_id_pldm_valid_flags='%s' " \ "pldm_entity_id=%s pldm_resource_id=%s " \ - "hpa=%llx region=%s region_uuid=%pUb " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ "cme_threshold_ev_flags='%s' cme_count=%u", __entry->dpa, show_dpa_flags(__entry->dpa_flags), show_event_desc_flags(__entry->descriptor), @@ -470,7 +520,7 @@ TRACE_EVENT(cxl_general_media, CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT, CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), - __entry->hpa, __get_str(region_name), &__entry->region_uuid, + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count ) ); @@ -529,9 +579,10 @@ TRACE_EVENT(cxl_general_media, TRACE_EVENT(cxl_dram, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec), + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_dram *rec), - TP_ARGS(cxlmd, log, cxlr, hpa, rec), + TP_ARGS(cxlmd, log, cxlr, 
hpa, hpa_alias0, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -547,6 +598,7 @@ TRACE_EVENT(cxl_dram, __field(u32, row) __array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE) __field(u64, hpa) + __field(u64, hpa_alias0) __field_struct(uuid_t, region_uuid) __field(u8, rank) /* Out of order to pack trace record */ __field(u8, bank_group) /* Out of order to pack trace record */ @@ -584,6 +636,7 @@ TRACE_EVENT(cxl_dram, memcpy(__entry->cor_mask, &rec->correction_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE); __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; if (cxlr) { __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); @@ -604,7 +657,7 @@ TRACE_EVENT(cxl_dram, "validity_flags='%s' " \ "comp_id=%s comp_id_pldm_valid_flags='%s' " \ "pldm_entity_id=%s pldm_resource_id=%s " \ - "hpa=%llx region=%s region_uuid=%pUb " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ "sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u", __entry->dpa, show_dpa_flags(__entry->dpa_flags), show_event_desc_flags(__entry->descriptor), @@ -622,7 +675,7 @@ TRACE_EVENT(cxl_dram, CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT, CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), - __entry->hpa, __get_str(region_name), &__entry->region_uuid, + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, __entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cvme_count ) @@ -870,6 +923,7 @@ TRACE_EVENT(cxl_poison, __string(region, cxlr ? dev_name(&cxlr->dev) : "") __field(u64, overflow_ts) __field(u64, hpa) + __field(u64, hpa_alias0) __field(u64, dpa) __field(u32, dpa_length) __array(char, uuid, 16) @@ -892,16 +946,22 @@ TRACE_EVENT(cxl_poison, memcpy(__entry->uuid, &cxlr->params.uuid, 16); __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, __entry->dpa); + if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size) + __entry->hpa_alias0 = __entry->hpa + + cxlr->params.cache_size; + else + __entry->hpa_alias0 = ULLONG_MAX; } else { __assign_str(region); memset(__entry->uuid, 0, 16); __entry->hpa = ULLONG_MAX; + __entry->hpa_alias0 = ULLONG_MAX; } ), TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \ - "region_uuid=%pU hpa=0x%llx dpa=0x%llx dpa_length=0x%x " \ - "source=%s flags=%s overflow_time=%llu", + "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \ + "dpa_length=0x%x source=%s flags=%s overflow_time=%llu", __get_str(memdev), __get_str(host), __entry->serial, @@ -909,6 +969,7 @@ TRACE_EVENT(cxl_poison, __get_str(region), __entry->uuid, __entry->hpa, + __entry->hpa_alias0, __entry->dpa, __entry->dpa_length, show_poison_source(__entry->source), diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index bbbaa0d0a670..be8a7dc77719 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -373,32 +373,6 @@ struct cxl_decoder { }; /* - * CXL_DECODER_DEAD prevents endpoints from being reattached to regions - * while cxld_unregister() is running - */ -enum cxl_decoder_mode { - CXL_DECODER_NONE, - CXL_DECODER_RAM, - CXL_DECODER_PMEM, - CXL_DECODER_MIXED, - CXL_DECODER_DEAD, -}; - -static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode) -{ - static const char * const names[] = { - [CXL_DECODER_NONE] = "none", - [CXL_DECODER_RAM] = "ram", - [CXL_DECODER_PMEM] = "pmem", - [CXL_DECODER_MIXED] = "mixed", - }; - - if (mode >= CXL_DECODER_NONE && mode <= CXL_DECODER_MIXED) - return names[mode]; - return "mixed"; -} - 
-/* * Track whether this decoder is reserved for region autodiscovery, or * free for userspace provisioning. */ @@ -412,16 +386,16 @@ enum cxl_decoder_state { * @cxld: base cxl_decoder_object * @dpa_res: actively claimed DPA span of this decoder * @skip: offset into @dpa_res where @cxld.hpa_range maps - * @mode: which memory type / access-mode-partition this decoder targets * @state: autodiscovery state + * @part: partition index this decoder maps * @pos: interleave position in @cxld.region */ struct cxl_endpoint_decoder { struct cxl_decoder cxld; struct resource *dpa_res; resource_size_t skip; - enum cxl_decoder_mode mode; enum cxl_decoder_state state; + int part; int pos; }; @@ -493,6 +467,7 @@ enum cxl_config_state { * @res: allocated iomem capacity for this region * @targets: active ordered targets in current decoder configuration * @nr_targets: number of targets + * @cache_size: extended linear cache size if exists, otherwise zero. * * State transitions are protected by the cxl_region_rwsem */ @@ -504,6 +479,12 @@ struct cxl_region_params { struct resource *res; struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE]; int nr_targets; + resource_size_t cache_size; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, }; /* @@ -525,7 +506,7 @@ struct cxl_region_params { * struct cxl_region - CXL region * @dev: This region's device * @id: This region's id. Id is globally unique across all regions - * @mode: Endpoint decoder allocation / access mode + * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge @@ -538,7 +519,7 @@ struct cxl_region_params { struct cxl_region { struct device dev; int id; - enum cxl_decoder_mode mode; + enum cxl_partition_mode mode; enum cxl_decoder_type type; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; @@ -563,6 +544,7 @@ struct cxl_nvdimm { struct device dev; struct cxl_memdev *cxlmd; u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */ + u64 dirty_shutdowns; }; struct cxl_pmem_region_mapping { @@ -610,6 +592,7 @@ struct cxl_dax_region { * @cdat: Cached CDAT data * @cdat_available: Should a CDAT attribute be available in sysfs * @pci_latency: Upstream latency in picoseconds + * @gpf_dvsec: Cached GPF port DVSEC */ struct cxl_port { struct device dev; @@ -633,6 +616,7 @@ struct cxl_port { } cdat; bool cdat_available; long pci_latency; + int gpf_dvsec; }; /** @@ -875,6 +859,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled); struct cxl_dax_region *to_cxl_dax_region(struct device *dev); +u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -893,6 +878,11 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev) { return NULL; } +static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, + u64 spa) +{ + return 0; +} #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); @@ -920,4 +910,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); #define __mock static #endif +u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port); + #endif /* __CXL_H__ */ diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index dd2b7060d501..3ec6b906371b 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ 
-97,6 +97,19 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); +#define CXL_NR_PARTITIONS_MAX 2 + +struct cxl_dpa_info { + u64 size; + struct cxl_dpa_part_info { + struct range range; + enum cxl_partition_mode mode; + } part[CXL_NR_PARTITIONS_MAX]; + int nr_partitions; +}; + +int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info); + static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port, struct cxl_memdev *cxlmd) { @@ -373,6 +386,18 @@ struct cxl_dpa_perf { }; /** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... + */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +/** * struct cxl_dev_state - The driver device state * * cxl_dev_state represents the CXL driver/device state. It provides an @@ -387,8 +412,8 @@ struct cxl_dpa_perf { * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) * @media_ready: Indicate whether the device media is usable * @dpa_res: Overall DPA resource tree for the device - * @pmem_res: Active Persistent memory capacity configuration - * @ram_res: Active Volatile memory capacity configuration + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions * @serial: PCIe Device Serial Number * @type: Generic Memory Class device or Vendor Specific Memory device * @cxl_mbox: CXL mailbox context @@ -403,8 +428,8 @@ struct cxl_dev_state { bool rcd; bool media_ready; struct resource dpa_res; - struct resource pmem_res; - struct resource ram_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; u64 serial; enum cxl_devtype type; struct cxl_mailbox cxl_mbox; @@ -413,6 +438,18 @@ struct cxl_dev_state { #endif }; +static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) +{ + /* + * Static PMEM may be at partition index 0 when there is no static RAM + * capacity. + */ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return resource_size(&cxlds->part[i].res); + return 0; +} + static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) { return dev_get_drvdata(cxl_mbox->host); @@ -435,14 +472,11 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) * @partition_align_bytes: alignment size for partition-able capacity * @active_volatile_bytes: sum of hard + soft volatile * @active_persistent_bytes: sum of hard + soft persistent - * @next_volatile_bytes: volatile capacity change pending device reset - * @next_persistent_bytes: persistent capacity change pending device reset - * @ram_perf: performance data entry matched to RAM partition - * @pmem_perf: performance data entry matched to PMEM partition * @event: event log driver state * @poison: poison driver state info * @security: security driver state info * @fw: firmware upload / activation state + * @mce_notifier: MCE notifier * * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for * details on capacity parameters. 
@@ -457,16 +491,12 @@ struct cxl_memdev_state { u64 partition_align_bytes; u64 active_volatile_bytes; u64 active_persistent_bytes; - u64 next_volatile_bytes; - u64 next_persistent_bytes; - - struct cxl_dpa_perf ram_perf; - struct cxl_dpa_perf pmem_perf; struct cxl_event_state event; struct cxl_poison_state poison; struct cxl_security_state security; struct cxl_fw_state fw; + struct notifier_block mce_notifier; }; static inline struct cxl_memdev_state * @@ -660,6 +690,23 @@ struct cxl_mbox_set_partition_info { #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) +/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */ +struct cxl_mbox_get_health_info_out { + u8 health_status; + u8 media_status; + u8 additional_status; + u8 life_used; + __le16 device_temperature; + __le32 dirty_shutdown_cnt; + __le32 corrected_volatile_error_cnt; + __le32 corrected_persistent_error_cnt; +} __packed; + +/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */ +struct cxl_mbox_set_shutdown_state_in { + u8 state; +} __packed; + /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ struct cxl_mbox_set_timestamp_in { __le64 timestamp; @@ -785,7 +832,7 @@ int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox, int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); -int cxl_mem_create_range_info(struct cxl_memdev_state *mds); +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); @@ -796,6 +843,8 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt); +int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count); +int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds); int cxl_set_timestamp(struct cxl_memdev_state *mds); int cxl_poison_state_init(struct cxl_memdev_state *mds); int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 4da07727ab9c..54e219b0049e 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -40,6 +40,12 @@ /* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ #define CXL_DVSEC_PORT_GPF 4 +#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8) +#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8) /* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ #define CXL_DVSEC_DEVICE_GPF 5 diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 2f03a4d5606e..9675243bd05b 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -152,7 +152,7 @@ static int cxl_mem_probe(struct device *dev) return -ENXIO; } - if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) { + if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) { rc = devm_cxl_add_nvdimm(parent_port, cxlmd); if (rc) { if (rc == -ENODEV) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 993fa60fe453..7b14a154463c 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -903,6 +903,7 @@ __ATTRIBUTE_GROUPS(cxl_rcd); static int cxl_pci_probe(struct pci_dev *pdev, const struct 
pci_device_id *id) { struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus); + struct cxl_dpa_info range_info = { 0 }; struct cxl_memdev_state *mds; struct cxl_dev_state *cxlds; struct cxl_register_map map; @@ -993,7 +994,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_mem_create_range_info(mds); + rc = cxl_mem_dpa_fetch(mds, &range_info); + if (rc) + return rc; + + rc = cxl_dpa_setup(cxlds, &range_info); if (rc) return rc; diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index f9c95996e937..d061fe3d2b86 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -42,15 +42,44 @@ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char * } static DEVICE_ATTR_RO(id); +static ssize_t dirty_shutdown_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + + return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns); +} +static DEVICE_ATTR_RO(dirty_shutdown); + static struct attribute *cxl_dimm_attributes[] = { &dev_attr_id.attr, &dev_attr_provider.attr, + &dev_attr_dirty_shutdown.attr, NULL }; +#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX +static umode_t cxl_dimm_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + if (a == &dev_attr_dirty_shutdown.attr) { + struct device *dev = kobj_to_dev(kobj); + struct nvdimm *nvdimm = to_nvdimm(dev); + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + + if (cxl_nvd->dirty_shutdowns == + CXL_INVALID_DIRTY_SHUTDOWN_COUNT) + return 0; + } + + return a->mode; +} + static const struct attribute_group cxl_dimm_attribute_group = { .name = "cxl", .attrs = cxl_dimm_attributes, + .is_visible = cxl_dimm_visible }; static const struct attribute_group *cxl_dimm_attribute_groups[] = { @@ -58,6 +87,38 @@ static const struct attribute_group *cxl_dimm_attribute_groups[] = { NULL }; +static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd) +{ + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct device *dev = &cxl_nvd->dev; + u32 count; + + /* + * Dirty tracking is enabled and exposed to the user, only when: + * - dirty shutdown on the device can be set, and, + * - the device has a Device GPF DVSEC (albeit unused), and, + * - the Get Health Info cmd can retrieve the device's dirty count. + */ + cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT; + + if (cxl_arm_dirty_shutdown(mds)) { + dev_warn(dev, "GPF: could not set dirty shutdown state\n"); + return; + } + + if (!cxl_gpf_get_dvsec(cxlds->dev, false)) + return; + + if (cxl_get_dirty_count(mds, &count)) { + dev_warn(dev, "GPF: could not retrieve dirty count\n"); + return; + } + + cxl_nvd->dirty_shutdowns = count; +} + static int cxl_nvdimm_probe(struct device *dev) { struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev); @@ -78,6 +139,14 @@ static int cxl_nvdimm_probe(struct device *dev) set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); + + /* + * Set dirty shutdown now, with the expectation that the device + * clear it upon a successful GPF flow. The exception to this + * is upon Viral detection, per CXL 3.2 section 12.4.2. 
+ */ + cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd); + nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, cxl_dimm_attribute_groups, flags, cmd_mask, 0, NULL, cxl_nvd->dev_id, @@ -375,6 +444,16 @@ static int cxl_pmem_region_probe(struct device *dev) goto out_nvd; } + if (cxlds->serial == 0) { + /* include missing alongside invalid in this error message. */ + dev_err(dev, "%s: invalid or missing serial number\n", + dev_name(&cxlmd->dev)); + rc = -ENXIO; + goto out_nvd; + } + info[i].serial = cxlds->serial; + info[i].offset = m->start; + m->cxl_nvd = cxl_nvd; mappings[i] = (struct nd_mapping_desc) { .nvdimm = nvdimm, @@ -382,8 +461,6 @@ static int cxl_pmem_region_probe(struct device *dev) .size = m->size, .position = i, }; - info[i].offset = m->start; - info[i].serial = cxlds->serial; } ndr_desc.num_mappings = cxlr_pmem->nr_mappings; ndr_desc.mapping = mappings; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index b360dca2c69e..bd04980009a4 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1137,10 +1137,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) unsigned long payload, buffer_end, transmit_header_bytes = 0; u32 control; int count; - struct { - struct fw_iso_packet packet; - u8 header[256]; - } u; + DEFINE_RAW_FLEX(struct fw_iso_packet, u, header, 64); if (ctx == NULL || a->handle != 0) return -EINVAL; @@ -1172,29 +1169,29 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) while (p < end) { if (get_user(control, &p->control)) return -EFAULT; - u.packet.payload_length = GET_PAYLOAD_LENGTH(control); - u.packet.interrupt = GET_INTERRUPT(control); - u.packet.skip = GET_SKIP(control); - u.packet.tag = GET_TAG(control); - u.packet.sy = GET_SY(control); - u.packet.header_length = GET_HEADER_LENGTH(control); + u->payload_length = GET_PAYLOAD_LENGTH(control); + u->interrupt = GET_INTERRUPT(control); + u->skip = GET_SKIP(control); + u->tag = GET_TAG(control); + u->sy = GET_SY(control); + u->header_length = GET_HEADER_LENGTH(control); switch (ctx->type) { case FW_ISO_CONTEXT_TRANSMIT: - if (u.packet.header_length & 3) + if (u->header_length & 3) return -EINVAL; - transmit_header_bytes = u.packet.header_length; + transmit_header_bytes = u->header_length; break; case FW_ISO_CONTEXT_RECEIVE: - if (u.packet.header_length == 0 || - u.packet.header_length % ctx->header_size != 0) + if (u->header_length == 0 || + u->header_length % ctx->header_size != 0) return -EINVAL; break; case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: - if (u.packet.payload_length == 0 || - u.packet.payload_length & 3) + if (u->payload_length == 0 || + u->payload_length & 3) return -EINVAL; break; } @@ -1204,20 +1201,19 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) if (next > end) return -EINVAL; if (copy_from_user - (u.packet.header, p->header, transmit_header_bytes)) + (u->header, p->header, transmit_header_bytes)) return -EFAULT; - if (u.packet.skip && ctx->type == FW_ISO_CONTEXT_TRANSMIT && - u.packet.header_length + u.packet.payload_length > 0) + if (u->skip && ctx->type == FW_ISO_CONTEXT_TRANSMIT && + u->header_length + u->payload_length > 0) return -EINVAL; - if (payload + u.packet.payload_length > buffer_end) + if (payload + u->payload_length > buffer_end) return -EINVAL; - if (fw_iso_context_queue(ctx, &u.packet, - &client->buffer, payload)) + if (fw_iso_context_queue(ctx, u, &client->buffer, payload)) break; p = next; - payload += u.packet.payload_length; + payload += u->payload_length; 
count++; } fw_iso_context_queue_flush(ctx); diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index b69e68ef3f02..928409199a1a 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -24,7 +24,7 @@ #include <linux/bcd.h> #include <acpi/ghes.h> #include <ras/ras_event.h> -#include "cper_cxl.h" +#include <cxl/event.h> /* * CPER record ID need to be unique even after reboot, because record @@ -624,11 +624,11 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata else goto err_section_too_small; } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { - struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); + struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); printk("%ssection_type: CXL Protocol Error\n", newpfx); if (gdata->error_data_length >= sizeof(*prot_err)) - cper_print_prot_err(newpfx, prot_err); + cxl_cper_print_prot_err(newpfx, prot_err); else goto err_section_too_small; } else { diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c index a55771b99a97..8a7667faf953 100644 --- a/drivers/firmware/efi/cper_cxl.c +++ b/drivers/firmware/efi/cper_cxl.c @@ -8,26 +8,7 @@ */ #include <linux/cper.h> -#include "cper_cxl.h" - -#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) -#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) -#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) -#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) -#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) -#define PROT_ERR_VALID_DVSEC BIT_ULL(5) -#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) - -/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ -struct cxl_ras_capability_regs { - u32 uncor_status; - u32 uncor_mask; - u32 uncor_severity; - u32 cor_status; - u32 cor_mask; - u32 cap_control; - u32 header_log[16]; -}; +#include <cxl/event.h> static const char * const prot_err_agent_type_strs[] = { "Restricted CXL Device", @@ -40,22 +21,8 @@ static const char * const prot_err_agent_type_strs[] = { "CXL Upstream Switch Port", }; -/* - * The layout of the enumeration and the values matches CXL Agent Type - * field in the UEFI 2.10 Section N.2.13, - */ -enum { - RCD, /* Restricted CXL Device */ - RCH_DP, /* Restricted CXL Host Downstream Port */ - DEVICE, /* CXL Device */ - LD, /* CXL Logical Device */ - FMLD, /* CXL Fabric Manager managed Logical Device */ - RP, /* CXL Root Port */ - DSP, /* CXL Downstream Switch Port */ - USP, /* CXL Upstream Switch Port */ -}; - -void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err) +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err) { if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE) pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type, diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h deleted file mode 100644 index 86bfcf7909ec..000000000000 --- a/drivers/firmware/efi/cper_cxl.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * UEFI Common Platform Error Record (CPER) support for CXL Section. - * - * Copyright (C) 2022 Advanced Micro Devices, Inc. 
- * - * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> - */ - -#ifndef LINUX_CPER_CXL_H -#define LINUX_CPER_CXL_H - -/* CXL Protocol Error Section */ -#define CPER_SEC_CXL_PROT_ERR \ - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ - 0x4B, 0x77, 0x10, 0x48) - -#pragma pack(1) - -/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ -struct cper_sec_prot_err { - u64 valid_bits; - u8 agent_type; - u8 reserved[7]; - - /* - * Except for RCH Downstream Port, all the remaining CXL Agent - * types are uniquely identified by the PCIe compatible SBDF number. - */ - union { - u64 rcrb_base_addr; - struct { - u8 function; - u8 device; - u8 bus; - u16 segment; - u8 reserved_1[3]; - }; - } agent_addr; - - struct { - u16 vendor_id; - u16 device_id; - u16 subsystem_vendor_id; - u16 subsystem_id; - u8 class_code[2]; - u16 slot; - u8 reserved_1[4]; - } device_id; - - struct { - u32 lower_dw; - u32 upper_dw; - } dev_serial_num; - - u8 capability[60]; - u16 dvsec_len; - u16 err_len; - u8 reserved_2[4]; -}; - -#pragma pack() - -void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err); - -#endif //__CPER_CXL_ diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0b1870a09e1f..06f809e70f15 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -267,6 +267,7 @@ config DM_CRYPT depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRC32 select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aab8240429b0..9c8ed65cd87e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static bool forget_buffer(struct dm_bufio_client *c, sector_t block) +static void forget_buffer(struct dm_bufio_client *c, sector_t block) { struct dm_buffer *b; @@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block) cache_put_and_wake(c, b); } } - - return b ? true : false; } /* diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9cb797a561d6..a10d75a562db 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -406,6 +406,12 @@ struct cache { mempool_t migration_pool; struct bio_set bs; + + /* + * Cache_size entries. Set bits indicate blocks mapped beyond the + * target length, which are marked for invalidation. 
+ */ + unsigned long *invalid_bitset; }; struct per_bio_data { @@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache) if (cache->discard_bitset) free_bitset(cache->discard_bitset); + if (cache->invalid_bitset) + free_bitset(cache->invalid_bitset); + if (cache->copier) dm_kcopyd_client_destroy(cache->copier); @@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->invalid_bitset) { + *error = "could not allocate bitset for invalid blocks"; + goto bad; + } + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { *error = "could not create kcopyd client"; @@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } +static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { + if (dirty) { + DMERR("%s: unable to shrink origin; cache block %u is dirty", + cache_device_name(cache), from_cblock(cblock)); + return -EFBIG; + } + set_bit(from_cblock(cblock), cache->invalid_bitset); + return 0; + } + + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); +} + /* * The discard block size in the on disk metadata is not * necessarily the same as we're currently using. So we have to @@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return 0; } +static int truncate_oblocks(struct cache *cache) +{ + uint32_t nr_blocks = from_cblock(cache->cache_size); + uint32_t i; + int r; + + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + return r; + } + } + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. 
*/ @@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { + /* + * The fast device could have been resized since the last + * failed preresume attempt. To be safe we start by a blank + * bitset for cache blocks. + */ + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + r = dm_cache_load_mappings(cache->cmd, cache->policy, - load_mapping, cache); + load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_load_mappings", r); + if (r != -EFBIG) + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + r = truncate_oblocks(cache); + if (r) { + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); return r; } @@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 2, 0}, + .version = {2, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 02a2919f4e5a..9dfdb63220d7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> +#include <linux/crc32.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> @@ -125,7 +126,6 @@ struct iv_lmk_private { #define TCW_WHITENING_SIZE 16 struct iv_tcw_private { - struct crypto_shash *crc32_tfm; u8 *iv_seed; u8 *whitening; }; @@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) tcw->iv_seed = NULL; kfree_sensitive(tcw->whitening); tcw->whitening = NULL; - - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) - crypto_free_shash(tcw->crc32_tfm); - tcw->crc32_tfm = NULL; } static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(tcw->crc32_tfm)) { - ti->error = "Error initializing CRC32 in TCW"; - return PTR_ERR(tcw->crc32_tfm); - } - tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); if (!tcw->iv_seed || !tcw->whitening) { @@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_whitening(struct crypt_config *cc, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); - int i, r; + int i; /* xor whitening with sector number */ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - desc->tfm = tcw->crc32_tfm; - for (i = 0; i < 4; i++) { - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); - if (r) - goto out; - } + for (i = 0; i < 4; i++) + put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]); crypto_xor(&buf[0], &buf[12], 4); crypto_xor(&buf[4], &buf[8], 4); /* apply whitening (8 bytes) to whole sector */ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); -out: 
memzero_explicit(buf, sizeof(buf)); - return r; } static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, @@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; - int r = 0; /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_local(src); } @@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, cc->iv_size - 8); - return r; + return 0; } static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, @@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) return 0; @@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - return r; + return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 08f6387620c1..d4cf0ac2a7aa 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, c, bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static int delay_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct delay_c *dc = ti->private; + struct delay_class *c = &dc->read; + + return dm_report_zones(c->dev->bdev, c->start, + c->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define delay_report_zones NULL +#endif + #define DMEMIT_DELAY_CLASS(c) \ DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) @@ -424,11 +439,12 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 4, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, .map = delay_map, + .report_zones = delay_report_zones, .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 18ae45dcbfb2..b19b0142a690 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +static void ebs_postsuspend(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + dm_bufio_client_reset(ec->bufio); +} + static void ebs_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { @@ -447,6 +453,7 @@ static struct target_type ebs_target = { .ctr = ebs_ctr, .dtr = ebs_dtr, .map = ebs_map, + .postsuspend = ebs_postsuspend, .status = ebs_status, .io_hints = ebs_io_hints, .prepare_ioctl = ebs_prepare_ioctl, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c8c1a00e7d80..8b219b1199b4 100644 --- a/drivers/md/dm-integrity.c 
+++ b/drivers/md/dm-integrity.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <linux/async_tx.h> #include <linux/dm-bufio.h> @@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp(mac, actual_mac, mac_size)) { + if (crypto_memneq(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool if (likely(wr)) memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); else { - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { dm_integrity_io_error(ic, "journal mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); } @@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned int *metadata_offset, unsigned int total_size, int op) { -#define MAY_BE_FILLER 1 -#define MAY_BE_HASH 2 unsigned int hash_offset = 0; - unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + unsigned char mismatch_hash = 0; + unsigned char mismatch_filler = !ic->discard; do { unsigned char *data, *dp; @@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - if (memcmp(dp, tag, to_copy)) { + if (crypto_memneq(dp, tag, to_copy)) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } @@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se /* e.g.: op == TAG_CMP */ if (likely(is_power_of_2(ic->tag_size))) { - if (unlikely(memcmp(dp, tag, to_copy))) - if (unlikely(!ic->discard) || - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { - goto thorough_test; - } + if (unlikely(crypto_memneq(dp, tag, to_copy))) + goto thorough_test; } else { unsigned int i, ts; thorough_test: ts = total_size; for (i = 0; i < to_copy; i++, ts--) { - if (unlikely(dp[i] != tag[i])) - may_be &= ~MAY_BE_HASH; - if (likely(dp[i] != DISCARD_FILLER)) - may_be &= ~MAY_BE_FILLER; + /* + * Warning: the control flow must not be + * dependent on match/mismatch of + * individual bytes. + */ + mismatch_hash |= dp[i] ^ tag[i]; + mismatch_filler |= dp[i] ^ DISCARD_FILLER; hash_offset++; if (unlikely(hash_offset == ic->tag_size)) { - if (unlikely(!may_be)) { + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { dm_bufio_release(b); return ts; } hash_offset = 0; - may_be = MAY_BE_HASH | (ic->discard ? 
MAY_BE_FILLER : 0); + mismatch_hash = 0; + mismatch_filler = !ic->discard; } } } @@ -1476,8 +1477,6 @@ thorough_test: } while (unlikely(total_size)); return 0; -#undef MAY_BE_FILLER -#undef MAY_BE_HASH } struct flush_request { @@ -2076,7 +2075,7 @@ retry_kmap: char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", logical_sector); dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", @@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_put(outgoing_bio); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status char *mem = bvec_kmap_local(&bv); //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(memcmp(digest, dio->integrity_payload + pos, + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { kunmap_local(mem); dm_integrity_free_payload(dio); @@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); } @@ -5072,16 +5071,19 @@ try_smaller_buffer: ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->recalc_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->may_write_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); if (!ic->bbs) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3786ac67cefe..a1b7535c508a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -467,7 +467,7 @@ static struct target_type stripe_target = { .name = "striped", .version = {1, 7, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ATOMIC_WRITES, + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 453803f1edf5..35100a435c88 100644 
--- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 89cb7942ec5c..baf683cabb1b 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, * select_lru_page() - Determine which page is least recently used. * * Picks the least recently used from among the non-busy entries at the front of each of the lru - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely * that the entries at the front are busy unless the queue is very short, but not impossible. * * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be @@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context) static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) { - return_vio_to_pool(zone->vio_pool, vio); + return_vio_to_pool(vio); check_for_drain_complete(zone); } @@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) vdo_format_block_map_page(page, nonce, pbn, false); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); @@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); struct data_vio *data_vio = completion->parent; - struct block_map_zone *zone = pooled->context; vio_record_metadata_io_error(vio); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); abort_load(data_vio, result); } @@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); + return_vio_to_pool(vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; @@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, if (result != VDO_SUCCESS) return result; - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, zone, &zone->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index a8c4d6e24b38..2a8b03779f87 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,9 +44,6 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, - /* Unit test minimum */ - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, - /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. 
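The dm-table.c hunk above rejects targets whose start + len wraps around in sector arithmetic or whose end, shifted into a byte offset, would exceed LLONG_MAX. A standalone sketch of the same overflow-safe validation, assuming userspace headers and a hypothetical helper name for illustration:

#include <limits.h>
#include <stdbool.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

/* Reject ranges that wrap in sector arithmetic or whose byte offset
 * (sectors << SECTOR_SHIFT) would overflow a signed 64-bit value. */
static bool dm_range_is_valid(uint64_t start, uint64_t len)
{
        uint64_t end = start + len;             /* both in 512-byte sectors */

        if (end < start)                        /* unsigned wraparound */
                return false;
        if (end > (uint64_t)(LLONG_MAX >> SECTOR_SHIFT))
                return false;                   /* byte offset would exceed LLONG_MAX */
        return true;
}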
This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 3f3d29af1be4..5c49d49e023c 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -226,7 +226,7 @@ struct hash_lock { * A list containing the data VIOs sharing this lock, all having the same record name and * data block contents, linked by their hash_lock_node fields. */ - struct list_head duplicate_ring; + struct list_head duplicate_vios; /* The number of data_vios sharing this lock instance */ data_vio_count_t reference_count; @@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l { memset(lock, 0, sizeof(*lock)); INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); + INIT_LIST_HEAD(&lock->duplicate_vios); vdo_waitq_init(&lock->waiters); list_add_tail(&lock->pool_node, &zone->lock_pool); } @@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, "must have a hash zone when holding a hash lock"); VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), - "must be on a hash lock ring when holding a hash lock"); + "must be on a hash lock list when holding a hash lock"); VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, "hash lock reference must be counted"); @@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if (new_lock != NULL) { /* - * Keep all data_vios sharing the lock on a ring since they can complete in any + * Keep all data_vios sharing the lock on a list since they can complete in any * order and we'll always need a pointer to one to compare data. */ - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); new_lock->reference_count += 1; if (new_lock->max_references < new_lock->reference_count) new_lock->max_references = new_lock->reference_count; @@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate struct hash_zone *zone; bool collides; - if (list_empty(&lock->duplicate_ring)) + if (list_empty(&lock->duplicate_vios)) return false; - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, hash_lock_entry); zone = candidate->hash_zone; collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); @@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio return result; result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), - "must not already be a member of a hash lock ring"); + "must not already be a member of a hash lock list"); if (result != VDO_SUCCESS) return result; @@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio) "returned hash lock must not be in use with state %s", get_hash_lock_state_name(lock->state)); VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not be in a pool list"); + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), "hash lock returned to zone must not reference DataVIOs"); return_hash_lock_to_pool(zone, lock); diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 100e92f8f866..b7cc0f41caca 100644 --- 
a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - /* Make sure test code hasn't configured slabs to be too small. */ + /* Make sure configured slabs are not too small. */ if (meta_blocks >= slab_size) return VDO_BAD_CONFIGURATION; - /* - * If the slab size is very small, assume this must be a unit test and override the number - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their - * data_blocks fields to be the exact capacity of the configured volume, and that used to - * fall out since they use a power of two for the number of data blocks, the slab size was - * a power of two, and every block in a slab was a data block. - * - * TODO: Try to figure out some way of structuring testParameters and unit tests so this - * hack isn't needed without having to edit several unit tests every time the metadata size - * changes by one block. - */ data_blocks = slab_size - meta_blocks; - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); /* * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in @@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != VDO_SUCCESS) - return result; - result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size, "slab journal size is within expected bound"); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index af8fab83b0f3..61edf2b72427 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -54,7 +54,6 @@ * Each save also has a unique nonce. */ -#define MAGIC_SIZE 32 #define NONCE_INFO_SIZE 32 #define MAX_SAVES 2 @@ -98,9 +97,11 @@ enum region_type { #define SUPER_VERSION_CURRENT 3 #define SUPER_VERSION_MAXIMUM 7 -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ +#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) + struct region_header { u64 magic; u64 region_blocks; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aee0914d604a..aa575a24e0b2 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session) int uds_launch_request(struct uds_request *request) { - size_t internal_size; int result; if (request->callback == NULL) { @@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request) } /* Reset all internal fields before processing. 
*/ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + memset(&request->internal, 0, sizeof(request->internal)); result = get_index_session(request->session); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 183a94eb7e92..7c1fc4577f5b 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/stddef.h> #include <linux/types.h> #include <linux/wait.h> @@ -73,7 +74,7 @@ enum uds_request_type { /* Remove any mapping for a name. */ UDS_DELETE, -}; +} __packed; enum uds_open_index_type { /* Create a new index. */ @@ -226,7 +227,7 @@ struct uds_zone_message { enum uds_zone_message_type type; /* The virtual chapter number to which the message applies */ u64 virtual_chapter; -}; +} __packed; struct uds_index_session; struct uds_index; @@ -253,34 +254,32 @@ struct uds_request { /* The existing data associated with the request name, if any */ struct uds_record_data old_metadata; - /* Either UDS_SUCCESS or an error code for the request */ - int status; /* True if the record name had an existing entry in the index */ bool found; + /* Either UDS_SUCCESS or an error code for the request */ + int status; - /* - * The remaining fields are used internally and should not be altered by clients. The index - * relies on zone_number being the first field in this section. - */ - - /* The number of the zone which will process this request*/ - unsigned int zone_number; - /* A link for adding a request to a lock-free queue */ - struct funnel_queue_entry queue_link; - /* A link for adding a request to a standard linked list */ - struct uds_request *next_request; - /* A pointer to the index processing this request */ - struct uds_index *index; - /* Control message for coordinating between zones */ - struct uds_zone_message zone_message; - /* If true, process request immediately by waking the worker thread */ - bool unbatched; - /* If true, continue this request before processing newer requests */ - bool requeued; - /* The virtual chapter containing the record name, if known */ - u64 virtual_chapter; - /* The region of the index containing the record name */ - enum uds_index_region location; + /* The remaining fields are used internally and should not be altered by clients. */ + struct_group(internal, + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + ); }; /* A session is required for most index operations. 
*/ diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 421e5436c32c..11d47770b54d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) * @error_handler: the handler for submission or I/O errors (may be NULL) * @operation: the type of I/O to perform * @data: the buffer to read or write (may be NULL) + * @size: the I/O amount in bytes * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. @@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) */ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data) + blk_opf_t operation, char *data, int size) { int result; struct vdo_completion *completion = &vio->completion; @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_reset_completion(completion); completion->error_handler = error_handler; - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, + physical); if (result != VDO_SUCCESS) { continue_vio(vio, result); return; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 80748699496f..3088f11055fd 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -8,6 +8,7 @@ #include <linux/bio.h> +#include "constants.h" #include "types.h" struct io_submitter; @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data); + blk_opf_t operation, char *data, int size); static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, blk_opf_t operation) { __submit_metadata_vio(vio, physical, callback, error_handler, - operation, vio->data); + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); +} + +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action_fn error_handler, + blk_opf_t operation, + int size) +{ + __submit_metadata_vio(vio, physical, callback, error_handler, + operation, vio->data, size); } static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, { /* FIXME: Can we just use REQ_OP_FLUSH? */ __submit_metadata_vio(vio, 0, callback, error_handler, - REQ_OP_WRITE | REQ_PREFLUSH, NULL); + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h index 0f3be44710b5..8c8d6892582d 100644 --- a/drivers/md/dm-vdo/packer.h +++ b/drivers/md/dm-vdo/packer.h @@ -46,7 +46,7 @@ struct compressed_block { /* * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with * enough space to hold a newly-compressed data_vio can easily be found. 
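The indexer.h change above wraps the driver-internal fields of uds_request in struct_group(internal, ...) so index-session.c can clear them with a single memset instead of offsetof() arithmetic. A minimal sketch of the pattern, using a hypothetical structure that is not part of this patch:

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_request {
        int id;                         /* caller-owned, survives a reset */
        struct_group(internal,          /* cleared as one unit below */
                u64 cursor;
                bool requeued;
                void *worker;
        );
};

static void example_reset(struct example_request *req)
{
        /*
         * Members remain addressable directly (req->cursor), but the
         * anonymous group lets the whole set be zeroed at once.
         */
        memset(&req->internal, 0, sizeof(req->internal));
}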
When the bin fills up or * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. * Upon entering the packer, each data_vio already has its compressed data in the first slot of the diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 42d3d8d0e4b5..9bae8256ba4e 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e /* * Remove the entry from the bucket list, remembering a pointer to another entry in the - * ring. + * list. */ next_entry = entry->next; list_del_init(entry); diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h index 899071173015..25e7ec6d19f6 100644 --- a/drivers/md/dm-vdo/recovery-journal.h +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -43,9 +43,9 @@ * has a vio which is used to commit that block to disk. The vio's data is the on-disk * representation of the journal block. In addition each in-memory block has a buffer which is used * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is - * moved back to the 'free_tail_blocks' ring. + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' list. * * When entries are added to the journal, they are added to the active in-memory block, as * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 8f0a35c63af6..f3d80ff7bef5 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab) } /** - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal. @@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion) { struct slab_journal *journal = completion->parent; - return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); finish_reaping(journal); reap_slab_journal(journal); } @@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); + return_vio_to_pool(pooled); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal) /* * Since we are about to commit the tail block, this journal no longer needs to be on the - * ring of journals which the recovery journal might ask to commit. + * list of journals which the recovery journal might ask to commit. 
*/ mark_slab_journal_clean(journal); @@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion) /* Release the slab journal lock. */ adjust_slab_journal_block_reference(&slab->journal, block->slab_journal_lock_to_release, -1); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); /* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause @@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion) struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; vio_record_metadata_io_error(vio); - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); - slab->active_count--; + return_vio_to_pool(vio_as_pooled_vio(vio)); + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); } @@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) static void prioritize_slab(struct vdo_slab *slab) { VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + "a slab must not already be on a list when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab) dirty_block(&slab->reference_blocks[i]); } +static inline bool journal_points_equal(struct journal_point first, + struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + /** - * clear_provisional_references() - Clear the provisional reference counts from a reference block. - * @block: The block to clear. + * match_bytes() - Check an 8-byte word for bytes matching the value specified + * @input: A word to examine the bytes of + * @match: The byte value sought + * + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise */ -static void clear_provisional_references(struct reference_block *block) +static inline u64 match_bytes(u64 input, u8 match) { - vdo_refcount_t *counters = get_reference_counters_for_block(block); - block_count_t j; + u64 temp = input ^ (match * 0x0101010101010101ULL); + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ + u64 test_top_bits = ~temp & 0x8080808080808080ULL; + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); + /* return 1 when both tests indicate temp byte is 0 */ + return (test_top_bits & test_low_bits) >> 7; +} + +/** + * count_valid_references() - Process a newly loaded refcount array + * @counters: the array of counters from a metadata block + * + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't + * cleaned up at shutdown, changing them internally to "empty". + * + * Return: the number of blocks that are referenced (counters not "empty") + */ +static unsigned int count_valid_references(vdo_refcount_t *counters) +{ + u64 *words = (u64 *)counters; + /* It's easier to count occurrences of a specific byte than its absences. */ + unsigned int empty_count = 0; + /* For speed, we process 8 bytes at once. 
*/ + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); + + /* + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter + * array is a multiple of the word size. + */ + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); + + while (words_left > 0) { + /* + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words + * had the target value found in byte 0, etc. We just have to avoid overflow. + */ + u64 split_count = 0; + /* + * The counter "% 255" trick used below to fold split_count into empty_count + * imposes a limit of 254 bytes examined each iteration of the outer loop. We + * process a word at a time, so that limit gets rounded down to 31 u64 words. + */ + const unsigned int max_words_per_iteration = 254 / sizeof(u64); + unsigned int iter_words_left = min_t(unsigned int, words_left, + max_words_per_iteration); + + words_left -= iter_words_left; + + while (iter_words_left--) { + u64 word = *words; + u64 temp; + + /* First, if we have any provisional refcount values, clear them. */ + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); + if (temp) { + /* + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor + * will alter just those bytes, changing PROVISIONAL to EMPTY. + */ + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); + *words = word; + } - for (j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocated_count--; + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); + words++; } + empty_count += split_count % 255; } -} -static inline bool journal_points_equal(struct journal_point first, - struct journal_point second) -{ - return ((first.sequence_number == second.sequence_number) && - (first.entry_count == second.entry_count)); + return COUNTS_PER_BLOCK - empty_count; } /** @@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first, static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { - block_count_t index; sector_count_t i; struct vdo_slab *slab = block->slab; vdo_refcount_t *counters = get_reference_counters_for_block(block); @@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, } } - block->allocated_count = 0; - for (index = 0; index < COUNTS_PER_BLOCK; index++) { - if (counters[index] != EMPTY_REFERENCE_COUNT) - block->allocated_count++; - } + block->allocated_count = count_valid_references(counters); } /** @@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab; + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; + unsigned int i; + char *data = vio->data; - unpack_reference_block((struct packed_reference_block *) vio->data, block); - return_vio_to_pool(slab->allocator->vio_pool, pooled); - slab->active_count--; - clear_provisional_references(block); + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { + struct packed_reference_block *packed = (struct packed_reference_block *) data; + + unpack_reference_block(packed, block); + slab->free_blocks -= block->allocated_count; + } + return_vio_to_pool(pooled); + slab->active_count -= 
block_count; - slab->free_blocks -= block->allocated_count; check_if_slab_drained(slab); } @@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio) } /** - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the - * block. - * @waiter: The waiter of the block to load. + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load + * a set of blocks. + * @waiter: The waiter of the first block to load. * @context: The VIO returned by the pool. */ -static void load_reference_block(struct vdo_waiter *waiter, void *context) +static void load_reference_block_group(struct vdo_waiter *waiter, void *context) { struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block = container_of(waiter, struct reference_block, waiter); - size_t block_offset = (block - block->slab->reference_blocks); + u32 block_offset = block - block->slab->reference_blocks; + u32 max_block_count = block->slab->reference_block_count - block_offset; + u32 block_count = min_t(int, vio->block_count, max_block_count); vio->completion.parent = block; - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, - load_reference_block_endio, handle_io_error, - REQ_OP_READ); + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, handle_io_error, + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); } /** @@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context) static void load_reference_blocks(struct vdo_slab *slab) { block_count_t i; + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; + + if (!pool) { + pool = slab->allocator->vio_pool; + blocks_per_vio = 1; + } slab->free_blocks = slab->block_count; slab->active_count = slab->reference_block_count; - for (i = 0; i < slab->reference_block_count; i++) { + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; - waiter->callback = load_reference_block; - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + waiter->callback = load_reference_block_group; + acquire_vio_from_pool(pool, waiter); } } @@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion) initialize_journal_state(journal); } - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); } @@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); vio_record_metadata_io_error(vio); - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&journal->slab->state, result); } @@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab) int result; VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a requeued slab must not already be on a ring"); + "a requeued slab must not already be on a list"); if (vdo_is_read_only(allocator->depot->vdo)) return; @@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) vdo_log_info("Exiting recovery mode"); + 
free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); } /* @@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator, * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as * the primary key and the 'emptiness' field as the secondary key. * - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping * should always get the most empty first, so pushing should be from most empty to least empty. * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. @@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, struct vdo *vdo = depot->vdo; block_count_t max_free_blocks = depot->slab_config.data_blocks; unsigned int max_priority = (2 + ilog2(max_free_blocks)); + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; *allocator = (struct block_allocator) { .depot = depot, @@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, return result; vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, allocator, &allocator->vio_pool); if (result != VDO_SUCCESS) return result; + /* Initialize the refcount-reading vio pool. */ + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, + allocator->refcount_blocks_per_big_vio, allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + NULL, &allocator->refcount_big_vio_pool); + if (result != VDO_SUCCESS) + return result; + result = initialize_slab_scrubber(allocator); if (result != VDO_SUCCESS) return result; @@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot) uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); free_vio_pool(vdo_forget(allocator->vio_pool)); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index f234853501ca..fadc0c9d4dc4 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -45,6 +45,13 @@ enum { /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, + + /* + * The number of vios in the vio pool used for loading reference count data. A slab's + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be + * plenty. 
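The two DIV_ROUND_UP() calls in initialize_block_allocator() above first compute how many reads are needed to cover a slab's reference blocks without exceeding the per-vio limit, then spread the blocks evenly across those reads. A worked sketch of that arithmetic follows; the 256-block cap (standing in for MAX_BLOCKS_PER_VIO) and the 2048-block figure (~8MB of refcounts at 4KB per block, per the comment just above) are assumptions for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Assumed cap on blocks per vio; the real value is MAX_BLOCKS_PER_VIO. */
	unsigned int max_blocks_per_vio = 256;
	unsigned int counts[] = { 2048, 1000, 100 };
	unsigned int i;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		unsigned int reads = DIV_ROUND_UP(counts[i], max_blocks_per_vio);
		unsigned int per_vio = DIV_ROUND_UP(counts[i], reads);

		/* e.g. 2048 -> 8 reads of 256 blocks; 1000 -> 4 reads of up to 250. */
		printf("%u reference blocks: %u reads, up to %u blocks each\n",
		       counts[i], reads, per_vio);
	}
	return 0;
}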
+ */ + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, }; /* @@ -248,7 +255,7 @@ struct vdo_slab { /* A list of the dirty blocks waiting to be written out */ struct vdo_wait_queue dirty_blocks; - /* The number of blocks which are currently writing */ + /* The number of blocks which are currently reading or writing */ size_t active_count; /* A waiter object for updating the slab summary */ @@ -425,6 +432,10 @@ struct block_allocator { /* The vio pool for reading and writing block allocator metadata */ struct vio_pool *vio_pool; + /* The vio pool for large initial reads of ref count areas */ + struct vio_pool *refcount_big_vio_pool; + /* How many ref count blocks are read per vio at initial load */ + u32 refcount_blocks_per_big_vio; /* The dm_kcopyd client for erasing slab journals */ struct dm_kcopyd_client *eraser; /* Iterator over the slabs to be erased */ diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index dbe892b10f26..cdf36e7d7702 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -376,6 +376,9 @@ struct vio { /* The size of this vio in blocks */ unsigned int block_count; + /* The amount of data to be read or written, in bytes */ + unsigned int io_size; + /* The data being read or written. */ char *data; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index a7e32baab4af..80b608674022 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -31,9 +31,7 @@ #include <linux/completion.h> #include <linux/device-mapper.h> -#include <linux/kernel.h> #include <linux/lz4.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/types.h> @@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr) vdo_unregister_allocating_thread(); } -#ifdef MODULE -#define MODULE_NAME THIS_MODULE->name -#else -#define MODULE_NAME "dm-vdo" -#endif /* MODULE */ - static const struct vdo_work_queue_type default_queue_type = { .start = start_vdo_request_queue, .finish = finish_vdo_request_queue, @@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *vdo_ptr = vdo; snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), - "%s%u", MODULE_NAME, instance); - BUG_ON(vdo->thread_name_prefix[0] == '\0'); + "vdo%u", instance); result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e710f3c5a972..e7f4153e55e3 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb /* * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a - * vio associated with the bio. + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not + * have to be a vio associated with the bio. 
*/ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn) { - int bvec_count, offset, len, i; + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, + callback, bi_opf, pbn); +} + +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn) +{ + int bvec_count, offset, i; struct bio *bio = vio->bio; + int vio_size = vio->block_count * VDO_BLOCK_SIZE; + int remaining; bio_reset(bio, bio->bi_bdev, bi_opf); vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); @@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, bio->bi_ioprio = 0; bio->bi_io_vec = bio->bi_inline_vecs; bio->bi_max_vecs = vio->block_count + 1; - len = VDO_BLOCK_SIZE * vio->block_count; + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", + size, vio_size) != VDO_SUCCESS) + size = vio_size; + vio->io_size = size; offset = offset_in_page(data); - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); + remaining = size; - /* - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the - * loop. But if we're using vmalloc, it's not impossible that the data is in different - * pages that can't be merged in bio_add_page... - */ - for (i = 0; (i < bvec_count) && (len > 0); i++) { + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { struct page *page; int bytes_added; int bytes = PAGE_SIZE - offset; - if (bytes > len) - bytes = len; + if (bytes > remaining) + bytes = remaining; page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); @@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, } data += bytes; - len -= bytes; + remaining -= bytes; offset = 0; } @@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio) * make_vio_pool() - Create a new vio pool. * @vdo: The vdo. * @pool_size: The number of vios in the pool. + * @block_count: The number of 4k blocks per vio. * @thread_id: The ID of the thread using this pool. * @vio_type: The type of vios in the pool. * @priority: The priority with which vios from the pool should be enqueued. @@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio) * * Return: A success or error code. 
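vio_reset_bio_with_size() above lets a multi-block vio issue an I/O smaller than its allocated size, and still walks the buffer one page at a time because a vmalloc'ed buffer need not be physically contiguous. A small user-space sketch of that sizing and splitting logic, with page lookup and bio_add_page() replaced by a printout; the block counts are illustrative assumptions.

#include <stdio.h>

#define VDO_BLOCK_SIZE 4096
#define PAGE_SIZE 4096
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int block_count = 8;            /* vio allocated for 8 blocks */
	int vio_size = block_count * VDO_BLOCK_SIZE;
	int size = 5 * VDO_BLOCK_SIZE;           /* but this read only needs 5 */
	int offset = 0;                          /* buffer is 4k aligned */
	int bvec_count, remaining, i;

	if (size > vio_size)                     /* mirrors the VDO_ASSERT fallback */
		size = vio_size;

	bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
	remaining = size;
	for (i = 0; i < bvec_count && remaining > 0; i++) {
		int bytes = PAGE_SIZE - offset;

		if (bytes > remaining)
			bytes = remaining;
		printf("bvec %d: %d bytes\n", i, bytes);
		remaining -= bytes;
		offset = 0;
	}
	return 0;
}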
*/ -int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, +int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, enum vio_type vio_type, enum vio_priority priority, void *context, struct vio_pool **pool_ptr) { struct vio_pool *pool; char *ptr; int result; + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); @@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * per_vio_size, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } ptr = pool->buffer; - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { struct pooled_vio *pooled = &pool->vios[pool->size]; - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, &pooled->vio); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } pooled->context = context; + pooled->pool = pool; list_add_tail(&pooled->pool_entry, &pool->available); } @@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) } /** - * return_vio_to_pool() - Return a vio to the pool - * @pool: The vio pool. + * return_vio_to_pool() - Return a vio to its pool * @vio: The pooled vio to return. */ -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +void return_vio_to_pool(struct pooled_vio *vio) { + struct vio_pool *pool = vio->pool; + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), "vio pool entry returned on same thread as it was acquired"); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 3490e9f59b04..4bfcb21901f1 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -30,6 +30,8 @@ struct pooled_vio { void *context; /* The list entry used by the pool */ struct list_head pool_entry; + /* The pool this vio is allocated from */ + struct vio_pool *pool; }; /** @@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn); +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn); void update_vio_error_stats(struct vio *vio, const char *format, ...) 
__printf(2, 3); @@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) struct vio_pool; -int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, - enum vio_type vio_type, enum vio_priority priority, - void *context, struct vio_pool **pool_ptr); +int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, + thread_id_t thread_id, enum vio_type vio_type, + enum vio_priority priority, void *context, + struct vio_pool **pool_ptr); void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); +void return_vio_to_pool(struct pooled_vio *vio); #endif /* VIO_H */ diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c index 6e1e739277ef..f81ed0cee2bf 100644 --- a/drivers/md/dm-vdo/wait-queue.c +++ b/drivers/md/dm-vdo/wait-queue.c @@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w waitq->last_waiter->next_waiter = waiter; } - /* In both cases, the waiter we added to the ring becomes the last waiter. */ + /* In both cases, the waiter we added to the list becomes the last waiter. */ waitq->last_waiter = waiter; waitq->length += 1; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e86c1431b108..3c427f18a04b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 +#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 @@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); +static unsigned int dm_verity_use_bh_bytes[4] = { + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE + 0 // IOPRIO_CLASS_IDLE +}; + +module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); + static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); /* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ @@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { data = dm_bufio_get(v->bufio, hash_block, &buf); - if (data == NULL) { + if (IS_ERR_OR_NULL(data)) { /* * In tasklet and the hash was not in the bufio cache. 
* Return early and resume execution from a work-queue @@ -324,8 +334,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, &buf, bio->bi_ioprio); } - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + if (skip_unverified) + return 1; + r = PTR_ERR(data); + data = dm_bufio_new(v->bufio, hash_block, &buf); + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; + } else { + dm_bufio_release(buf); + dm_bufio_forget(v->bufio, hash_block); + return r; + } + } aux = dm_bufio_get_aux_data(buf); @@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } +release_ok: data += offset; memcpy(want_digest, data, v->digest_size); r = 0; @@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(err)); } +static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) +{ + return ioprio <= IOPRIO_CLASS_IDLE && + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; if (bio->bi_status && (!verity_fec_is_enabled(io->v) || @@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && + verity_use_bh(bytes, ioprio)) { + if (in_hardirq() || irqs_disabled()) { + INIT_WORK(&io->bh_work, verity_bh_work); + queue_work(system_bh_wq, &io->bh_work); + } else { + verity_bh_work(&io->bh_work); + } } else { INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -796,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static void verity_postsuspend(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + flush_workqueue(v->verify_wq); + dm_bufio_client_reset(v->bufio); +} + /* * Status: V (valid) or C (corruption found) */ @@ -1761,11 +1808,12 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, .map = verity_map, + .postsuspend = verity_postsuspend, .status = verity_status, .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4d1e42891d24..5ab7574c0c76 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). 
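The verity_use_bh() policy added above compares the request size against a per-ioprio-class threshold (defaults of 8KB for the NONE, RT and BE classes, 0 for IDLE). A stand-alone sketch of that decision, assuming the usual kernel ioprio class values (NONE=0, RT=1, BE=2, IDLE=3); the harness is illustrative.

#include <stdio.h>

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

/* Default thresholds from dm_verity_use_bh_bytes above. */
static const unsigned int use_bh_bytes[4] = { 8192, 8192, 8192, 0 };

static int verity_use_bh(unsigned int bytes, unsigned short ioprio)
{
	return ioprio <= IOPRIO_CLASS_IDLE && bytes <= use_bh_bytes[ioprio];
}

int main(void)
{
	printf("4KB  BE:   %d\n", verity_use_bh(4096, IOPRIO_CLASS_BE));   /* 1: low-latency path */
	printf("64KB BE:   %d\n", verity_use_bh(65536, IOPRIO_CLASS_BE));  /* 0: work queue */
	printf("4KB  IDLE: %d\n", verity_use_bh(4096, IOPRIO_CLASS_IDLE)); /* 0: never for idle I/O */
	return 0;
}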
*/ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 9e84ab411564..51614651d2e7 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -56,17 +56,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, return true; } -bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, - struct nd_namespace_common **_ndns) -{ - bool claimed; - - nvdimm_bus_lock(&attach->dev); - claimed = __nd_attach_ndns(dev, attach, _ndns); - nvdimm_bus_unlock(&attach->dev); - return claimed; -} - static bool is_idle(struct device *dev, struct nd_namespace_common *ndns) { struct nd_region *nd_region = to_nd_region(dev->parent); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 082253a3a956..04f4a049599a 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -442,7 +442,8 @@ int nd_label_data_init(struct nvdimm_drvdata *ndd) if (ndd->data) return 0; - if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) { + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0 || + ndd->nsarea.config_size == 0) { dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n", ndd->nsarea.max_xfer, ndd->nsarea.config_size); return -ENXIO; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 86976a9e8a15..bfc6bfeb6e24 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -127,8 +127,6 @@ resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); -int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, - resource_size_t size); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); @@ -136,8 +134,6 @@ void get_ndd(struct nvdimm_drvdata *ndd); resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); -bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, - struct nd_namespace_common **_ndns); bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, struct nd_namespace_common **_ndns); ssize_t nd_namespace_store(struct device *dev, diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 37417ce5ec7b..de1ee5ebc851 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1229,45 +1229,4 @@ bool is_nvdimm_sync(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(is_nvdimm_sync); -struct conflict_context { - struct nd_region *nd_region; - resource_size_t start, size; -}; - -static int region_conflict(struct device *dev, void *data) -{ - struct nd_region *nd_region; - struct conflict_context *ctx = data; - resource_size_t res_end, region_end, region_start; - - if (!is_memory(dev)) - return 0; - - nd_region = to_nd_region(dev); - if (nd_region == ctx->nd_region) - return 0; - - res_end = ctx->start + ctx->size; - region_start = nd_region->ndr_start; - region_end = region_start + nd_region->ndr_size; - if (ctx->start >= region_start && ctx->start < 
region_end) - return -EBUSY; - if (res_end > region_start && res_end <= region_end) - return -EBUSY; - return 0; -} - -int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, - resource_size_t size) -{ - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - struct conflict_context ctx = { - .nd_region = nd_region, - .start = start, - .size = size, - }; - - return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict); -} - MODULE_IMPORT_NS("DEVMEM"); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 97c4d71115d8..d80f94346199 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * If it's already released don't get it. This avoids to loop - * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_lock. - */ - if (unlikely(READ_ONCE(ctx->released))) { - /* - * Don't return VM_FAULT_SIGBUS in this case, so a non - * cooperative manager can close the uffd after the - * last UFFDIO_COPY, without risking to trigger an - * involuntary SIGBUS if the process was starting the - * userfaultfd while the userfaultfd was still armed - * (but after the last UFFDIO_COPY). If the uffd - * wasn't already closed when the userfault reached - * this point, that would normally be solved by - * userfaultfd_must_wait returning 'false'. - * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd. - */ - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. + * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. 
+ */ + release_fault_lock(vmf); + goto out; + } + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index d1adfba8387e..88a42973fa47 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -227,10 +227,10 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { - struct page *page = (struct page *)table; + struct ptdesc *ptdesc = (struct ptdesc *)table; - pagetable_dtor(page_ptdesc(page)); - tlb_remove_page(tlb, page); + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ @@ -493,17 +493,11 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } -/* Like tlb_remove_ptdesc, but for page-like page directories. */ -static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) -{ - tlb_remove_page(tlb, ptdesc_page(pt)); -} - static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { diff --git a/include/cxl/event.h b/include/cxl/event.h index 04edd44bd26f..f9ae1796da85 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -164,10 +164,99 @@ struct cxl_cper_work_data { struct cxl_cper_event_rec rec; }; +#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) +#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) +#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) +#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) +#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) +#define PROT_ERR_VALID_DVSEC BIT_ULL(5) +#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) + +/* + * The layout of the enumeration and the values matches CXL Agent Type + * field in the UEFI 2.10 Section N.2.13, + */ +enum { + RCD, /* Restricted CXL Device */ + RCH_DP, /* Restricted CXL Host Downstream Port */ + DEVICE, /* CXL Device */ + LD, /* CXL Logical Device */ + FMLD, /* CXL Fabric Manager managed Logical Device */ + RP, /* CXL Root Port */ + DSP, /* CXL Downstream Switch Port */ + USP, /* CXL Upstream Switch Port */ +}; + +#pragma pack(1) + +/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ +struct cxl_cper_sec_prot_err { + u64 valid_bits; + u8 agent_type; + u8 reserved[7]; + + /* + * Except for RCH Downstream Port, all the remaining CXL Agent + * types are uniquely identified by the PCIe compatible SBDF number. 
+ */ + union { + u64 rcrb_base_addr; + struct { + u8 function; + u8 device; + u8 bus; + u16 segment; + u8 reserved_1[3]; + }; + } agent_addr; + + struct { + u16 vendor_id; + u16 device_id; + u16 subsystem_vendor_id; + u16 subsystem_id; + u8 class_code[2]; + u16 slot; + u8 reserved_1[4]; + } device_id; + + struct { + u32 lower_dw; + u32 upper_dw; + } dev_serial_num; + + u8 capability[60]; + u16 dvsec_len; + u16 err_len; + u8 reserved_2[4]; +}; + +#pragma pack() + +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ +struct cxl_ras_capability_regs { + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + u32 header_log[16]; +}; + +struct cxl_cper_prot_err_work_data { + struct cxl_cper_sec_prot_err prot_err; + struct cxl_ras_capability_regs ras_cap; + int severity; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd); +int cxl_cper_register_prot_err_work(struct work_struct *work); +int cxl_cper_unregister_prot_err_work(struct work_struct *work); +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd); #else static inline int cxl_cper_register_work(struct work_struct *work) { @@ -182,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) { return 0; } +static inline int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return 0; +} #endif #endif /* _LINUX_CXL_EVENT_H */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index a70e62d69dc7..3f2e93ed9730 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1094,6 +1094,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HMAT +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *size); +#else +static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return -EOPNOTSUPP; +} +#endif + extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC diff --git a/include/linux/cper.h b/include/linux/cper.h index 265b0f8fc0b3..0ed60a91eca9 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -89,6 +89,10 @@ enum { #define CPER_NOTIFY_DMAR \ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Protocol Error Section */ +#define CPER_SEC_CXL_PROT_ERR \ + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ + 0x4B, 0x77, 0x10, 0x48) /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ /* @@ -601,4 +605,8 @@ void cper_estatus_print(const char *pfx, int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus); int cper_estatus_check(const struct acpi_hest_generic_status *estatus); +struct cxl_cper_sec_prot_err; +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index d66bc0e97632..b7f13f087954 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4238,4 +4238,14 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int 
arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + +/* + * mseal of userspace process's system mappings. + */ +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED +#else +#define VM_SEALED_SYSMAP VM_NONE +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/node.h b/include/linux/node.h index 9a881c2208b3..2b7517892230 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -57,6 +57,11 @@ enum cache_write_policy { NODE_CACHE_WRITE_OTHER, }; +enum cache_mode { + NODE_CACHE_ADDR_MODE_RESERVED, + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR, +}; + /** * struct node_cache_attrs - system memory caching attributes * @@ -65,6 +70,7 @@ enum cache_write_policy { * @size: Total size of cache in bytes * @line_size: Number of bytes fetched on a cache miss * @level: The cache hierarchy level + * @address_mode: The address mode */ struct node_cache_attrs { enum cache_indexing indexing; @@ -72,6 +78,7 @@ struct node_cache_attrs { u64 size; u16 line_size; u8 level; + u16 address_mode; }; #ifdef CONFIG_HMEM_REPORTING diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5bd9492a66ee..e6a21b62dcce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -226,11 +226,48 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page } return page; } + +static __always_inline bool page_count_writable(const struct page *page, int u) +{ + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) + return true; + + /* + * The refcount check is ordered before the fake-head check to prevent + * the following race: + * CPU 1 (HVO) CPU 2 (speculative PFN walker) + * + * page_ref_freeze() + * synchronize_rcu() + * rcu_read_lock() + * page_is_fake_head() is false + * vmemmap_remap_pte() + * XXX: struct page[] becomes r/o + * + * page_ref_unfreeze() + * page_ref_count() is not zero + * + * atomic_add_unless(&page->_refcount) + * XXX: try to modify r/o struct page[] + * + * The refcount check also prevents modification attempts to other (r/o) + * tail pages that are not fake heads. 
+ */ + if (atomic_read_acquire(&page->_refcount) == u) + return false; + + return page_fixed_fake_head(page) == page; +} #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool page_count_writable(const struct page *page, int u) +{ + return true; +} #endif static __always_inline int page_is_fake_head(const struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8c236c651d1d..544150d1d5fd 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -234,7 +234,7 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u) rcu_read_lock(); /* avoid writing to the vmemmap area being remapped */ - if (!page_is_fake_head(page) && page_ref_count(page) != u) + if (page_count_writable(page, u)) ret = atomic_add_unless(&page->_refcount, nr, u); rcu_read_unlock(); diff --git a/include/linux/sort.h b/include/linux/sort.h index e163287ac6c1..8e5603b10941 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func); +/* Versions that periodically call cond_resched(): */ + +void sort_r_nonatomic(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv); + +void sort_nonatomic(void *base, size_t num, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func); + #endif diff --git a/init/Kconfig b/init/Kconfig index 681f38ee68db..18717967fc8c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1888,6 +1888,28 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS + bool + help + Control MSEAL_SYSTEM_MAPPINGS access based on architecture. + + A 64-bit kernel is required for the memory sealing feature. + No specific hardware features from the CPU are needed. + + To enable this feature, the architecture needs to update their + special mappings calls to include the sealing flag and confirm + that it doesn't unmap/remap system mappings during the life + time of the process. The existence of this flag for an architecture + implies that it does not require the remapping of the system + mappings during process lifetime, so sealing these mappings is safe + from a kernel perspective. + + After the architecture enables this, a distribution can set + CONFIG_MSEAL_SYSTEM_MAPPING to manage access to the feature. 
+ + For complete descriptions of memory sealing, please see + Documentation/userspace-api/mseal.rst + config HAVE_PERF_EVENTS bool help diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2746791ce1e2..615b4e6d22c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1703,7 +1703,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 21575d39c376..66bcd40a28ca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4171,8 +4171,8 @@ static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) init_dsq(dsq, dsq_id); - ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); + ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); if (ret) { kfree(dsq); return ERR_PTR(ret); @@ -5361,6 +5361,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); + scx_idle_enable(ops); + if (scx_ops.init) { ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { @@ -5427,8 +5429,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - scx_idle_enable(ops); - /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 52c36a70a3d0..cb343ca889e0 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -544,7 +544,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * core. */ if (flags & SCX_PICK_IDLE_CORE) { - cpu = prev_cpu; + cpu = -EBUSY; goto out_unlock; } } @@ -584,8 +584,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * increasing distance. 
*/ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); - if (cpu >= 0) - goto out_unlock; out_unlock: rcu_read_unlock(); @@ -723,14 +721,14 @@ static void reset_idle_masks(struct sched_ext_ops *ops) void scx_idle_enable(struct sched_ext_ops *ops) { if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) - static_branch_enable(&scx_builtin_idle_enabled); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); else - static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) - static_branch_enable(&scx_builtin_idle_per_node); + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); else - static_branch_disable(&scx_builtin_idle_per_node); + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); #ifdef CONFIG_SMP reset_idle_masks(ops); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 033fba0633cf..a3f35c7d83b6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -265,8 +265,7 @@ config FUNCTION_GRAPH_RETADDR config FUNCTION_TRACE_ARGS bool - depends on HAVE_FUNCTION_ARG_ACCESS_API - depends on DEBUG_INFO_BTF + depends on PROBE_EVENTS_BTF_ARGS default y help If supported with function argument access API and BTF, then diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92015de6203d..1a48aedb5255 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6855,6 +6855,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? -EINVAL : 0; diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 50344aa9f7f9..968c5c3b0246 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -809,7 +809,8 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) if (p && rv_is_nested_monitor(p)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); - return -EINVAL; + retval = -EINVAL; + goto out_unlock; } r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc957a2507e2..d058b519d502 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9604,6 +9604,7 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8638b7f7ff85..069e92856bda 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { diff --git a/lib/sg_split.c b/lib/sg_split.c index 60a0babebf2e..0f89aab5c671 100644 --- a/lib/sg_split.c +++ b/lib/sg_split.c @@ -88,8 +88,6 @@ static void sg_split_phys(struct sg_splitter *splitters, const int nb_splits) if (!j) { out_sg->offset += split->skip_sg0; out_sg->length -= split->skip_sg0; - } else { - out_sg->offset = 0; } sg_dma_address(out_sg) = 0; sg_dma_len(out_sg) = 0; diff --git a/lib/sort.c b/lib/sort.c index 
8e73dc55476b..52363995ccc5 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -186,36 +186,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) return i / 2; } -/** - * sort_r - sort an array of elements - * @base: pointer to data to sort - * @num: number of elements - * @size: size of each element - * @cmp_func: pointer to comparison function - * @swap_func: pointer to swap function or NULL - * @priv: third argument passed to comparison function - * - * This function does a heapsort on the given array. You may provide - * a swap_func function if you need to do something more than a memory - * copy (e.g. fix up pointers or auxiliary data), but the built-in swap - * avoids a slow retpoline and so is significantly faster. - * - * The comparison function must adhere to specific mathematical - * properties to ensure correct and stable sorting: - * - Antisymmetry: cmp_func(a, b) must return the opposite sign of - * cmp_func(b, a). - * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then - * cmp_func(a, c) <= 0. - * - * Sorting time is O(n log n) both on average and worst-case. While - * quicksort is slightly faster on average, it suffers from exploitable - * O(n*n) worst-case behavior and extra memory requirements that make - * it less suitable for kernel use. - */ -void sort_r(void *base, size_t num, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +#include <linux/sched.h> + +static void __sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv, + bool may_schedule) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; @@ -286,6 +263,9 @@ void sort_r(void *base, size_t num, size_t size, b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } + + if (may_schedule) + cond_resched(); } n -= size; @@ -293,8 +273,63 @@ void sort_r(void *base, size_t num, size_t size, if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } + +/** + * sort_r - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * @priv: third argument passed to comparison function + * + * This function does a heapsort on the given array. You may provide + * a swap_func function if you need to do something more than a memory + * copy (e.g. fix up pointers or auxiliary data), but the built-in swap + * avoids a slow retpoline and so is significantly faster. + * + * The comparison function must adhere to specific mathematical + * properties to ensure correct and stable sorting: + * - Antisymmetry: cmp_func(a, b) must return the opposite sign of + * cmp_func(b, a). + * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then + * cmp_func(a, c) <= 0. + * + * Sorting time is O(n log n) both on average and worst-case. While + * quicksort is slightly faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. 
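The nonatomic variants declared in the include/linux/sort.h hunk earlier reuse this heapsort but call cond_resched() between sift-down passes, so they are only safe in contexts that may sleep. An illustrative caller (not part of the patch) sorting a large table from process context:

#include <linux/sort.h>
#include <linux/types.h>

static int cmp_u32(const void *a, const void *b)
{
	u32 x = *(const u32 *)a, y = *(const u32 *)b;

	return x < y ? -1 : (x > y ? 1 : 0);
}

/* May call cond_resched(), so only valid where sleeping is allowed. */
static void sort_large_table(u32 *table, size_t count)
{
	sort_nonatomic(table, count, sizeof(*table), cmp_u32, NULL);
}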
+ */ +void sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + __sort_r(base, num, size, cmp_func, swap_func, priv, false); +} EXPORT_SYMBOL(sort_r); +/** + * sort_r_nonatomic - sort an array of elements, with cond_resched + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * @priv: third argument passed to comparison function + * + * Same as sort_r, but preferred for larger arrays as it does a periodic + * cond_resched(). + */ +void sort_r_nonatomic(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + __sort_r(base, num, size, cmp_func, swap_func, priv, true); +} +EXPORT_SYMBOL(sort_r_nonatomic); + void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) @@ -304,6 +339,19 @@ void sort(void *base, size_t num, size_t size, .swap = swap_func, }; - return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); + return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false); } EXPORT_SYMBOL(sort); + +void sort_nonatomic(void *base, size_t num, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true); +} +EXPORT_SYMBOL(sort_nonatomic); diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index c715e217ec65..3693c6caf2c4 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -99,7 +99,8 @@ const struct vm_special_mapping vdso_vvar_mapping = { struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr) { return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, - VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, + VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | + VM_PFNMAP | VM_SEALED_SYSMAP, &vdso_vvar_mapping); } diff --git a/mm/damon/core.c b/mm/damon/core.c index fc1eba3da419..f0c1676f0599 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -76,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops) if (ops->id >= NR_DAMON_OPS) return -EINVAL; + mutex_lock(&damon_ops_lock); /* Fail for already registered ops */ - if (__damon_is_registered_ops(ops->id)) { + if (__damon_is_registered_ops(ops->id)) err = -EINVAL; - goto out; - } - damon_registered_ops[ops->id] = *ops; -out: + else + damon_registered_ops[ops->id] = *ops; mutex_unlock(&damon_ops_lock); return err; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fccfe6d046c..39f92aad7bd1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5179,7 +5179,7 @@ static const struct ctl_table hugetlb_table[] = { }, }; -static void hugetlb_sysctl_init(void) +static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 59d673400085..3ea317837c2d 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test) kmem_cache_destroy(cache); } -static void empty_cache_ctor(void *object) { } - static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - /* Provide a constructor to prevent cache merging. 
*/ - cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); diff --git a/mm/memblock.c b/mm/memblock.c index 284154445409..0a53db4d9f7b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2167,6 +2167,9 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long start_pfn = PFN_UP(start); unsigned long end_pfn = PFN_DOWN(end); + if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + if (start_pfn >= end_pfn) return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75401866fb76..8305483de38b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1813,21 +1813,15 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page = pfn_to_page(pfn); folio = page_folio(page); - /* - * No reference or lock is held on the folio, so it might - * be modified concurrently (e.g. split). As such, - * folio_nr_pages() may read garbage. This is fine as the outer - * loop will revisit the split folio later. - */ - if (folio_test_large(folio)) - pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - if (!folio_try_get(folio)) continue; if (unlikely(page_folio(page) != folio)) goto put_folio; + if (folio_test_large(folio)) + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; + if (folio_contain_hwpoisoned_page(folio)) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); diff --git a/mm/mm_init.c b/mm/mm_init.c index a38a1909b407..84f14fa12d0d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -984,19 +984,19 @@ static void __init memmap_init(void) } } -#ifdef CONFIG_SPARSEMEM /* * Initialize the memory map for hole in the range [memory_end, - * section_end]. + * section_end] for SPARSEMEM and in the range [memory_end, memmap_end] + * for FLATMEM. * Append the pages in this hole to the highest zone in the last * node. - * The call to init_unavailable_range() is outside the ifdef to - * silence the compiler warining about zone_id set but not used; - * for FLATMEM it is a nop anyway */ +#ifdef CONFIG_SPARSEMEM end_pfn = round_up(end_pfn, PAGES_PER_SECTION); - if (hole_pfn < end_pfn) +#else + end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES); #endif + if (hole_pfn < end_pfn) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } diff --git a/mm/mremap.c b/mm/mremap.c index 0865387531ed..7db9da609c84 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1561,11 +1561,12 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) * adjacent to the expanded vma and otherwise * compatible. 
*/ - vma = vrm->vma = vma_merge_extend(&vmi, vma, vrm->delta); + vma = vma_merge_extend(&vmi, vma, vrm->delta); if (!vma) { vrm_uncharge(vrm); return -ENOMEM; } + vrm->vma = vma; vrm_stat_account(vrm, vrm->delta); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b173c2da641..fd6b865cb1ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1593,7 +1593,7 @@ static __always_inline void page_del_and_expand(struct zone *zone, static void check_new_page_bad(struct page *page) { - if (unlikely(page->flags & __PG_HWPOISON)) { + if (unlikely(PageHWPoison(page))) { /* Don't complain about hwpoisoned pages */ if (PageBuddy(page)) __ClearPageBuddy(page); @@ -4604,8 +4604,8 @@ retry: goto retry; /* Reclaim/compaction failed to prevent the fallback */ - if (defrag_mode) { - alloc_flags &= ALLOC_NOFRAGMENT; + if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) { + alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a051a29e95ad..b2fc5266e3d2 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e unsigned int skip_pages; if (PageHuge(page)) { - if (!hugepage_migration_supported(folio_hstate(folio))) + struct hstate *h; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) return page; } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { return page; diff --git a/mm/zswap.c b/mm/zswap.c index 0dcc54eab58b..204fb59da33c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -883,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; + + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; mutex_lock(&acomp_ctx->mutex); - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - acomp_ctx->req = NULL; - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; mutex_unlock(&acomp_ctx->mutex); + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. + */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); + return 0; } diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 999f78d380ae..1a05fc153353 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -319,7 +319,8 @@ TRACE_EVENT(foo_bar, __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl", + __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. 
This includes: @@ -370,7 +371,10 @@ TRACE_EVENT(foo_bar, __get_str(str), __get_str(lstr), __get_bitmask(cpus), __get_cpumask(cpum), - __get_str(vstr)) + __get_str(vstr), + __get_dynamic_array_len(cpus), + __get_dynamic_array_len(cpus), + __get_dynamic_array(cpus)) ); /* diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 7b4b3714b1af..deed676bfe38 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -857,7 +857,7 @@ static void *sort_mcount_loc(void *arg) for (void *ptr = vals; ptr < vals + size; ptr += long_size) { uint64_t key; - key = long_size == 4 ? r((uint32_t *)ptr) : r8((uint64_t *)ptr); + key = long_size == 4 ? *(uint32_t *)ptr : *(uint64_t *)ptr; if (!find_func(key)) { if (long_size == 4) *(uint32_t *)ptr = 0; diff --git a/security/Kconfig b/security/Kconfig index 536061cf33a9..4816fc74f81e 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -51,6 +51,27 @@ config PROC_MEM_NO_FORCE endchoice +config MSEAL_SYSTEM_MAPPINGS + bool "mseal system mappings" + depends on 64BIT + depends on ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS + depends on !CHECKPOINT_RESTORE + help + Apply mseal on system mappings. + The system mappings includes vdso, vvar, vvar_vclock, + vectors (arm compat-mode), sigpage (arm compat-mode), uprobes. + + A 64-bit kernel is required for the memory sealing feature. + No specific hardware features from the CPU are needed. + + WARNING: This feature breaks programs which rely on relocating + or unmapping system mappings. Known broken software at the time + of writing includes CHECKPOINT_RESTORE, UML, gVisor, rr. Therefore + this config can't be enabled universally. + + For complete descriptions of memory sealing, please see + Documentation/userspace-api/mseal.rst + config SECURITY bool "Enable different security models" depends on SYSFS diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index dc4333d23189..8787048c6762 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -586,36 +586,48 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s } } -#define READ_ONCE(x) \ -({ \ - union { typeof(x) __val; char __c[1]; } __u = \ - { .__c = { 0 } }; \ - __read_once_size(&(x), __u.__c, sizeof(x)); \ - __u.__val; \ -}) - -#define WRITE_ONCE(x, val) \ -({ \ - union { typeof(x) __val; char __c[1]; } __u = \ - { .__val = (val) }; \ - __write_once_size(&(x), __u.__c, sizeof(x)); \ - __u.__val; \ -}) - -#define READ_ONCE_ARENA(type, x) \ -({ \ - union { type __val; char __c[1]; } __u = \ - { .__c = { 0 } }; \ - __read_once_size((void *)&(x), __u.__c, sizeof(x)); \ - __u.__val; \ +/* + * __unqual_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged, + * + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case. + * + * This is copied verbatim from kernel's include/linux/compiler_types.h, but + * with default expression (for pointers) changed from (x) to (typeof(x)0). + * + * This is because LLVM has a bug where for lvalue (x), it does not get rid of + * an extra address_space qualifier, but does in case of rvalue (typeof(x)0). + * Hence, for pointers, we need to create an rvalue expression to get the + * desired type. See https://github.com/llvm/llvm-project/issues/53400. 
+ */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type : (unsigned type)0, signed type : (signed type)0 + +#define __unqual_typeof(x) \ + typeof(_Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (typeof(x))0)) + +#define READ_ONCE(x) \ +({ \ + union { __unqual_typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ }) -#define WRITE_ONCE_ARENA(type, x, val) \ -({ \ - union { type __val; char __c[1]; } __u = \ - { .__val = (val) }; \ - __write_once_size((void *)&(x), __u.__c, sizeof(x)); \ - __u.__val; \ +#define WRITE_ONCE(x, val) \ +({ \ + union { __unqual_typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ }) /* @@ -648,6 +660,23 @@ static inline u32 log2_u64(u64 v) return log2_u32(v) + 1; } +/* + * Return a value proportionally scaled to the task's weight. + */ +static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value) +{ + return (value * p->scx.weight) / 100; +} + +/* + * Return a value inversely proportional to the task's weight. + */ +static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64 value) +{ + return value * 100 / p->scx.weight; +} + + #include "compat.bpf.h" #include "enums.bpf.h" diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h index 6e6c45f14fe1..c2c33df9292c 100644 --- a/tools/sched_ext/include/scx/enum_defs.autogen.h +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -88,6 +88,8 @@ #define HAVE_SCX_OPS_ENQ_LAST #define HAVE_SCX_OPS_ENQ_EXITING #define HAVE_SCX_OPS_SWITCH_PARTIAL +#define HAVE_SCX_OPS_ENQ_MIGRATION_DISABLED +#define HAVE_SCX_OPS_ALLOW_QUEUED_WAKEUP #define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT #define HAVE_SCX_OPS_ALL_FLAGS #define HAVE_SCX_OPSS_NONE @@ -104,6 +106,7 @@ #define HAVE_SCX_RQ_BAL_PENDING #define HAVE_SCX_RQ_BAL_KEEP #define HAVE_SCX_RQ_BYPASSING +#define HAVE_SCX_RQ_CLK_VALID #define HAVE_SCX_RQ_IN_WAKEUP #define HAVE_SCX_RQ_IN_BALANCE #define HAVE_SCX_TASK_NONE diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h index 0e941a0d6f88..2f8002bcc19a 100644 --- a/tools/sched_ext/include/scx/enums.autogen.bpf.h +++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h @@ -13,6 +13,30 @@ const volatile u64 __SCX_SLICE_DFL __weak; const volatile u64 __SCX_SLICE_INF __weak; #define SCX_SLICE_INF __SCX_SLICE_INF +const volatile u64 __SCX_RQ_ONLINE __weak; +#define SCX_RQ_ONLINE __SCX_RQ_ONLINE + +const volatile u64 __SCX_RQ_CAN_STOP_TICK __weak; +#define SCX_RQ_CAN_STOP_TICK __SCX_RQ_CAN_STOP_TICK + +const volatile u64 __SCX_RQ_BAL_PENDING __weak; +#define SCX_RQ_BAL_PENDING __SCX_RQ_BAL_PENDING + +const volatile u64 __SCX_RQ_BAL_KEEP __weak; +#define SCX_RQ_BAL_KEEP __SCX_RQ_BAL_KEEP + +const volatile u64 __SCX_RQ_BYPASSING __weak; +#define SCX_RQ_BYPASSING __SCX_RQ_BYPASSING + +const volatile u64 __SCX_RQ_CLK_VALID __weak; +#define SCX_RQ_CLK_VALID __SCX_RQ_CLK_VALID + +const volatile u64 __SCX_RQ_IN_WAKEUP __weak; +#define SCX_RQ_IN_WAKEUP __SCX_RQ_IN_WAKEUP + +const volatile u64 __SCX_RQ_IN_BALANCE __weak; +#define SCX_RQ_IN_BALANCE __SCX_RQ_IN_BALANCE + const volatile u64 __SCX_DSQ_FLAG_BUILTIN 
__weak; #define SCX_DSQ_FLAG_BUILTIN __SCX_DSQ_FLAG_BUILTIN diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h index 88137a140e72..fedec938584b 100644 --- a/tools/sched_ext/include/scx/enums.autogen.h +++ b/tools/sched_ext/include/scx/enums.autogen.h @@ -8,6 +8,14 @@ SCX_ENUM_SET(skel, scx_public_consts, SCX_OPS_NAME_LEN); \ SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_DFL); \ SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_INF); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_ONLINE); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_CAN_STOP_TICK); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BAL_PENDING); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BAL_KEEP); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BYPASSING); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_CLK_VALID); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_IN_WAKEUP); \ + SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_IN_BALANCE); \ SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_BUILTIN); \ SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_LOCAL_ON); \ SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_INVALID); \ diff --git a/tools/sched_ext/include/scx/enums.h b/tools/sched_ext/include/scx/enums.h index 34cbebe974b7..8e7c91575f0b 100644 --- a/tools/sched_ext/include/scx/enums.h +++ b/tools/sched_ext/include/scx/enums.h @@ -14,7 +14,8 @@ static inline void __ENUM_set(u64 *val, char *type, char *name) bool res; res = __COMPAT_read_enum(type, name, val); - SCX_BUG_ON(!res, "enum not found(%s)", name); + if (!res) + *val = 0; } #define SCX_ENUM_SET(skel, type, name) do { \ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0a6572ab6f37..387f3df8b988 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -61,8 +61,11 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o +cxl_core-y += $(CXL_CORE_SRC)/ras.o +cxl_core-y += $(CXL_CORE_SRC)/acpi.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o +cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index cc8948f49117..1c3336095923 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -155,7 +155,7 @@ static struct { } cfmws7; struct { struct acpi_cedt_cfmws cfmws; - u32 target[4]; + u32 target[3]; } cfmws8; struct { struct acpi_cedt_cxims cxims; @@ -331,14 +331,14 @@ static struct { .length = sizeof(mock_cedt.cfmws8), }, .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, - .interleave_ways = 2, - .granularity = 0, + .interleave_ways = 8, + .granularity = 1, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, - .window_size = SZ_256M * 16UL, + .window_size = SZ_512M * 6UL, }, - .target = { 0, 1, 0, 1, }, + .target = { 0, 1, 2, }, }, .cxims0 = { .cxims = { @@ -1000,25 +1000,21 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) find_cxl_root(port); struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; - struct range pmem_range = { - .start = cxlds->pmem_res.start, - .end = cxlds->pmem_res.end, - }; - struct range 
ram_range = { - .start = cxlds->ram_res.start, - .end = cxlds->ram_res.end, - }; if (!cxl_root) return; - if (range_len(&ram_range)) - dpa_perf_setup(port, &ram_range, &mds->ram_perf); + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct resource *res = &cxlds->part[i].res; + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + struct range range = { + .start = res->start, + .end = res->end, + }; - if (range_len(&pmem_range)) - dpa_perf_setup(port, &pmem_range, &mds->pmem_perf); + dpa_perf_setup(port, &range, perf); + } cxl_memdev_update_perf(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 9495dbcc03a7..f2957a3e36fe 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -78,6 +78,10 @@ static struct cxl_cel_entry mock_cel[] = { .effect = CXL_CMD_EFFECT_NONE, }, { + .opcode = cpu_to_le16(CXL_MBOX_OP_SET_SHUTDOWN_STATE), + .effect = POLICY_CHANGE_IMMEDIATE, + }, + { .opcode = cpu_to_le16(CXL_MBOX_OP_GET_POISON), .effect = CXL_CMD_EFFECT_NONE, }, @@ -178,6 +182,7 @@ struct cxl_mockmem_data { u64 timestamp; unsigned long sanitize_timeout; struct vendor_test_feat test_feat; + u8 shutdown_state; }; static struct mock_event_log *event_find_log(struct device *dev, int log_type) @@ -1105,6 +1110,21 @@ static int mock_health_info(struct cxl_mbox_cmd *cmd) return 0; } +static int mock_set_shutdown_state(struct cxl_mockmem_data *mdata, + struct cxl_mbox_cmd *cmd) +{ + struct cxl_mbox_set_shutdown_state_in *ss = cmd->payload_in; + + if (cmd->size_in != sizeof(*ss)) + return -EINVAL; + + if (cmd->size_out != 0) + return -EINVAL; + + mdata->shutdown_state = ss->state; + return 0; +} + static struct mock_poison { struct cxl_dev_state *cxlds; u64 dpa; @@ -1583,6 +1603,9 @@ static int cxl_mock_mbox_send(struct cxl_mailbox *cxl_mbox, case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE: rc = mock_passphrase_secure_erase(mdata, cmd); break; + case CXL_MBOX_OP_SET_SHUTDOWN_STATE: + rc = mock_set_shutdown_state(mdata, cmd); + break; case CXL_MBOX_OP_GET_POISON: rc = mock_get_poison(cxlds, cmd); break; @@ -1670,6 +1693,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) struct cxl_dev_state *cxlds; struct cxl_mockmem_data *mdata; struct cxl_mailbox *cxl_mbox; + struct cxl_dpa_info range_info = { 0 }; int rc; mdata = devm_kzalloc(dev, sizeof(*mdata), GFP_KERNEL); @@ -1709,7 +1733,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf; INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work); - cxlds->serial = pdev->id; + cxlds->serial = pdev->id + 1; if (is_rcd(pdev)) cxlds->rcd = true; @@ -1730,7 +1754,11 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - rc = cxl_mem_create_range_info(mds); + rc = cxl_mem_dpa_fetch(mds, &range_info); + if (rc) + return rc; + + rc = cxl_dpa_setup(cxlds, &range_info); if (rc) return rc; diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2694344274bf..c77c8c8e3d9b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -62,6 +62,7 @@ TARGETS += mount TARGETS += mount_setattr TARGETS += move_mount_set_group TARGETS += mqueue +TARGETS += mseal_system_mappings TARGETS += nci TARGETS += net TARGETS += net/af_unix diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index 2c725773cd79..1f92e8caceac 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ 
b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -41,6 +41,31 @@ check_supported_x86_64() fi } +check_supported_ppc64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi + + local mmu_support=$(grep -m1 "mmu" /proc/cpuinfo | awk '{print $3}') + if [[ "$mmu_support" != "radix" ]]; then + echo "$0: System does not use Radix MMU, required for 5-level paging" + exit $ksft_skip + fi + + local hugepages_total=$(awk '/HugePages_Total/ {print $2}' /proc/meminfo) + if [[ "${hugepages_total}" -eq 0 ]]; then + echo "$0: HugePages are not enabled, required for some tests" + exit $ksft_skip + fi +} + check_test_requirements() { # The test supports x86_64 and powerpc64. We currently have no useful @@ -50,6 +75,9 @@ check_test_requirements() "x86_64") check_supported_x86_64 ;; + "ppc64le"|"ppc64") + check_supported_ppc64 + ;; *) return 0 ;; diff --git a/tools/testing/selftests/mseal_system_mappings/.gitignore b/tools/testing/selftests/mseal_system_mappings/.gitignore new file mode 100644 index 000000000000..319c497a595e --- /dev/null +++ b/tools/testing/selftests/mseal_system_mappings/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +sysmap_is_sealed diff --git a/tools/testing/selftests/mseal_system_mappings/Makefile b/tools/testing/selftests/mseal_system_mappings/Makefile new file mode 100644 index 000000000000..2b4504e2f52f --- /dev/null +++ b/tools/testing/selftests/mseal_system_mappings/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES) + +TEST_GEN_PROGS := sysmap_is_sealed + +include ../lib.mk diff --git a/tools/testing/selftests/mseal_system_mappings/config b/tools/testing/selftests/mseal_system_mappings/config new file mode 100644 index 000000000000..675cb9f37b86 --- /dev/null +++ b/tools/testing/selftests/mseal_system_mappings/config @@ -0,0 +1 @@ +CONFIG_MSEAL_SYSTEM_MAPPINGS=y diff --git a/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c new file mode 100644 index 000000000000..0d2af30c3bf5 --- /dev/null +++ b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * test system mappings are sealed when + * KCONFIG_MSEAL_SYSTEM_MAPPINGS=y + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdbool.h> + +#include "../kselftest.h" +#include "../kselftest_harness.h" + +#define VMFLAGS "VmFlags:" +#define MSEAL_FLAGS "sl" +#define MAX_LINE_LEN 512 + +bool has_mapping(char *name, FILE *maps) +{ + char line[MAX_LINE_LEN]; + + while (fgets(line, sizeof(line), maps)) { + if (strstr(line, name)) + return true; + } + + return false; +} + +bool mapping_is_sealed(char *name, FILE *maps) +{ + char line[MAX_LINE_LEN]; + + while (fgets(line, sizeof(line), maps)) { + if (!strncmp(line, VMFLAGS, strlen(VMFLAGS))) { + if (strstr(line, MSEAL_FLAGS)) + return true; + + return false; + } + } + + return false; +} + +FIXTURE(basic) { + FILE *maps; +}; + +FIXTURE_SETUP(basic) +{ + self->maps = fopen("/proc/self/smaps", "r"); + if (!self->maps) + 
SKIP(return, "Could not open /proc/self/smap, errno=%d", + errno); +}; + +FIXTURE_TEARDOWN(basic) +{ + if (self->maps) + fclose(self->maps); +}; + +FIXTURE_VARIANT(basic) +{ + char *name; + bool sealed; +}; + +FIXTURE_VARIANT_ADD(basic, vdso) { + .name = "[vdso]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, vvar) { + .name = "[vvar]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, vvar_vclock) { + .name = "[vvar_vclock]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, sigpage) { + .name = "[sigpage]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, vectors) { + .name = "[vectors]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, uprobes) { + .name = "[uprobes]", + .sealed = true, +}; + +FIXTURE_VARIANT_ADD(basic, stack) { + .name = "[stack]", + .sealed = false, +}; + +TEST_F(basic, check_sealed) +{ + if (!has_mapping(variant->name, self->maps)) { + SKIP(return, "could not find the mapping, %s", + variant->name); + } + + EXPECT_EQ(variant->sealed, + mapping_is_sealed(variant->name, self->maps)); +}; + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c index d53959e03593..94bee6e0c813 100644 --- a/tools/testing/selftests/x86/test_mremap_vdso.c +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -14,6 +14,7 @@ #include <errno.h> #include <unistd.h> #include <string.h> +#include <stdbool.h> #include <sys/mman.h> #include <sys/auxv.h> @@ -55,13 +56,55 @@ static int try_to_remap(void *vdso_addr, unsigned long size) } +#define VDSO_NAME "[vdso]" +#define VMFLAGS "VmFlags:" +#define MSEAL_FLAGS "sl" +#define MAX_LINE_LEN 512 + +bool vdso_sealed(FILE *maps) +{ + char line[MAX_LINE_LEN]; + bool has_vdso = false; + + while (fgets(line, sizeof(line), maps)) { + if (strstr(line, VDSO_NAME)) + has_vdso = true; + + if (has_vdso && !strncmp(line, VMFLAGS, strlen(VMFLAGS))) { + if (strstr(line, MSEAL_FLAGS)) + return true; + + return false; + } + } + + return false; +} + int main(int argc, char **argv, char **envp) { pid_t child; + FILE *maps; ksft_print_header(); ksft_set_plan(1); + maps = fopen("/proc/self/smaps", "r"); + if (!maps) { + ksft_test_result_skip( + "Could not open /proc/self/smaps, errno=%d\n", + errno); + + return 0; + } + + if (vdso_sealed(maps)) { + ksft_test_result_skip("vdso is sealed\n"); + return 0; + } + + fclose(maps); + child = fork(); if (child == -1) ksft_exit_fail_msg("failed to fork (%d): %m\n", errno); |
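Both the new mseal_system_mappings selftest and the test_mremap_vdso.c change above detect sealing the same way: scan /proc/self/smaps for the target mapping, then look for the "sl" flag on its "VmFlags:" line. A minimal standalone sketch of that check, assuming only libc; mapping_sealed() and its simplifications are illustrative, not part of the patch:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Return true if the first mapping whose smaps header contains @name
     * carries the "sl" (sealed) bit on its VmFlags line.
     */
    static bool mapping_sealed(const char *name)
    {
            FILE *maps = fopen("/proc/self/smaps", "r");
            char line[512];
            bool found = false;

            if (!maps)
                    return false;

            while (fgets(line, sizeof(line), maps)) {
                    if (!found && strstr(line, name))
                            found = true;   /* header line of the target VMA */
                    else if (found && !strncmp(line, "VmFlags:", 8)) {
                            bool sealed = strstr(line, " sl") != NULL;

                            fclose(maps);
                            return sealed;
                    }
            }

            fclose(maps);
            return false;
    }

    int main(void)
    {
            printf("[vdso] sealed: %d\n", mapping_sealed("[vdso]"));
            return 0;
    }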
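The zswap_cpu_comp_dead() rework above is an instance of a common teardown pattern: detach the per-CPU resources while holding the mutex, so concurrent users observe a consistent NULL state, then free them only after unlocking, because the destructors may themselves take locks and create an ordering cycle. A user-space sketch of the same shape, assuming nothing beyond libc and pthreads; struct and function names are illustrative:

    #include <pthread.h>
    #include <stdlib.h>

    struct ctx {
            pthread_mutex_t lock;
            void *req;
            void *buffer;
    };

    static void ctx_teardown(struct ctx *c)
    {
            void *req, *buffer;

            pthread_mutex_lock(&c->lock);
            /*
             * Detach under the lock so other threads see either both
             * pointers valid or both NULL.
             */
            req = c->req;
            buffer = c->buffer;
            c->req = NULL;
            c->buffer = NULL;
            pthread_mutex_unlock(&c->lock);

            /*
             * Free outside the critical section; in the kernel case the
             * destructors (acomp_request_free(), crypto_free_acomp()) may
             * take other locks of their own.
             */
            free(req);
            free(buffer);
    }

    int main(void)
    {
            struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER };

            c.req = malloc(64);
            c.buffer = malloc(4096);
            ctx_teardown(&c);
            return 0;
    }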
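The __unqual_typeof() addition to common.bpf.h is easier to see in reduced form: _Generic maps a possibly volatile or address-space-qualified scalar lvalue to an rvalue of the matching plain type, so the temporary union inside READ_ONCE()/WRITE_ONCE() is declared without the qualifier. The sketch below trims the case list and replaces __read_once_size() with a direct volatile load; it is GNU C (typeof, statement expressions) and the _SKETCH names are illustrative:

    #include <stdio.h>

    #define __scalar_cases(type) \
            unsigned type: (unsigned type)0, signed type: (signed type)0

    #define UNQUAL_TYPEOF_SKETCH(x)                 \
            typeof(_Generic((x),                    \
                    char: (char)0,                  \
                    __scalar_cases(char),           \
                    __scalar_cases(int),            \
                    __scalar_cases(long long),      \
                    default: (typeof(x))0))

    #define READ_ONCE_SKETCH(x)                                             \
    ({                                                                      \
            union { UNQUAL_TYPEOF_SKETCH(x) v; } u;                         \
            u.v = *(const volatile UNQUAL_TYPEOF_SKETCH(x) *)&(x);          \
            u.v;                                                            \
    })

    int main(void)
    {
            volatile long long stamp = 42;

            /*
             * _Generic takes the controlling expression as an rvalue, so
             * the 'signed long long' case is selected and u.v is a plain
             * long long rather than a volatile one.
             */
            printf("%lld\n", READ_ONCE_SKETCH(stamp));
            return 0;
    }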
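The page_alloc.c defrag_mode hunk above fixes a classic bitmask slip: 'alloc_flags &= ALLOC_NOFRAGMENT' keeps only that bit and clears everything else, whereas the intent was to drop it, which requires '&= ~ALLOC_NOFRAGMENT'. A tiny illustration with made-up flag values:

    #include <assert.h>

    #define FLAG_A  0x1u
    #define FLAG_B  0x2u

    int main(void)
    {
            unsigned int flags = FLAG_A | FLAG_B;

            flags &= ~FLAG_A;       /* clears FLAG_A, preserves FLAG_B */
            assert(flags == FLAG_B);

            flags = FLAG_A | FLAG_B;
            flags &= FLAG_A;        /* the buggy form: wipes FLAG_B too */
            assert(flags == FLAG_A);
            return 0;
    }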