summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap2
-rw-r--r--Documentation/ABI/stable/sysfs-devices-node6
-rw-r--r--Documentation/ABI/testing/sysfs-bus-cxl53
-rw-r--r--Documentation/admin-guide/device-mapper/dm-crypt.rst5
-rw-r--r--Documentation/admin-guide/device-mapper/dm-integrity.rst5
-rw-r--r--Documentation/admin-guide/device-mapper/verity.rst20
-rw-r--r--Documentation/driver-api/cxl/maturity-map.rst2
-rw-r--r--Documentation/features/core/mseal_sys_mappings/arch-support.txt30
-rw-r--r--Documentation/userspace-api/mseal.rst21
-rw-r--r--MAINTAINERS39
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/kernel/vdso.c9
-rw-r--r--arch/csky/include/asm/pgalloc.h7
-rw-r--r--arch/hexagon/include/asm/pgalloc.h7
-rw-r--r--arch/loongarch/include/asm/pgalloc.h7
-rw-r--r--arch/m68k/include/asm/sun3_pgalloc.h7
-rw-r--r--arch/microblaze/mm/init.c2
-rw-r--r--arch/mips/include/asm/pgalloc.h7
-rw-r--r--arch/nios2/include/asm/pgalloc.h7
-rw-r--r--arch/openrisc/include/asm/pgalloc.h7
-rw-r--r--arch/riscv/include/asm/pgalloc.h26
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/kernel/vdso.c2
-rw-r--r--arch/sh/include/asm/pgalloc.h7
-rw-r--r--arch/um/include/asm/pgalloc.h21
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/entry/vdso/vma.c5
-rw-r--r--arch/x86/mm/pat/set_memory.c1
-rw-r--r--arch/x86/mm/pgtable.c8
-rw-r--r--drivers/acpi/apei/ghes.c103
-rw-r--r--drivers/acpi/nfit/core.c2
-rw-r--r--drivers/acpi/numa/hmat.c44
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/cxl/Kconfig4
-rw-r--r--drivers/cxl/core/Makefile3
-rw-r--r--drivers/cxl/core/acpi.c11
-rw-r--r--drivers/cxl/core/cdat.c102
-rw-r--r--drivers/cxl/core/core.h10
-rw-r--r--drivers/cxl/core/hdm.c382
-rw-r--r--drivers/cxl/core/mbox.c141
-rw-r--r--drivers/cxl/core/mce.c65
-rw-r--r--drivers/cxl/core/mce.h20
-rw-r--r--drivers/cxl/core/memdev.c83
-rw-r--r--drivers/cxl/core/pci.c97
-rw-r--r--drivers/cxl/core/port.c38
-rw-r--r--drivers/cxl/core/ras.c119
-rw-r--r--drivers/cxl/core/region.c336
-rw-r--r--drivers/cxl/core/trace.h81
-rw-r--r--drivers/cxl/cxl.h52
-rw-r--r--drivers/cxl/cxlmem.h77
-rw-r--r--drivers/cxl/cxlpci.h6
-rw-r--r--drivers/cxl/mem.c2
-rw-r--r--drivers/cxl/pci.c7
-rw-r--r--drivers/cxl/pmem.c81
-rw-r--r--drivers/firewire/core-cdev.c42
-rw-r--r--drivers/firmware/efi/cper.c6
-rw-r--r--drivers/firmware/efi/cper_cxl.c39
-rw-r--r--drivers/firmware/efi/cper_cxl.h66
-rw-r--r--drivers/md/Kconfig1
-rw-r--r--drivers/md/dm-bufio.c4
-rw-r--r--drivers/md/dm-cache-target.c96
-rw-r--r--drivers/md/dm-crypt.c41
-rw-r--r--drivers/md/dm-delay.c18
-rw-r--r--drivers/md/dm-ebs-target.c7
-rw-r--r--drivers/md/dm-integrity.c48
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm-table.c4
-rw-r--r--drivers/md/dm-vdo/block-map.c13
-rw-r--r--drivers/md/dm-vdo/constants.h3
-rw-r--r--drivers/md/dm-vdo/dedupe.c20
-rw-r--r--drivers/md/dm-vdo/encodings.c20
-rw-r--r--drivers/md/dm-vdo/indexer/index-layout.c5
-rw-r--r--drivers/md/dm-vdo/indexer/index-session.c6
-rw-r--r--drivers/md/dm-vdo/indexer/indexer.h53
-rw-r--r--drivers/md/dm-vdo/io-submitter.c6
-rw-r--r--drivers/md/dm-vdo/io-submitter.h18
-rw-r--r--drivers/md/dm-vdo/packer.h2
-rw-r--r--drivers/md/dm-vdo/priority-table.c2
-rw-r--r--drivers/md/dm-vdo/recovery-journal.h6
-rw-r--r--drivers/md/dm-vdo/slab-depot.c193
-rw-r--r--drivers/md/dm-vdo/slab-depot.h13
-rw-r--r--drivers/md/dm-vdo/types.h3
-rw-r--r--drivers/md/dm-vdo/vdo.c11
-rw-r--r--drivers/md/dm-vdo/vio.c54
-rw-r--r--drivers/md/dm-vdo/vio.h13
-rw-r--r--drivers/md/dm-vdo/wait-queue.c2
-rw-r--r--drivers/md/dm-verity-target.c62
-rw-r--r--drivers/md/dm.c8
-rw-r--r--drivers/nvdimm/claim.c11
-rw-r--r--drivers/nvdimm/label.c3
-rw-r--r--drivers/nvdimm/nd-core.h4
-rw-r--r--drivers/nvdimm/region_devs.c41
-rw-r--r--fs/userfaultfd.c51
-rw-r--r--include/asm-generic/tlb.h14
-rw-r--r--include/cxl/event.h101
-rw-r--r--include/linux/acpi.h11
-rw-r--r--include/linux/cper.h8
-rw-r--r--include/linux/mm.h10
-rw-r--r--include/linux/node.h7
-rw-r--r--include/linux/page-flags.h37
-rw-r--r--include/linux/page_ref.h2
-rw-r--r--include/linux/sort.h11
-rw-r--r--init/Kconfig22
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/sched/ext.c8
-rw-r--r--kernel/sched/ext_idle.c12
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--kernel/trace/ftrace.c1
-rw-r--r--kernel/trace/rv/rv.c3
-rw-r--r--kernel/trace/trace.c1
-rw-r--r--kernel/trace/trace_events.c7
-rw-r--r--lib/sg_split.c2
-rw-r--r--lib/sort.c110
-rw-r--r--lib/vdso/datastore.c3
-rw-r--r--mm/damon/core.c9
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/kasan/kasan_test_c.c5
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memory_hotplug.c12
-rw-r--r--mm/mm_init.c12
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/page_alloc.c6
-rw-r--r--mm/page_isolation.c9
-rw-r--r--mm/zswap.c30
-rw-r--r--samples/trace_events/trace-events-sample.h8
-rw-r--r--scripts/sorttable.c2
-rw-r--r--security/Kconfig21
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h85
-rw-r--r--tools/sched_ext/include/scx/enum_defs.autogen.h3
-rw-r--r--tools/sched_ext/include/scx/enums.autogen.bpf.h24
-rw-r--r--tools/sched_ext/include/scx/enums.autogen.h8
-rw-r--r--tools/sched_ext/include/scx/enums.h3
-rw-r--r--tools/testing/cxl/Kbuild3
-rw-r--r--tools/testing/cxl/test/cxl.c32
-rw-r--r--tools/testing/cxl/test/mem.c32
-rw-r--r--tools/testing/selftests/Makefile1
-rwxr-xr-xtools/testing/selftests/mm/va_high_addr_switch.sh28
-rw-r--r--tools/testing/selftests/mseal_system_mappings/.gitignore2
-rw-r--r--tools/testing/selftests/mseal_system_mappings/Makefile6
-rw-r--r--tools/testing/selftests/mseal_system_mappings/config1
-rw-r--r--tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c119
-rw-r--r--tools/testing/selftests/x86/test_mremap_vdso.c43
142 files changed, 2912 insertions, 1157 deletions
diff --git a/.mailmap b/.mailmap
index c85576e4695f..4f7cd8e23177 100644
--- a/.mailmap
+++ b/.mailmap
@@ -549,6 +549,8 @@ Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org>
Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org>
Nicolas Saenz Julienne <nsaenz@kernel.org> <nsaenzjulienne@suse.de>
Nicolas Saenz Julienne <nsaenz@kernel.org> <nsaenzjulienne@suse.com>
+Nicolas Schier <nicolas.schier@linux.dev> <n.schier@avm.de>
+Nicolas Schier <nicolas.schier@linux.dev> <nicolas@fjasle.eu>
Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Nikolay Aleksandrov <razor@blackwall.org> <naleksan@redhat.com>
Nikolay Aleksandrov <razor@blackwall.org> <nikolay@redhat.com>
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 402af4b2b905..a02707cb7cbc 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -177,6 +177,12 @@ Description:
The cache write policy: 0 for write-back, 1 for write-through,
other or unknown.
+What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/address_mode
+Date: March 2025
+Contact: Dave Jiang <dave.jiang@intel.com>
+Description:
+ The address mode: 0 for reserved, 1 for extended-linear.
+
What: /sys/devices/system/node/nodeX/x86/sgx_total_bytes
Date: November 2021
Contact: Jarkko Sakkinen <jarkko@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 3f5627a1210a..99bb3faf7a0e 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -1,5 +1,5 @@
What: /sys/bus/cxl/flush
-Date: Januarry, 2022
+Date: January, 2022
KernelVersion: v5.18
Contact: linux-cxl@vger.kernel.org
Description:
@@ -18,6 +18,24 @@ Description:
specification.
+What: /sys/bus/cxl/devices/memX/payload_max
+Date: December, 2020
+KernelVersion: v5.12
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) Maximum size (in bytes) of the mailbox command payload
+ registers. Linux caps this at 1MB if the device reports a
+ larger size.
+
+
+What: /sys/bus/cxl/devices/memX/label_storage_size
+Date: May, 2021
+KernelVersion: v5.13
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) Size (in bytes) of the Label Storage Area (LSA).
+
+
What: /sys/bus/cxl/devices/memX/ram/size
Date: December, 2020
KernelVersion: v5.12
@@ -33,7 +51,7 @@ Date: May, 2023
KernelVersion: v6.8
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry"
+ (RO) For CXL host platforms that support "QoS Telemetry"
this attribute conveys a comma delimited list of platform
specific cookies that identifies a QoS performance class
for the volatile partition of the CXL mem device. These
@@ -60,7 +78,7 @@ Date: May, 2023
KernelVersion: v6.8
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry"
+ (RO) For CXL host platforms that support "QoS Telemetry"
this attribute conveys a comma delimited list of platform
specific cookies that identifies a QoS performance class
for the persistent partition of the CXL mem device. These
@@ -321,14 +339,13 @@ KernelVersion: v6.0
Contact: linux-cxl@vger.kernel.org
Description:
(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
- translates from a host physical address range, to a device local
- address range. Device-local address ranges are further split
- into a 'ram' (volatile memory) range and 'pmem' (persistent
- memory) range. The 'mode' attribute emits one of 'ram', 'pmem',
- 'mixed', or 'none'. The 'mixed' indication is for error cases
- when a decoder straddles the volatile/persistent partition
- boundary, and 'none' indicates the decoder is not actively
- decoding, or no DPA allocation policy has been set.
+ translates from a host physical address range, to a device
+ local address range. Device-local address ranges are further
+ split into a 'ram' (volatile memory) range and 'pmem'
+ (persistent memory) range. The 'mode' attribute emits one of
+ 'ram', 'pmem', or 'none'. The 'none' indicates the decoder is
+ not actively decoding, or no DPA allocation policy has been
+ set.
'mode' can be written, when the decoder is in the 'disabled'
state, with either 'ram' or 'pmem' to set the boundaries for the
@@ -423,7 +440,7 @@ Date: May, 2023
KernelVersion: v6.5
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry" this
+ (RO) For CXL host platforms that support "QoS Telemetry" this
root-decoder-only attribute conveys a platform specific cookie
that identifies a QoS performance class for the CXL Window.
This class-id can be compared against a similar "qos_class"
@@ -586,3 +603,15 @@ Description:
See Documentation/ABI/stable/sysfs-devices-node. access0 provides
the number to the closest initiator and access1 provides the
number to the closest CPU.
+
+
+What: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown
+Date: Feb, 2025
+KernelVersion: v6.15
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The device dirty shutdown count value, which is the number
+ of times the device could have incurred in potential data loss.
+ The count is persistent across power loss and wraps back to 0
+ upon overflow. If this file is not present, the device does not
+ have the necessary support for dirty tracking.
diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst
index 9f8139ff97d6..4467f6d4b632 100644
--- a/Documentation/admin-guide/device-mapper/dm-crypt.rst
+++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst
@@ -146,6 +146,11 @@ integrity:<bytes>:<type>
integrity for the encrypted device. The additional space is then
used for storing authentication tag (and persistent IV if needed).
+integrity_key_size:<bytes>
+ Optionally set the integrity key size if it differs from the digest size.
+ It allows the use of wrapped key algorithms where the key size is
+ independent of the cryptographic key size.
+
sector_size:<bytes>
Use <bytes> as the encryption unit instead of 512 bytes sectors.
This option can be in range 512 - 4096 bytes and must be power of two.
diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst
index d8a5f14d0e3c..c2e18ecc065c 100644
--- a/Documentation/admin-guide/device-mapper/dm-integrity.rst
+++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst
@@ -92,6 +92,11 @@ Target arguments:
allowed. This mode is useful for data recovery if the
device cannot be activated in any of the other standard
modes.
+ I - inline mode - in this mode, dm-integrity will store integrity
+ data directly in the underlying device sectors.
+ The underlying device must have an integrity profile that
+ allows storing user integrity data and provides enough
+ space for the selected integrity tag.
5. the number of additional arguments
diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
index a65c1602cb23..8c3f1f967a3c 100644
--- a/Documentation/admin-guide/device-mapper/verity.rst
+++ b/Documentation/admin-guide/device-mapper/verity.rst
@@ -87,6 +87,15 @@ panic_on_corruption
Panic the device when a corrupted block is discovered. This option is
not compatible with ignore_corruption and restart_on_corruption.
+restart_on_error
+ Restart the system when an I/O error is detected.
+ This option can be combined with the restart_on_corruption option.
+
+panic_on_error
+ Panic the device when an I/O error is detected. This option is
+ not compatible with the restart_on_error option but can be combined
+ with the panic_on_corruption option.
+
ignore_zero_blocks
Do not verify blocks that are expected to contain zeroes and always return
zeroes instead. This may be useful if the partition contains unused blocks
@@ -142,8 +151,15 @@ root_hash_sig_key_desc <key_description>
already in the secondary trusted keyring.
try_verify_in_tasklet
- If verity hashes are in cache, verify data blocks in kernel tasklet instead
- of workqueue. This option can reduce IO latency.
+ If verity hashes are in cache and the IO size does not exceed the limit,
+ verify data blocks in bottom half instead of workqueue. This option can
+ reduce IO latency. The size limits can be configured via
+ /sys/module/dm_verity/parameters/use_bh_bytes. The four parameters
+ correspond to limits for IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT,
+ IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE in turn.
+ For example:
+ <none>,<rt>,<be>,<idle>
+ 4096,4096,4096,4096
Theory of operation
===================
diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst
index df8e2ac2a320..a2288f9df658 100644
--- a/Documentation/driver-api/cxl/maturity-map.rst
+++ b/Documentation/driver-api/cxl/maturity-map.rst
@@ -130,7 +130,7 @@ Mailbox commands
* [0] Switch CCI
* [3] Timestamp
* [1] PMEM labels
-* [0] PMEM GPF / Dirty Shutdown
+* [3] PMEM GPF / Dirty Shutdown
* [0] Scan Media
PMU
diff --git a/Documentation/features/core/mseal_sys_mappings/arch-support.txt b/Documentation/features/core/mseal_sys_mappings/arch-support.txt
new file mode 100644
index 000000000000..c6cab9760d57
--- /dev/null
+++ b/Documentation/features/core/mseal_sys_mappings/arch-support.txt
@@ -0,0 +1,30 @@
+#
+# Feature name: mseal-system-mappings
+# Kconfig: ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
+# description: arch supports mseal system mappings
+#
+ -----------------------
+ | arch |status|
+ -----------------------
+ | alpha: | TODO |
+ | arc: | N/A |
+ | arm: | N/A |
+ | arm64: | ok |
+ | csky: | N/A |
+ | hexagon: | N/A |
+ | loongarch: | TODO |
+ | m68k: | N/A |
+ | microblaze: | N/A |
+ | mips: | TODO |
+ | nios2: | N/A |
+ | openrisc: | N/A |
+ | parisc: | TODO |
+ | powerpc: | TODO |
+ | riscv: | TODO |
+ | s390: | ok |
+ | sh: | N/A |
+ | sparc: | TODO |
+ | um: | TODO |
+ | x86: | ok |
+ | xtensa: | N/A |
+ -----------------------
diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst
index 41102f74c5e2..1dabfc29be0d 100644
--- a/Documentation/userspace-api/mseal.rst
+++ b/Documentation/userspace-api/mseal.rst
@@ -130,6 +130,27 @@ Use cases
- Chrome browser: protect some security sensitive data structures.
+- System mappings:
+ The system mappings are created by the kernel and includes vdso, vvar,
+ vvar_vclock, vectors (arm compat-mode), sigpage (arm compat-mode), uprobes.
+
+ Those system mappings are readonly only or execute only, memory sealing can
+ protect them from ever changing to writable or unmmap/remapped as different
+ attributes. This is useful to mitigate memory corruption issues where a
+ corrupted pointer is passed to a memory management system.
+
+ If supported by an architecture (CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS),
+ the CONFIG_MSEAL_SYSTEM_MAPPINGS seals all system mappings of this
+ architecture.
+
+ The following architectures currently support this feature: x86-64, arm64,
+ and s390.
+
+ WARNING: This feature breaks programs which rely on relocating
+ or unmapping system mappings. Known broken software at the time
+ of writing includes CHECKPOINT_RESTORE, UML, gVisor, rr. Therefore
+ this config can't be enabled universally.
+
When not to use mseal
=====================
Applications can apply sealing to any virtual memory region from userspace,
diff --git a/MAINTAINERS b/MAINTAINERS
index d32ce85c5c66..a7a1d121a83e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15487,6 +15487,45 @@ F: tools/mm/
F: tools/testing/selftests/mm/
N: include/linux/page[-_]*
+MEMORY MANAGEMENT - EXECMEM
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Mike Rapoport <rppt@kernel.org>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/execmem.h
+F: mm/execmem.c
+
+MEMORY MANAGEMENT - NUMA MEMBLOCKS AND NUMA EMULATION
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Mike Rapoport <rppt@kernel.org>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/numa_memblks.h
+F: mm/numa.c
+F: mm/numa_emulation.c
+F: mm/numa_memblks.c
+
+MEMORY MANAGEMENT - SECRETMEM
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Mike Rapoport <rppt@kernel.org>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/secretmem.h
+F: mm/secretmem.c
+
+MEMORY MANAGEMENT - USERFAULTFD
+M: Andrew Morton <akpm@linux-foundation.org>
+R: Peter Xu <peterx@redhat.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: Documentation/admin-guide/mm/userfaultfd.rst
+F: fs/userfaultfd.c
+F: include/asm-generic/pgtable_uffd.h
+F: include/linux/userfaultfd_k.h
+F: include/uapi/linux/userfaultfd.h
+F: mm/userfaultfd.c
+F: tools/testing/selftests/mm/uffd-*.[ch]
+
MEMORY MAPPING
M: Andrew Morton <akpm@linux-foundation.org>
M: Liam R. Howlett <Liam.Howlett@oracle.com>
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 748c34dc953c..a182295e6f08 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -38,6 +38,7 @@ config ARM64
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 887ac0b05961..78ddf6bdecad 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -130,7 +130,8 @@ static int __setup_additional_pages(enum vdso_abi abi,
mm->context.vdso = (void *)vdso_base;
ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
VM_READ|VM_EXEC|gp_flags|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_SEALED_SYSMAP,
vdso_info[abi].cm);
if (IS_ERR(ret))
goto up_fail;
@@ -256,7 +257,8 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm)
*/
ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE,
VM_READ | VM_EXEC |
- VM_MAYREAD | VM_MAYEXEC,
+ VM_MAYREAD | VM_MAYEXEC |
+ VM_SEALED_SYSMAP,
&aarch32_vdso_maps[AA32_MAP_VECTORS]);
return PTR_ERR_OR_ZERO(ret);
@@ -279,7 +281,8 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm)
*/
ret = _install_special_mapping(mm, addr, PAGE_SIZE,
VM_READ | VM_EXEC | VM_MAYREAD |
- VM_MAYWRITE | VM_MAYEXEC,
+ VM_MAYWRITE | VM_MAYEXEC |
+ VM_SEALED_SYSMAP,
&aarch32_vdso_maps[AA32_MAP_SIGPAGE]);
if (IS_ERR(ret))
goto out;
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index bf8400c28b5a..11055c574968 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -61,11 +61,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return ret;
}
-#define __pte_free_tlb(tlb, pte, address) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, address) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
extern void pagetable_init(void);
extern void mmu_init(unsigned long min_pfn, unsigned long max_pfn);
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index 1ee5f5f157ca..937a11ef4c33 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -87,10 +87,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
max_kernel_seg = pmdindex;
}
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pagetable_dtor((page_ptdesc(pte))); \
- tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, addr) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#endif
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index 7211dff8c969..b58f587f0f0a 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -55,11 +55,8 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
return pte;
}
-#define __pte_free_tlb(tlb, pte, address) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, address) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#ifndef __PAGETABLE_PMD_FOLDED
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 80afc3a18724..1e21c758b774 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -17,11 +17,8 @@
extern const char bad_pmd_string[];
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, addr) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
{
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 65f0d1fb8a2a..31d475cdb1c5 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -118,7 +118,7 @@ int page_is_ram(unsigned long pfn)
/*
* Check for command-line options that affect what MMU_init will do.
*/
-static void mm_cmdline_setup(void)
+static void __init mm_cmdline_setup(void)
{
unsigned long maxmem = 0;
char *p = cmd_line;
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index 26c7a6ede983..bbca420c96d3 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -48,11 +48,8 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
extern void pgd_init(void *addr);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-#define __pte_free_tlb(tlb, pte, address) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, address) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#ifndef __PAGETABLE_PMD_FOLDED
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 12a536b7bfbd..db122b093a8b 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -28,10 +28,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-#define __pte_free_tlb(tlb, pte, addr) \
- do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
- } while (0)
+#define __pte_free_tlb(tlb, pte, addr) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#endif /* _ASM_NIOS2_PGALLOC_H */
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 3372f4e6ab4b..3f110931d8f6 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -64,10 +64,7 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, addr) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#endif
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 3e2aebea6312..770ce18a7328 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -15,24 +15,6 @@
#define __HAVE_ARCH_PUD_FREE
#include <asm-generic/pgalloc.h>
-/*
- * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
- * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
- * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
- * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
- * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
- * for more details.
- */
-static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
-{
- if (riscv_use_sbi_for_rfence()) {
- tlb_remove_ptdesc(tlb, pt);
- } else {
- pagetable_dtor(pt);
- tlb_remove_page_ptdesc(tlb, pt);
- }
-}
-
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
{
@@ -108,14 +90,14 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr)
{
if (pgtable_l4_enabled)
- riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr)
{
if (pgtable_l5_enabled)
- riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -143,7 +125,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
- riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -151,7 +133,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
unsigned long addr)
{
- riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte));
+ tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}
#endif /* CONFIG_MMU */
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c809c486d136..b8fa367c1fc9 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -137,6 +137,7 @@ config S390
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_HUGETLBFS
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG
+ select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 70c8f9ad13cd..430feb1a5013 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -80,7 +80,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE;
/* VM_MAYWRITE for COW so gdb can set breakpoints */
vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
- VM_READ|VM_EXEC|
+ VM_READ|VM_EXEC|VM_SEALED_SYSMAP|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
vdso_mapping);
if (IS_ERR(vma)) {
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index 96d938fdf224..6fe7123d38fa 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -32,10 +32,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, addr) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#endif /* __ASM_SH_PGALLOC_H */
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index f0af23c3aeb2..826ec44b58cd 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -25,27 +25,18 @@
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
-#define __pte_free_tlb(tlb, pte, address) \
-do { \
- pagetable_dtor(page_ptdesc(pte)); \
- tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
-} while (0)
+#define __pte_free_tlb(tlb, pte, address) \
+ tlb_remove_ptdesc((tlb), page_ptdesc(pte))
#if CONFIG_PGTABLE_LEVELS > 2
-#define __pmd_free_tlb(tlb, pmd, address) \
-do { \
- pagetable_dtor(virt_to_ptdesc(pmd)); \
- tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \
-} while (0)
+#define __pmd_free_tlb(tlb, pmd, address) \
+ tlb_remove_ptdesc((tlb), virt_to_ptdesc(pmd))
#if CONFIG_PGTABLE_LEVELS > 3
-#define __pud_free_tlb(tlb, pud, address) \
-do { \
- pagetable_dtor(virt_to_ptdesc(pud)); \
- tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \
-} while (0)
+#define __pud_free_tlb(tlb, pud, address) \
+ tlb_remove_ptdesc((tlb), virt_to_ptdesc(pud))
#endif
#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2caf4c0f789e..85ba2e187571 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_PTDUMP
+ select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 9518bf1ddf35..adb299d3b6a1 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -162,7 +162,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
text_start,
image->size,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_SEALED_SYSMAP,
&vdso_mapping);
if (IS_ERR(vma)) {
@@ -181,7 +182,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
VDSO_VCLOCK_PAGES_START(addr),
VDSO_NR_VCLOCK_PAGES * PAGE_SIZE,
VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
- VM_PFNMAP,
+ VM_PFNMAP|VM_SEALED_SYSMAP,
&vvar_vclock_mapping);
if (IS_ERR(vma)) {
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 72405d315b41..def3d9284254 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2274,6 +2274,7 @@ int set_mce_nospec(unsigned long pfn)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
}
+EXPORT_SYMBOL_GPL(set_mce_nospec);
/* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index cec321fb74f2..a05fcddfc811 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -20,7 +20,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm)
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_table(tlb, page_ptdesc(pte));
+ tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -34,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index b72772494655..289e365f84b2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -674,6 +674,105 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}
+/* Room for 8 entries */
+#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
+static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
+ CXL_CPER_PROT_ERR_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
+static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
+struct work_struct *cxl_cper_prot_err_work;
+
+static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
+ int severity)
+{
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+ struct cxl_cper_prot_err_work_data wd;
+ u8 *dvsec_start, *cap_start;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
+ pr_err_ratelimited("CXL CPER invalid agent type\n");
+ return;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err_ratelimited("CXL CPER invalid protocol error log\n");
+ return;
+ }
+
+ if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
+ pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
+ prot_err->err_len);
+ return;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
+ pr_warn(FW_WARN "CXL CPER no device serial number\n");
+
+ guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
+
+ if (!cxl_cper_prot_err_work)
+ return;
+
+ switch (prot_err->agent_type) {
+ case RCD:
+ case DEVICE:
+ case LD:
+ case FMLD:
+ case RP:
+ case DSP:
+ case USP:
+ memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));
+
+ dvsec_start = (u8 *)(prot_err + 1);
+ cap_start = dvsec_start + prot_err->dvsec_len;
+
+ memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap));
+ wd.severity = cper_severity_to_aer(severity);
+ break;
+ default:
+ pr_err_ratelimited("CXL CPER invalid agent type: %d\n",
+ prot_err->agent_type);
+ return;
+ }
+
+ if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
+ pr_err_ratelimited("CXL CPER kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_cper_prot_err_work);
+#endif
+}
+
+int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = work;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
+
+int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work != work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
+
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return kfifo_get(&cxl_cper_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
+
/* Room for 8 entries for each of the 4 event log queues */
#define CXL_CPER_FIFO_DEPTH 32
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
@@ -777,6 +876,10 @@ static bool ghes_do_proc(struct ghes *ghes,
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+
+ cxl_cper_post_prot_err(prot_err, gdata->error_severity);
} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index a5d47819b3a4..ae035b93da08 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -485,7 +485,7 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
cmd_mask = nd_desc->cmd_mask;
if (cmd == ND_CMD_CALL && call_pkg->nd_family) {
family = call_pkg->nd_family;
- if (family > NVDIMM_BUS_FAMILY_MAX ||
+ if (call_pkg->nd_family > NVDIMM_BUS_FAMILY_MAX ||
!test_bit(family, &nd_desc->bus_family_mask))
return -EINVAL;
family = array_index_nospec(family,
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index bfbb08b1e6af..9d9052258e92 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -108,6 +108,45 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm)
return NULL;
}
+/**
+ * hmat_get_extended_linear_cache_size - Retrieve the extended linear cache size
+ * @backing_res: resource from the backing media
+ * @nid: node id for the memory region
+ * @cache_size: (Output) size of extended linear cache.
+ *
+ * Return: 0 on success. Errno on failure.
+ *
+ */
+int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
+ resource_size_t *cache_size)
+{
+ unsigned int pxm = node_to_pxm(nid);
+ struct memory_target *target;
+ struct target_cache *tcache;
+ struct resource *res;
+
+ target = find_mem_target(pxm);
+ if (!target)
+ return -ENOENT;
+
+ list_for_each_entry(tcache, &target->caches, node) {
+ if (tcache->cache_attrs.address_mode !=
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR)
+ continue;
+
+ res = &target->memregions;
+ if (!resource_contains(res, backing_res))
+ continue;
+
+ *cache_size = tcache->cache_attrs.size;
+ return 0;
+ }
+
+ *cache_size = 0;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(hmat_get_extended_linear_cache_size, "CXL");
+
static struct memory_target *acpi_find_genport_target(u32 uid)
{
struct memory_target *target;
@@ -506,6 +545,11 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header,
switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) {
case ACPI_HMAT_CA_DIRECT_MAPPED:
tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP;
+ /* Extended Linear mode is only valid if cache is direct mapped */
+ if (cache->address_mode == ACPI_HMAT_CACHE_MODE_EXTENDED_LINEAR) {
+ tcache->cache_attrs.address_mode =
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR;
+ }
break;
case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING:
tcache->cache_attrs.indexing = NODE_CACHE_INDEXED;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ea653fa3433..cd13ef287011 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -244,12 +244,14 @@ CACHE_ATTR(size, "%llu")
CACHE_ATTR(line_size, "%u")
CACHE_ATTR(indexing, "%u")
CACHE_ATTR(write_policy, "%u")
+CACHE_ATTR(address_mode, "%#x")
static struct attribute *cache_attrs[] = {
&dev_attr_indexing.attr,
&dev_attr_size.attr,
&dev_attr_line_size.attr,
&dev_attr_write_policy.attr,
+ &dev_attr_address_mode.attr,
NULL,
};
ATTRIBUTE_GROUPS(cache);
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 8ac1e9d70eeb..cf1ba673b8c2 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -158,4 +158,8 @@ config CXL_REGION_INVALIDATION_TEST
If unsure, or if this kernel is meant for production environments,
say N.
+config CXL_MCE
+ def_bool y
+ depends on X86_MCE && MEMORY_FAILURE
+
endif
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index b0bfbd9eac9b..086df97a0fcf 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -14,6 +14,9 @@ cxl_core-y += pci.o
cxl_core-y += hdm.o
cxl_core-y += pmu.o
cxl_core-y += cdat.o
+cxl_core-y += ras.o
+cxl_core-y += acpi.o
cxl_core-$(CONFIG_TRACING) += trace.o
cxl_core-$(CONFIG_CXL_REGION) += region.o
+cxl_core-$(CONFIG_CXL_MCE) += mce.o
cxl_core-$(CONFIG_CXL_FEATURES) += features.o
diff --git a/drivers/cxl/core/acpi.c b/drivers/cxl/core/acpi.c
new file mode 100644
index 000000000000..f13b4dae6ac5
--- /dev/null
+++ b/drivers/cxl/core/acpi.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/acpi.h>
+#include "cxl.h"
+#include "core.h"
+
+int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size)
+{
+ return hmat_get_extended_linear_cache_size(backing_res, nid, size);
+}
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index 8153f8d83a16..edb4f41eeacc 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -258,27 +258,29 @@ static void update_perf_entry(struct device *dev, struct dsmas_entry *dent,
static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds,
struct xarray *dsmas_xa)
{
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct device *dev = cxlds->dev;
- struct range pmem_range = {
- .start = cxlds->pmem_res.start,
- .end = cxlds->pmem_res.end,
- };
- struct range ram_range = {
- .start = cxlds->ram_res.start,
- .end = cxlds->ram_res.end,
- };
struct dsmas_entry *dent;
unsigned long index;
xa_for_each(dsmas_xa, index, dent) {
- if (resource_size(&cxlds->ram_res) &&
- range_contains(&ram_range, &dent->dpa_range))
- update_perf_entry(dev, dent, &mds->ram_perf);
- else if (resource_size(&cxlds->pmem_res) &&
- range_contains(&pmem_range, &dent->dpa_range))
- update_perf_entry(dev, dent, &mds->pmem_perf);
- else
+ bool found = false;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct resource *res = &cxlds->part[i].res;
+ struct range range = {
+ .start = res->start,
+ .end = res->end,
+ };
+
+ if (range_contains(&range, &dent->dpa_range)) {
+ update_perf_entry(dev, dent,
+ &cxlds->part[i].perf);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
dev_dbg(dev, "no partition for dsmas dpa: %pra\n",
&dent->dpa_range);
}
@@ -343,36 +345,46 @@ static int match_cxlrd_hb(struct device *dev, void *data)
return 0;
}
-static int cxl_qos_class_verify(struct cxl_memdev *cxlmd)
+static void cxl_qos_class_verify(struct cxl_memdev *cxlmd)
{
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct cxl_port *root_port;
- int rc;
struct cxl_root *cxl_root __free(put_cxl_root) =
find_cxl_root(cxlmd->endpoint);
+ /*
+ * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to
+ * succeed when called in the cxl_endpoint_port_probe() path.
+ */
if (!cxl_root)
- return -ENODEV;
+ return;
root_port = &cxl_root->port;
- /* Check that the QTG IDs are all sane between end device and root decoders */
- if (!cxl_qos_match(root_port, &mds->ram_perf))
- reset_dpa_perf(&mds->ram_perf);
- if (!cxl_qos_match(root_port, &mds->pmem_perf))
- reset_dpa_perf(&mds->pmem_perf);
-
- /* Check to make sure that the device's host bridge is under a root decoder */
- rc = device_for_each_child(&root_port->dev,
- cxlmd->endpoint->host_bridge, match_cxlrd_hb);
- if (!rc) {
- reset_dpa_perf(&mds->ram_perf);
- reset_dpa_perf(&mds->pmem_perf);
+ /*
+ * Save userspace from needing to check if a qos class has any matches
+ * by hiding qos class info if the memdev is not mapped by a root
+ * decoder, or the partition class does not match any root decoder
+ * class.
+ */
+ if (!device_for_each_child(&root_port->dev,
+ cxlmd->endpoint->host_bridge,
+ match_cxlrd_hb)) {
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ reset_dpa_perf(perf);
+ }
+ return;
}
- return rc;
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ if (!cxl_qos_match(root_port, perf))
+ reset_dpa_perf(perf);
+ }
}
static void discard_dsmas(struct xarray *xa)
@@ -570,23 +582,18 @@ static bool dpa_perf_contains(struct cxl_dpa_perf *perf,
return range_contains(&perf->dpa_range, &dpa);
}
-static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode)
+static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct cxl_dpa_perf *perf;
- switch (mode) {
- case CXL_DECODER_RAM:
- perf = &mds->ram_perf;
- break;
- case CXL_DECODER_PMEM:
- perf = &mds->pmem_perf;
- break;
- default:
+ if (cxled->part < 0)
+ return ERR_PTR(-EINVAL);
+ perf = &cxlds->part[cxled->part].perf;
+
+ if (!perf)
return ERR_PTR(-EINVAL);
- }
if (!dpa_perf_contains(perf, cxled->dpa_res))
return ERR_PTR(-EINVAL);
@@ -647,11 +654,10 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
if (cxlds->rcd)
return -ENODEV;
- perf = cxled_get_dpa_perf(cxled, cxlr->mode);
+ perf = cxled_get_dpa_perf(cxled);
if (IS_ERR(perf))
return PTR_ERR(perf);
- gp_port = to_cxl_port(parent_port->dev.parent);
*gp_is_root = is_cxl_root(gp_port);
/*
@@ -1053,7 +1059,7 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
lockdep_assert_held(&cxl_dpa_rwsem);
- perf = cxled_get_dpa_perf(cxled, cxlr->mode);
+ perf = cxled_get_dpa_perf(cxled);
if (IS_ERR(perf))
return;
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 17e99a25c29a..15699299dc11 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -74,8 +74,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
resource_size_t length);
struct dentry *cxl_debugfs_create_dir(const char *dir);
-int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode);
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode);
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size);
int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
@@ -117,6 +117,12 @@ bool cxl_need_node_perf_attrs_update(int nid);
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
struct access_coordinate *c);
+int cxl_ras_init(void);
+void cxl_ras_exit(void);
+int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
+int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size);
+
#ifdef CONFIG_CXL_FEATURES
size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
enum cxl_get_feat_selection selection,
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 50e6a45b30ba..70cae4ebf8a4 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -213,16 +213,46 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
{
struct resource *p1, *p2;
- down_read(&cxl_dpa_rwsem);
+ guard(rwsem_read)(&cxl_dpa_rwsem);
for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
__cxl_dpa_debug(file, p1, 0);
for (p2 = p1->child; p2; p2 = p2->sibling)
__cxl_dpa_debug(file, p2, 1);
}
- up_read(&cxl_dpa_rwsem);
}
EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
+/* See request_skip() kernel-doc */
+static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len,
+ const char *requester)
+{
+ const resource_size_t skip_end = skip_base + skip_len - 1;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *part_res = &cxlds->part[i].res;
+ resource_size_t adjust_start, adjust_end, size;
+
+ adjust_start = max(skip_base, part_res->start);
+ adjust_end = min(skip_end, part_res->end);
+
+ if (adjust_end < adjust_start)
+ continue;
+
+ size = adjust_end - adjust_start + 1;
+
+ if (!requester)
+ __release_region(&cxlds->dpa_res, adjust_start, size);
+ else if (!__request_region(&cxlds->dpa_res, adjust_start, size,
+ requester, 0))
+ return adjust_start - skip_base;
+ }
+
+ return skip_len;
+}
+#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL)
+
/*
* Must be called in a context that synchronizes against this decoder's
* port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -241,7 +271,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
skip_start = res->start - cxled->skip;
__release_region(&cxlds->dpa_res, res->start, resource_size(res));
if (cxled->skip)
- __release_region(&cxlds->dpa_res, skip_start, cxled->skip);
+ release_skip(cxlds, skip_start, cxled->skip);
cxled->skip = 0;
cxled->dpa_res = NULL;
put_device(&cxled->cxld.dev);
@@ -250,9 +280,8 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
static void cxl_dpa_release(void *cxled)
{
- down_write(&cxl_dpa_rwsem);
+ guard(rwsem_write)(&cxl_dpa_rwsem);
__cxl_dpa_release(cxled);
- up_write(&cxl_dpa_rwsem);
}
/*
@@ -268,6 +297,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
__cxl_dpa_release(cxled);
}
+/**
+ * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree
+ * @cxlds: CXL.mem device context that parents @cxled
+ * @cxled: Endpoint decoder establishing new allocation that skips lower DPA
+ * @skip_base: DPA < start of new DPA allocation (DPAnew)
+ * @skip_len: @skip_base + @skip_len == DPAnew
+ *
+ * DPA 'skip' arises from out-of-sequence DPA allocation events relative
+ * to free capacity across multiple partitions. It is a wasteful event
+ * as usable DPA gets thrown away, but if a deployment has, for example,
+ * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM
+ * DPA, the free RAM DPA must be sacrificed to start allocating PMEM.
+ * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder
+ * Protection" for more details.
+ *
+ * A 'skip' always covers the last allocated DPA in a previous partition
+ * to the start of the current partition to allocate. Allocations never
+ * start in the middle of a partition, and allocations are always
+ * de-allocated in reverse order (see cxl_dpa_free(), or natural devm
+ * unwind order from forced in-order allocation).
+ *
+ * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip'
+ * would always be contained to a single partition. Given
+ * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip'
+ * might span "tail capacity of partition[0], all of partition[1], ...,
+ * all of partition[N-1]" to support allocating from partition[N]. That
+ * in turn interacts with the partition 'struct resource' boundaries
+ * within @cxlds->dpa_res whereby 'skip' requests need to be divided by
+ * partition. I.e. this is a quirk of using a 'struct resource' tree to
+ * detect range conflicts while also tracking partition boundaries in
+ * @cxlds->dpa_res.
+ */
+static int request_skip(struct cxl_dev_state *cxlds,
+ struct cxl_endpoint_decoder *cxled,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len)
+{
+ resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len,
+ dev_name(&cxled->cxld.dev));
+
+ if (skipped == skip_len)
+ return 0;
+
+ dev_dbg(cxlds->dev,
+ "%s: failed to reserve skipped space (%pa %pa %pa)\n",
+ dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped);
+
+ release_skip(cxlds, skip_base, skipped);
+
+ return -EBUSY;
+}
+
static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped)
@@ -277,6 +358,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &port->dev;
struct resource *res;
+ int rc;
lockdep_assert_held_write(&cxl_dpa_rwsem);
@@ -305,14 +387,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
}
if (skipped) {
- res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
- dev_name(&cxled->cxld.dev), 0);
- if (!res) {
- dev_dbg(dev,
- "decoder%d.%d: failed to reserve skipped space\n",
- port->id, cxled->cxld.id);
- return -EBUSY;
- }
+ rc = request_skip(cxlds, cxled, base - skipped, skipped);
+ if (rc)
+ return rc;
}
res = __request_region(&cxlds->dpa_res, base, len,
dev_name(&cxled->cxld.dev), 0);
@@ -320,28 +397,117 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
port->id, cxled->cxld.id);
if (skipped)
- __release_region(&cxlds->dpa_res, base - skipped,
- skipped);
+ release_skip(cxlds, base - skipped, skipped);
return -EBUSY;
}
cxled->dpa_res = res;
cxled->skip = skipped;
- if (resource_contains(&cxlds->pmem_res, res))
- cxled->mode = CXL_DECODER_PMEM;
- else if (resource_contains(&cxlds->ram_res, res))
- cxled->mode = CXL_DECODER_RAM;
- else {
- dev_warn(dev, "decoder%d.%d: %pr mixed mode not supported\n",
- port->id, cxled->cxld.id, cxled->dpa_res);
- cxled->mode = CXL_DECODER_MIXED;
- }
+ /*
+ * When allocating new capacity, ->part is already set, when
+ * discovering decoder settings at initial enumeration, ->part
+ * is not set.
+ */
+ if (cxled->part < 0)
+ for (int i = 0; cxlds->nr_partitions; i++)
+ if (resource_contains(&cxlds->part[i].res, res)) {
+ cxled->part = i;
+ break;
+ }
+
+ if (cxled->part < 0)
+ dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
+ port->id, cxled->cxld.id, res);
port->hdm_end++;
get_device(&cxled->cxld.dev);
return 0;
}
+static int add_dpa_res(struct device *dev, struct resource *parent,
+ struct resource *res, resource_size_t start,
+ resource_size_t size, const char *type)
+{
+ int rc;
+
+ *res = (struct resource) {
+ .name = type,
+ .start = start,
+ .end = start + size - 1,
+ .flags = IORESOURCE_MEM,
+ };
+ if (resource_size(res) == 0) {
+ dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
+ return 0;
+ }
+ rc = request_resource(parent, res);
+ if (rc) {
+ dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
+ res, rc);
+ return rc;
+ }
+
+ dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+
+ return 0;
+}
+
+static const char *cxl_mode_name(enum cxl_partition_mode mode)
+{
+ switch (mode) {
+ case CXL_PARTMODE_RAM:
+ return "ram";
+ case CXL_PARTMODE_PMEM:
+ return "pmem";
+ default:
+ return "";
+ };
+}
+
+/* if this fails the caller must destroy @cxlds, there is no recovery */
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info)
+{
+ struct device *dev = cxlds->dev;
+
+ guard(rwsem_write)(&cxl_dpa_rwsem);
+
+ if (cxlds->nr_partitions)
+ return -EBUSY;
+
+ if (!info->size || !info->nr_partitions) {
+ cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
+ cxlds->nr_partitions = 0;
+ return 0;
+ }
+
+ cxlds->dpa_res = DEFINE_RES_MEM(0, info->size);
+
+ for (int i = 0; i < info->nr_partitions; i++) {
+ const struct cxl_dpa_part_info *part = &info->part[i];
+ int rc;
+
+ cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID;
+ cxlds->part[i].mode = part->mode;
+
+ /* Require ordered + contiguous partitions */
+ if (i) {
+ const struct cxl_dpa_part_info *prev = &info->part[i - 1];
+
+ if (prev->range.end + 1 != part->range.start)
+ return -EINVAL;
+ }
+ rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res,
+ part->range.start, range_len(&part->range),
+ cxl_mode_name(part->mode));
+ if (rc)
+ return rc;
+ cxlds->nr_partitions++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_dpa_setup);
+
int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped)
@@ -362,14 +528,11 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL");
resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
{
- resource_size_t size = 0;
-
- down_read(&cxl_dpa_rwsem);
+ guard(rwsem_read)(&cxl_dpa_rwsem);
if (cxled->dpa_res)
- size = resource_size(cxled->dpa_res);
- up_read(&cxl_dpa_rwsem);
+ return resource_size(cxled->dpa_res);
- return size;
+ return 0;
}
resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
@@ -387,151 +550,136 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
{
struct cxl_port *port = cxled_to_port(cxled);
struct device *dev = &cxled->cxld.dev;
- int rc;
- down_write(&cxl_dpa_rwsem);
- if (!cxled->dpa_res) {
- rc = 0;
- goto out;
- }
+ guard(rwsem_write)(&cxl_dpa_rwsem);
+ if (!cxled->dpa_res)
+ return 0;
if (cxled->cxld.region) {
dev_dbg(dev, "decoder assigned to: %s\n",
dev_name(&cxled->cxld.region->dev));
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
dev_dbg(dev, "decoder enabled\n");
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.id != port->hdm_end) {
dev_dbg(dev, "expected decoder%d.%d\n", port->id,
port->hdm_end);
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
+
devm_cxl_dpa_release(cxled);
- rc = 0;
-out:
- up_write(&cxl_dpa_rwsem);
- return rc;
+ return 0;
}
-int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode)
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &cxled->cxld.dev;
-
- switch (mode) {
- case CXL_DECODER_RAM:
- case CXL_DECODER_PMEM:
- break;
- default:
- dev_dbg(dev, "unsupported mode: %d\n", mode);
- return -EINVAL;
- }
+ int part;
guard(rwsem_write)(&cxl_dpa_rwsem);
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE)
return -EBUSY;
- /*
- * Only allow modes that are supported by the current partition
- * configuration
- */
- if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) {
- dev_dbg(dev, "no available pmem capacity\n");
- return -ENXIO;
+ for (part = 0; part < cxlds->nr_partitions; part++)
+ if (cxlds->part[part].mode == mode)
+ break;
+
+ if (part >= cxlds->nr_partitions) {
+ dev_dbg(dev, "unsupported mode: %d\n", mode);
+ return -EINVAL;
}
- if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) {
- dev_dbg(dev, "no available ram capacity\n");
+
+ if (!resource_size(&cxlds->part[part].res)) {
+ dev_dbg(dev, "no available capacity for mode: %d\n", mode);
return -ENXIO;
}
- cxled->mode = mode;
+ cxled->part = part;
return 0;
}
-int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- resource_size_t free_ram_start, free_pmem_start;
- struct cxl_port *port = cxled_to_port(cxled);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &cxled->cxld.dev;
- resource_size_t start, avail, skip;
+ struct resource *res, *prev = NULL;
+ resource_size_t start, avail, skip, skip_start;
struct resource *p, *last;
- int rc;
+ int part;
- down_write(&cxl_dpa_rwsem);
+ guard(rwsem_write)(&cxl_dpa_rwsem);
if (cxled->cxld.region) {
dev_dbg(dev, "decoder attached to %s\n",
dev_name(&cxled->cxld.region->dev));
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
dev_dbg(dev, "decoder enabled\n");
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
- for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
- last = p;
- if (last)
- free_ram_start = last->end + 1;
- else
- free_ram_start = cxlds->ram_res.start;
+ part = cxled->part;
+ if (part < 0) {
+ dev_dbg(dev, "partition not set\n");
+ return -EBUSY;
+ }
- for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling)
+ res = &cxlds->part[part].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
last = p;
if (last)
- free_pmem_start = last->end + 1;
+ start = last->end + 1;
else
- free_pmem_start = cxlds->pmem_res.start;
-
- if (cxled->mode == CXL_DECODER_RAM) {
- start = free_ram_start;
- avail = cxlds->ram_res.end - start + 1;
- skip = 0;
- } else if (cxled->mode == CXL_DECODER_PMEM) {
- resource_size_t skip_start, skip_end;
-
- start = free_pmem_start;
- avail = cxlds->pmem_res.end - start + 1;
- skip_start = free_ram_start;
+ start = res->start;
- /*
- * If some pmem is already allocated, then that allocation
- * already handled the skip.
- */
- if (cxlds->pmem_res.child &&
- skip_start == cxlds->pmem_res.child->start)
- skip_end = skip_start - 1;
- else
- skip_end = start - 1;
- skip = skip_end - skip_start + 1;
- } else {
- dev_dbg(dev, "mode not set\n");
- rc = -EINVAL;
- goto out;
+ /*
+ * To allocate at partition N, a skip needs to be calculated for all
+ * unallocated space at lower partitions indices.
+ *
+ * If a partition has any allocations, the search can end because a
+ * previous cxl_dpa_alloc() invocation is assumed to have accounted for
+ * all previous partitions.
+ */
+ skip_start = CXL_RESOURCE_NONE;
+ for (int i = part; i; i--) {
+ prev = &cxlds->part[i - 1].res;
+ for (p = prev->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last) {
+ skip_start = last->end + 1;
+ break;
+ }
+ skip_start = prev->start;
}
+ avail = res->end - start + 1;
+ if (skip_start == CXL_RESOURCE_NONE)
+ skip = 0;
+ else
+ skip = res->start - skip_start;
+
if (size > avail) {
dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
- cxl_decoder_mode_name(cxled->mode), &avail);
- rc = -ENOSPC;
- goto out;
+ res->name, &avail);
+ return -ENOSPC;
}
- rc = __cxl_dpa_reserve(cxled, start, size, skip);
-out:
- up_write(&cxl_dpa_rwsem);
+ return __cxl_dpa_reserve(cxled, start, size, skip);
+}
+
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+ int rc;
+ rc = __cxl_dpa_alloc(cxled, size);
if (rc)
return rc;
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 78c5346e3e89..d72764056ce6 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -11,6 +11,7 @@
#include "core.h"
#include "trace.h"
+#include "mce.h"
static bool cxl_raw_allow_all;
@@ -900,7 +901,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
}
if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
- u64 dpa, hpa = ULLONG_MAX;
+ u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX;
struct cxl_region *cxlr;
/*
@@ -913,14 +914,20 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
cxlr = cxl_dpa_to_region(cxlmd, dpa);
- if (cxlr)
+ if (cxlr) {
+ u64 cache_size = cxlr->params.cache_size;
+
hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa);
+ if (cache_size)
+ hpa_alias = hpa - cache_size;
+ }
if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
trace_cxl_general_media(cxlmd, type, cxlr, hpa,
- &evt->gen_media);
+ hpa_alias, &evt->gen_media);
else if (event_type == CXL_CPER_EVENT_DRAM)
- trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram);
+ trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias,
+ &evt->dram);
}
}
EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL");
@@ -1126,10 +1133,6 @@ static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
mds->active_persistent_bytes =
le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
- mds->next_volatile_bytes =
- le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
- mds->next_persistent_bytes =
- le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
return 0;
}
@@ -1251,74 +1254,54 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
{
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
struct cxl_port *endpoint;
- int rc;
/* synchronize with cxl_mem_probe() and decoder write operations */
guard(device)(&cxlmd->dev);
endpoint = cxlmd->endpoint;
- down_read(&cxl_region_rwsem);
+ guard(rwsem_read)(&cxl_region_rwsem);
/*
* Require an endpoint to be safe otherwise the driver can not
* be sure that the device is unmapped.
*/
if (endpoint && cxl_num_decoders_committed(endpoint) == 0)
- rc = __cxl_mem_sanitize(mds, cmd);
- else
- rc = -EBUSY;
- up_read(&cxl_region_rwsem);
+ return __cxl_mem_sanitize(mds, cmd);
- return rc;
+ return -EBUSY;
}
-static int add_dpa_res(struct device *dev, struct resource *parent,
- struct resource *res, resource_size_t start,
- resource_size_t size, const char *type)
+static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
{
- int rc;
-
- res->name = type;
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_MEM;
- if (resource_size(res) == 0) {
- dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
- return 0;
- }
- rc = request_resource(parent, res);
- if (rc) {
- dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
- res, rc);
- return rc;
- }
+ int i = info->nr_partitions;
- dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+ if (size == 0)
+ return;
- return 0;
+ info->part[i].range = (struct range) {
+ .start = start,
+ .end = start + size - 1,
+ };
+ info->part[i].mode = mode;
+ info->nr_partitions++;
}
-int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
{
struct cxl_dev_state *cxlds = &mds->cxlds;
struct device *dev = cxlds->dev;
int rc;
if (!cxlds->media_ready) {
- cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
- cxlds->ram_res = DEFINE_RES_MEM(0, 0);
- cxlds->pmem_res = DEFINE_RES_MEM(0, 0);
+ info->size = 0;
return 0;
}
- cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
+ info->size = mds->total_bytes;
if (mds->partition_align_bytes == 0) {
- rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
- mds->volatile_only_bytes, "ram");
- if (rc)
- return rc;
- return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
- mds->volatile_only_bytes,
- mds->persistent_only_bytes, "pmem");
+ add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->volatile_only_bytes,
+ mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
+ return 0;
}
rc = cxl_mem_get_partition_info(mds);
@@ -1327,15 +1310,52 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
return rc;
}
- rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
- mds->active_volatile_bytes, "ram");
- if (rc)
- return rc;
- return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
- mds->active_volatile_bytes,
- mds->active_persistent_bytes, "pmem");
+ add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
+ CXL_PARTMODE_PMEM);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
+
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_health_info_out hi;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_HEALTH_INFO,
+ .size_out = sizeof(hi),
+ .payload_out = &hi,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (!rc)
+ *count = le32_to_cpu(hi.dirty_shutdown_cnt);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL");
+
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_mbox_set_shutdown_state_in in = {
+ .state = 1
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+ .size_in = sizeof(in),
+ .payload_in = &in,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
}
-EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL");
int cxl_set_timestamp(struct cxl_memdev_state *mds)
{
@@ -1467,6 +1487,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL");
struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
{
struct cxl_memdev_state *mds;
+ int rc;
mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL);
if (!mds) {
@@ -1480,8 +1501,12 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
mds->cxlds.cxl_mbox.host = dev;
mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE;
mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
- mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
- mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
+
+ rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier);
+ if (rc == -EOPNOTSUPP)
+ dev_warn(dev, "CXL MCE unsupported\n");
+ else if (rc)
+ return ERR_PTR(rc);
return mds;
}
diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c
new file mode 100644
index 000000000000..ff8d078c6ca1
--- /dev/null
+++ b/drivers/cxl/core/mce.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/set_memory.h>
+#include <asm/mce.h>
+#include <cxlmem.h>
+#include "mce.h"
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+ mce_notifier);
+ struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+ struct cxl_port *endpoint = cxlmd->endpoint;
+ struct mce *mce = data;
+ u64 spa, spa_alias;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (!endpoint)
+ return NOTIFY_DONE;
+
+ spa = mce->addr & MCI_ADDR_PHYSADDR;
+
+ pfn = spa >> PAGE_SHIFT;
+ if (!pfn_valid(pfn))
+ return NOTIFY_DONE;
+
+ spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa);
+ if (spa_alias == ~0ULL)
+ return NOTIFY_DONE;
+
+ pfn = spa_alias >> PAGE_SHIFT;
+
+ /*
+ * Take down the aliased memory page. The original memory page flagged
+ * by the MCE will be taken cared of by the standard MCE handler.
+ */
+ dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address0: %#llx\n",
+ spa_alias);
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+
+ return NOTIFY_OK;
+}
+
+static void cxl_unregister_mce_notifier(void *mce_notifier)
+{
+ mce_unregister_decode_chain(mce_notifier);
+}
+
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ mce_notifier->notifier_call = cxl_handle_mce;
+ mce_notifier->priority = MCE_PRIO_UC;
+ mce_register_decode_chain(mce_notifier);
+
+ return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier,
+ mce_notifier);
+}
diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
new file mode 100644
index 000000000000..ace73424eeb6
--- /dev/null
+++ b/drivers/cxl/core/mce.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#ifndef _CXL_CORE_MCE_H_
+#define _CXL_CORE_MCE_H_
+
+#include <linux/notifier.h>
+
+#ifdef CONFIG_CXL_MCE
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifer);
+#else
+static inline int
+devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 2e2e035abdaa..a16a5886d40a 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -75,12 +75,20 @@ static ssize_t label_storage_size_show(struct device *dev,
}
static DEVICE_ATTR_RO(label_storage_size);
+static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds)
+{
+ /* Static RAM is only expected at partition 0. */
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return 0;
+ return resource_size(&cxlds->part[0].res);
+}
+
static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- unsigned long long len = resource_size(&cxlds->ram_res);
+ unsigned long long len = cxl_ram_size(cxlds);
return sysfs_emit(buf, "%#llx\n", len);
}
@@ -93,7 +101,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- unsigned long long len = resource_size(&cxlds->pmem_res);
+ unsigned long long len = cxl_pmem_size(cxlds);
return sysfs_emit(buf, "%#llx\n", len);
}
@@ -198,22 +206,17 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd)
int rc = 0;
/* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */
- if (resource_size(&cxlds->pmem_res)) {
- offset = cxlds->pmem_res.start;
- length = resource_size(&cxlds->pmem_res);
- rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc)
- return rc;
- }
- if (resource_size(&cxlds->ram_res)) {
- offset = cxlds->ram_res.start;
- length = resource_size(&cxlds->ram_res);
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *res = &cxlds->part[i].res;
+
+ offset = res->start;
+ length = resource_size(res);
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
/*
* Invalid Physical Address is not an error for
* volatile addresses. Device support is optional.
*/
- if (rc == -EFAULT)
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
rc = 0;
}
return rc;
@@ -404,14 +407,21 @@ static struct attribute *cxl_memdev_attributes[] = {
NULL,
};
+static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds)
+{
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return &cxlds->part[i].perf;
+ return NULL;
+}
+
static ssize_t pmem_qos_class_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
- return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class);
+ return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class);
}
static struct device_attribute dev_attr_pmem_qos_class =
@@ -423,14 +433,20 @@ static struct attribute *cxl_memdev_pmem_attributes[] = {
NULL,
};
+static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds)
+{
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return NULL;
+ return &cxlds->part[0].perf;
+}
+
static ssize_t ram_qos_class_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
- return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class);
+ return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class);
}
static struct device_attribute dev_attr_ram_qos_class =
@@ -466,11 +482,11 @@ static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n)
{
struct device *dev = kobj_to_dev(kobj);
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds);
- if (a == &dev_attr_ram_qos_class.attr)
- if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID)
- return 0;
+ if (a == &dev_attr_ram_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
return a->mode;
}
@@ -485,11 +501,11 @@ static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n
{
struct device *dev = kobj_to_dev(kobj);
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds);
- if (a == &dev_attr_pmem_qos_class.attr)
- if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID)
- return 0;
+ if (a == &dev_attr_pmem_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
return a->mode;
}
@@ -566,10 +582,9 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
bitmap_or(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
cmds, CXL_MEM_COMMAND_ID_MAX);
- up_write(&cxl_memdev_rwsem);
}
EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL");
@@ -583,10 +598,9 @@ void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
bitmap_andnot(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
cmds, CXL_MEM_COMMAND_ID_MAX);
- up_write(&cxl_memdev_rwsem);
}
EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL");
@@ -594,9 +608,8 @@ static void cxl_memdev_shutdown(struct device *dev)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
cxlmd->cxlds = NULL;
- up_write(&cxl_memdev_rwsem);
}
static void cxl_memdev_unregister(void *_cxlmd)
@@ -678,15 +691,13 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
{
struct cxl_memdev *cxlmd = file->private_data;
struct cxl_dev_state *cxlds;
- int rc = -ENXIO;
- down_read(&cxl_memdev_rwsem);
+ guard(rwsem_read)(&cxl_memdev_rwsem);
cxlds = cxlmd->cxlds;
if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM)
- rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
- up_read(&cxl_memdev_rwsem);
+ return __cxl_memdev_ioctl(cxlmd, cmd, arg);
- return rc;
+ return -ENXIO;
}
static int cxl_memdev_open(struct inode *inode, struct file *file)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 013b869b66cb..96fecb799cbc 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1054,3 +1054,100 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
return 0;
}
+
+/*
+ * Set max timeout such that platforms will optimize GPF flow to avoid
+ * the implied worst-case scenario delays. On a sane platform, all
+ * devices should always complete GPF within the energy budget of
+ * the GPF flow. The kernel does not have enough information to pick
+ * anything better than "maximize timeouts and hope it works".
+ *
+ * A misbehaving device could block forward progress of GPF for all
+ * the other devices, exhausting the energy budget of the platform.
+ * However, the spec seems to assume that moving on from slow to respond
+ * devices is a virtue. It is not possible to know that, in actuality,
+ * the slow to respond device is *the* most critical device in the
+ * system to wait.
+ */
+#define GPF_TIMEOUT_BASE_MAX 2
+#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
+
+u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port)
+{
+ u16 dvsec;
+
+ if (!dev_is_pci(dev))
+ return 0;
+
+ dvsec = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL,
+ is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF);
+ if (!dvsec)
+ dev_warn(dev, "%s GPF DVSEC not present\n",
+ is_port ? "Port" : "Device");
+ return dvsec;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL");
+
+static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
+{
+ u64 base, scale;
+ int rc, offset;
+ u16 ctrl;
+
+ switch (phase) {
+ case 1:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
+ break;
+ case 2:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
+ if (rc)
+ return rc;
+
+ if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
+ FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
+ return 0;
+
+ ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
+ ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);
+
+ rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
+ if (!rc)
+ pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
+ phase, GPF_TIMEOUT_BASE_MAX);
+
+ return rc;
+}
+
+int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port)
+{
+ struct pci_dev *pdev;
+
+ if (!port)
+ return -EINVAL;
+
+ if (!port->gpf_dvsec) {
+ int dvsec;
+
+ dvsec = cxl_gpf_get_dvsec(dport_dev, true);
+ if (!dvsec)
+ return -EINVAL;
+
+ port->gpf_dvsec = dvsec;
+ }
+
+ pdev = to_pci_dev(dport_dev);
+ update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1);
+ update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2);
+
+ return 0;
+}
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 78a5c2c25982..0fd6646c1a2e 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -194,25 +194,35 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ /* without @cxl_dpa_rwsem, make sure @part is not reloaded */
+ int part = READ_ONCE(cxled->part);
+ const char *desc;
+
+ if (part < 0)
+ desc = "none";
+ else
+ desc = cxlds->part[part].res.name;
- return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxled->mode));
+ return sysfs_emit(buf, "%s\n", desc);
}
static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
- enum cxl_decoder_mode mode;
+ enum cxl_partition_mode mode;
ssize_t rc;
if (sysfs_streq(buf, "pmem"))
- mode = CXL_DECODER_PMEM;
+ mode = CXL_PARTMODE_PMEM;
else if (sysfs_streq(buf, "ram"))
- mode = CXL_DECODER_RAM;
+ mode = CXL_PARTMODE_RAM;
else
return -EINVAL;
- rc = cxl_dpa_set_mode(cxled, mode);
+ rc = cxl_dpa_set_part(cxled, mode);
if (rc)
return rc;
@@ -549,13 +559,9 @@ static ssize_t decoders_committed_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_port *port = to_cxl_port(dev);
- int rc;
-
- down_read(&cxl_region_rwsem);
- rc = sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
- up_read(&cxl_region_rwsem);
- return rc;
+ guard(rwsem_read)(&cxl_region_rwsem);
+ return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
}
static DEVICE_ATTR_RO(decoders_committed);
@@ -1672,6 +1678,8 @@ retry:
if (rc && rc != -EBUSY)
return rc;
+ cxl_gpf_port_setup(dport_dev, port);
+
/* Any more ports to add between this one and the root? */
if (!dev_is_cxl_root_child(&port->dev))
continue;
@@ -1899,6 +1907,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
return ERR_PTR(-ENOMEM);
cxled->pos = -1;
+ cxled->part = -1;
cxld = &cxled->cxld;
rc = cxl_decoder_init(port, cxld);
if (rc) {
@@ -2339,8 +2348,14 @@ static __init int cxl_core_init(void)
if (rc)
goto err_region;
+ rc = cxl_ras_init();
+ if (rc)
+ goto err_ras;
+
return 0;
+err_ras:
+ cxl_region_exit();
err_region:
bus_unregister(&cxl_bus_type);
err_bus:
@@ -2352,6 +2367,7 @@ err_wq:
static void cxl_core_exit(void)
{
+ cxl_ras_exit();
cxl_region_exit();
bus_unregister(&cxl_bus_type);
destroy_workqueue(cxl_bus_wq);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
new file mode 100644
index 000000000000..485a831695c7
--- /dev/null
+++ b/drivers/cxl/core/ras.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <cxl/event.h>
+#include <cxlmem.h>
+#include "trace.h"
+
+static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+ trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+}
+
+static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ u32 fe;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+ struct cxl_dev_state *cxlds;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+
+static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ struct cxl_dev_state *cxlds;
+ u32 fe;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+ unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+ data->prot_err.agent_addr.function);
+ struct pci_dev *pdev __free(pci_dev_put) =
+ pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+ data->prot_err.agent_addr.bus,
+ devfn);
+ int port_type;
+
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+
+ port_type = pci_pcie_type(pdev);
+ if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+ port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+ port_type == PCI_EXP_TYPE_UPSTREAM) {
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
+
+ return;
+ }
+
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+ struct cxl_cper_prot_err_work_data wd;
+
+ while (cxl_cper_prot_err_kfifo_get(&wd))
+ cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
+int cxl_ras_init(void)
+{
+ return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+}
+
+void cxl_ras_exit(void)
+{
+ cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+ cancel_work_sync(&cxl_cper_prot_err_work);
+}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e8d11a988fd9..c3f4dc244df7 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -144,7 +144,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
rc = down_read_interruptible(&cxl_region_rwsem);
if (rc)
return rc;
- if (cxlr->mode != CXL_DECODER_PMEM)
+ if (cxlr->mode != CXL_PARTMODE_PMEM)
rc = sysfs_emit(buf, "\n");
else
rc = sysfs_emit(buf, "%pUb\n", &p->uuid);
@@ -441,7 +441,7 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
* Support tooling that expects to find a 'uuid' attribute for all
* regions regardless of mode.
*/
- if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM)
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
return 0444;
return a->mode;
}
@@ -603,8 +603,16 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_region *cxlr = to_cxl_region(dev);
+ const char *desc;
- return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxlr->mode));
+ if (cxlr->mode == CXL_PARTMODE_RAM)
+ desc = "ram";
+ else if (cxlr->mode == CXL_PARTMODE_PMEM)
+ desc = "pmem";
+ else
+ desc = "";
+
+ return sysfs_emit(buf, "%s\n", desc);
}
static DEVICE_ATTR_RO(mode);
@@ -630,7 +638,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
/* ways, granularity and uuid (if PMEM) need to be set before HPA */
if (!p->interleave_ways || !p->interleave_granularity ||
- (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid)))
+ (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
return -ENXIO;
div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
@@ -824,6 +832,21 @@ static int match_free_decoder(struct device *dev, const void *data)
return 1;
}
+static bool region_res_match_cxl_range(const struct cxl_region_params *p,
+ struct range *range)
+{
+ if (!p->res)
+ return false;
+
+ /*
+ * If an extended linear cache region then the CXL range is assumed
+ * to be fronted by the DRAM range in current known implementation.
+ * This assumption will be made until a variant implementation exists.
+ */
+ return p->res->start + p->cache_size == range->start &&
+ p->res->end == range->end;
+}
+
static int match_auto_decoder(struct device *dev, const void *data)
{
const struct cxl_region_params *p = data;
@@ -836,7 +859,7 @@ static int match_auto_decoder(struct device *dev, const void *data)
cxld = to_cxl_decoder(dev);
r = &cxld->hpa_range;
- if (p->res && p->res->start == r->start && p->res->end == r->end)
+ if (region_res_match_cxl_range(p, r))
return 1;
return 0;
@@ -1424,8 +1447,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
if (cxld->interleave_ways != iw ||
cxld->interleave_granularity != ig ||
- cxld->hpa_range.start != p->res->start ||
- cxld->hpa_range.end != p->res->end ||
+ !region_res_match_cxl_range(p, &cxld->hpa_range) ||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
dev_err(&cxlr->dev,
"%s:%s %s expected iw: %d ig: %d %pr\n",
@@ -1888,6 +1910,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct cxl_region_params *p = &cxlr->params;
struct cxl_port *ep_port, *root_port;
struct cxl_dport *dport;
@@ -1902,17 +1925,17 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return rc;
}
- if (cxled->mode != cxlr->mode) {
- dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
- dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
- return -EINVAL;
- }
-
- if (cxled->mode == CXL_DECODER_DEAD) {
+ if (cxled->part < 0) {
dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
return -ENODEV;
}
+ if (cxlds->part[cxled->part].mode != cxlr->mode) {
+ dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
+ dev_name(&cxled->cxld.dev), cxlr->mode);
+ return -EINVAL;
+ }
+
/* all full of members, or interleave config not established? */
if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
dev_dbg(&cxlr->dev, "region already active\n");
@@ -1951,13 +1974,13 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return -ENXIO;
}
- if (resource_size(cxled->dpa_res) * p->interleave_ways !=
+ if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
resource_size(p->res)) {
dev_dbg(&cxlr->dev,
- "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n",
+ "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
(u64)resource_size(cxled->dpa_res), p->interleave_ways,
- (u64)resource_size(p->res));
+ (u64)p->cache_size, (u64)resource_size(p->res));
return -EINVAL;
}
@@ -2115,7 +2138,7 @@ out:
void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
{
down_write(&cxl_region_rwsem);
- cxled->mode = CXL_DECODER_DEAD;
+ cxled->part = -1;
cxl_region_detach(cxled);
up_write(&cxl_region_rwsem);
}
@@ -2471,7 +2494,7 @@ static int cxl_region_calculate_adistance(struct notifier_block *nb,
*/
static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
int id,
- enum cxl_decoder_mode mode,
+ enum cxl_partition_mode mode,
enum cxl_decoder_type type)
{
struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
@@ -2525,13 +2548,13 @@ static ssize_t create_ram_region_show(struct device *dev,
}
static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
- enum cxl_decoder_mode mode, int id)
+ enum cxl_partition_mode mode, int id)
{
int rc;
switch (mode) {
- case CXL_DECODER_RAM:
- case CXL_DECODER_PMEM:
+ case CXL_PARTMODE_RAM:
+ case CXL_PARTMODE_PMEM:
break;
default:
dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
@@ -2551,7 +2574,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
}
static ssize_t create_region_store(struct device *dev, const char *buf,
- size_t len, enum cxl_decoder_mode mode)
+ size_t len, enum cxl_partition_mode mode)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
struct cxl_region *cxlr;
@@ -2572,7 +2595,7 @@ static ssize_t create_pmem_region_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return create_region_store(dev, buf, len, CXL_DECODER_PMEM);
+ return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
}
DEVICE_ATTR_RW(create_pmem_region);
@@ -2580,7 +2603,7 @@ static ssize_t create_ram_region_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return create_region_store(dev, buf, len, CXL_DECODER_RAM);
+ return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
}
DEVICE_ATTR_RW(create_ram_region);
@@ -2678,7 +2701,7 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL");
struct cxl_poison_context {
struct cxl_port *port;
- enum cxl_decoder_mode mode;
+ int part;
u64 offset;
};
@@ -2686,47 +2709,45 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
struct cxl_poison_context *ctx)
{
struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ const struct resource *res;
+ struct resource *p, *last;
u64 offset, length;
int rc = 0;
+ if (ctx->part < 0)
+ return 0;
+
/*
- * Collect poison for the remaining unmapped resources
- * after poison is collected by committed endpoints.
- *
- * Knowing that PMEM must always follow RAM, get poison
- * for unmapped resources based on the last decoder's mode:
- * ram: scan remains of ram range, then any pmem range
- * pmem: scan remains of pmem range
+ * Collect poison for the remaining unmapped resources after
+ * poison is collected by committed endpoints decoders.
*/
-
- if (ctx->mode == CXL_DECODER_RAM) {
- offset = ctx->offset;
- length = resource_size(&cxlds->ram_res) - offset;
+ for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
+ res = &cxlds->part[i].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last)
+ offset = last->end + 1;
+ else
+ offset = res->start;
+ length = res->end - offset + 1;
+ if (!length)
+ break;
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc == -EFAULT)
- rc = 0;
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
+ continue;
if (rc)
- return rc;
- }
- if (ctx->mode == CXL_DECODER_PMEM) {
- offset = ctx->offset;
- length = resource_size(&cxlds->dpa_res) - offset;
- if (!length)
- return 0;
- } else if (resource_size(&cxlds->pmem_res)) {
- offset = cxlds->pmem_res.start;
- length = resource_size(&cxlds->pmem_res);
- } else {
- return 0;
+ break;
}
- return cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ return rc;
}
static int poison_by_decoder(struct device *dev, void *arg)
{
struct cxl_poison_context *ctx = arg;
struct cxl_endpoint_decoder *cxled;
+ enum cxl_partition_mode mode;
+ struct cxl_dev_state *cxlds;
struct cxl_memdev *cxlmd;
u64 offset, length;
int rc = 0;
@@ -2735,27 +2756,18 @@ static int poison_by_decoder(struct device *dev, void *arg)
return rc;
cxled = to_cxl_endpoint_decoder(dev);
- if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
- return rc;
-
- /*
- * Regions are only created with single mode decoders: pmem or ram.
- * Linux does not support mixed mode decoders. This means that
- * reading poison per endpoint decoder adheres to the requirement
- * that poison reads of pmem and ram must be separated.
- * CXL 3.0 Spec 8.2.9.8.4.1
- */
- if (cxled->mode == CXL_DECODER_MIXED) {
- dev_dbg(dev, "poison list read unsupported in mixed mode\n");
+ if (!cxled->dpa_res)
return rc;
- }
cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+ mode = cxlds->part[cxled->part].mode;
+
if (cxled->skip) {
offset = cxled->dpa_res->start - cxled->skip;
length = cxled->skip;
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
rc = 0;
if (rc)
return rc;
@@ -2764,7 +2776,7 @@ static int poison_by_decoder(struct device *dev, void *arg)
offset = cxled->dpa_res->start;
length = cxled->dpa_res->end - offset + 1;
rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
- if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
rc = 0;
if (rc)
return rc;
@@ -2772,7 +2784,7 @@ static int poison_by_decoder(struct device *dev, void *arg)
/* Iterate until commit_end is reached */
if (cxled->cxld.id == ctx->port->commit_end) {
ctx->offset = cxled->dpa_res->end + 1;
- ctx->mode = cxled->mode;
+ ctx->part = cxled->part;
return 1;
}
@@ -2785,7 +2797,8 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port)
int rc = 0;
ctx = (struct cxl_poison_context) {
- .port = port
+ .port = port,
+ .part = -1,
};
rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
@@ -2921,7 +2934,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
/* Apply the hpa_offset to the region base address */
- hpa = hpa_offset + p->res->start;
+ hpa = hpa_offset + p->res->start + p->cache_size;
/* Root decoder translation overrides typical modulo decode */
if (cxlrd->hpa_to_spa)
@@ -3038,17 +3051,13 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
struct cxl_dax_region *cxlr_dax;
struct device *dev;
- down_read(&cxl_region_rwsem);
- if (p->state != CXL_CONFIG_COMMIT) {
- cxlr_dax = ERR_PTR(-ENXIO);
- goto out;
- }
+ guard(rwsem_read)(&cxl_region_rwsem);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return ERR_PTR(-ENXIO);
cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
- if (!cxlr_dax) {
- cxlr_dax = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (!cxlr_dax)
+ return ERR_PTR(-ENOMEM);
cxlr_dax->hpa_range.start = p->res->start;
cxlr_dax->hpa_range.end = p->res->end;
@@ -3061,8 +3070,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
dev->parent = &cxlr->dev;
dev->bus = &cxl_bus_type;
dev->type = &cxl_dax_region_type;
-out:
- up_read(&cxl_region_rwsem);
return cxlr_dax;
}
@@ -3208,7 +3215,6 @@ static int match_region_by_range(struct device *dev, const void *data)
struct cxl_region_params *p;
struct cxl_region *cxlr;
const struct range *r = data;
- int rc = 0;
if (!is_cxl_region(dev))
return 0;
@@ -3216,60 +3222,96 @@ static int match_region_by_range(struct device *dev, const void *data)
cxlr = to_cxl_region(dev);
p = &cxlr->params;
- down_read(&cxl_region_rwsem);
+ guard(rwsem_read)(&cxl_region_rwsem);
if (p->res && p->res->start == r->start && p->res->end == r->end)
- rc = 1;
- up_read(&cxl_region_rwsem);
+ return 1;
- return rc;
+ return 0;
}
-/* Establish an empty region covering the given HPA range */
-static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
- struct cxl_endpoint_decoder *cxled)
+static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
+ struct resource *res)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ int nid = phys_to_target_node(res->start);
+ resource_size_t size = resource_size(res);
+ resource_size_t cache_size, start;
+ int rc;
+
+ rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size);
+ if (rc)
+ return rc;
+
+ if (!cache_size)
+ return 0;
+
+ if (size != cache_size) {
+ dev_warn(&cxlr->dev,
+ "Extended Linear Cache size %pa != CXL size %pa. No Support!",
+ &cache_size, &size);
+ return -ENXIO;
+ }
+
+ /*
+ * Move the start of the range to where the cache range starts. The
+ * implementation assumes that the cache range is in front of the
+ * CXL range. This is not dictated by the HMAT spec but is how the
+ * current known implementation is configured.
+ *
+ * The cache range is expected to be within the CFMWS. The adjusted
+ * res->start should not be less than cxlrd->res->start.
+ */
+ start = res->start - cache_size;
+ if (start < cxlrd->res->start)
+ return -ENXIO;
+
+ res->start = start;
+ p->cache_size = cache_size;
+
+ return 0;
+}
+
+static int __construct_region(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- struct cxl_port *port = cxlrd_to_port(cxlrd);
struct range *hpa = &cxled->cxld.hpa_range;
struct cxl_region_params *p;
- struct cxl_region *cxlr;
struct resource *res;
int rc;
- do {
- cxlr = __create_region(cxlrd, cxled->mode,
- atomic_read(&cxlrd->region_id));
- } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
-
- if (IS_ERR(cxlr)) {
- dev_err(cxlmd->dev.parent,
- "%s:%s: %s failed assign region: %ld\n",
- dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
- __func__, PTR_ERR(cxlr));
- return cxlr;
- }
-
- down_write(&cxl_region_rwsem);
+ guard(rwsem_write)(&cxl_region_rwsem);
p = &cxlr->params;
if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
dev_err(cxlmd->dev.parent,
"%s:%s: %s autodiscovery interrupted\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
__func__);
- rc = -EBUSY;
- goto err;
+ return -EBUSY;
}
set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res) {
- rc = -ENOMEM;
- goto err;
- }
+ if (!res)
+ return -ENOMEM;
*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
dev_name(&cxlr->dev));
+
+ rc = cxl_extended_linear_cache_resize(cxlr, res);
+ if (rc && rc != -EOPNOTSUPP) {
+ /*
+ * Failing to support extended linear cache region resize does not
+ * prevent the region from functioning. Only causes cxl list showing
+ * incorrect region size.
+ */
+ dev_warn(cxlmd->dev.parent,
+ "Extended linear cache calculation failed rc:%d\n", rc);
+ }
+
rc = insert_resource(cxlrd->res, res);
if (rc) {
/*
@@ -3289,7 +3331,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
if (rc)
- goto err;
+ return rc;
dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
@@ -3298,14 +3340,40 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
/* ...to match put_device() in cxl_add_to_region() */
get_device(&cxlr->dev);
- up_write(&cxl_region_rwsem);
- return cxlr;
+ return 0;
+}
-err:
- up_write(&cxl_region_rwsem);
- devm_release_action(port->uport_dev, unregister_region, cxlr);
- return ERR_PTR(rc);
+/* Establish an empty region covering the given HPA range */
+static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxlrd_to_port(cxlrd);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ int rc, part = READ_ONCE(cxled->part);
+ struct cxl_region *cxlr;
+
+ do {
+ cxlr = __create_region(cxlrd, cxlds->part[part].mode,
+ atomic_read(&cxlrd->region_id));
+ } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
+
+ if (IS_ERR(cxlr)) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s failed assign region: %ld\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, PTR_ERR(cxlr));
+ return cxlr;
+ }
+
+ rc = __construct_region(cxlr, cxlrd, cxled);
+ if (rc) {
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
+ return ERR_PTR(rc);
+ }
+
+ return cxlr;
}
int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
@@ -3375,6 +3443,34 @@ out:
}
EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+ struct cxl_region_ref *iter;
+ unsigned long index;
+
+ if (!endpoint)
+ return ~0ULL;
+
+ guard(rwsem_write)(&cxl_region_rwsem);
+
+ xa_for_each(&endpoint->regions, index, iter) {
+ struct cxl_region_params *p = &iter->region->params;
+
+ if (p->res->start <= spa && spa <= p->res->end) {
+ if (!p->cache_size)
+ return ~0ULL;
+
+ if (spa >= p->res->start + p->cache_size)
+ return spa - p->cache_size;
+
+ return spa + p->cache_size;
+ }
+ }
+
+ return ~0ULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
+
static int is_system_ram(struct resource *res, void *arg)
{
struct cxl_region *cxlr = arg;
@@ -3440,9 +3536,9 @@ out:
return rc;
switch (cxlr->mode) {
- case CXL_DECODER_PMEM:
+ case CXL_PARTMODE_PMEM:
return devm_cxl_add_pmem_region(cxlr);
- case CXL_DECODER_RAM:
+ case CXL_PARTMODE_RAM:
/*
* The region can not be manged by CXL if any portion of
* it is already online as 'System RAM'
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index cea706b683b5..25ebfbc1616c 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,6 +48,34 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
+TRACE_EVENT(cxl_port_aer_uncorrectable_error,
+ TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(dev, status, fe, hl),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ __field(u32, first_error)
+ __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ __entry->first_error = fe;
+ /*
+ * Embed the 512B headerlog data for user app retrieval and
+ * parsing, but no need to print this in the trace buffer.
+ */
+ memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+ ),
+ TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
+ __get_str(device), __get_str(host),
+ show_uc_errs(__entry->status),
+ show_uc_errs(__entry->first_error)
+ )
+);
+
TRACE_EVENT(cxl_aer_uncorrectable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
TP_ARGS(cxlmd, status, fe, hl),
@@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
+TRACE_EVENT(cxl_port_aer_correctable_error,
+ TP_PROTO(struct device *dev, u32 status),
+ TP_ARGS(dev, status),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ ),
+ TP_printk("device=%s host=%s status='%s'",
+ __get_str(device), __get_str(host),
+ show_ce_errs(__entry->status)
+ )
+);
+
TRACE_EVENT(cxl_aer_correctable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
TP_ARGS(cxlmd, status),
@@ -392,9 +439,10 @@ TRACE_EVENT(cxl_generic_event,
TRACE_EVENT(cxl_general_media,
TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
- struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec),
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_gen_media *rec),
- TP_ARGS(cxlmd, log, cxlr, hpa, rec),
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
TP_STRUCT__entry(
CXL_EVT_TP_entry
@@ -408,6 +456,7 @@ TRACE_EVENT(cxl_general_media,
__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
/* Following are out of order to pack trace record */
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field_struct(uuid_t, region_uuid)
__field(u16, validity_flags)
__field(u8, rank)
@@ -438,6 +487,7 @@ TRACE_EVENT(cxl_general_media,
CXL_EVENT_GEN_MED_COMP_ID_SIZE);
__entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags);
__entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
if (cxlr) {
__assign_str(region_name);
uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
@@ -455,7 +505,7 @@ TRACE_EVENT(cxl_general_media,
"device=%x validity_flags='%s' " \
"comp_id=%s comp_id_pldm_valid_flags='%s' " \
"pldm_entity_id=%s pldm_resource_id=%s " \
- "hpa=%llx region=%s region_uuid=%pUb " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
"cme_threshold_ev_flags='%s' cme_count=%u",
__entry->dpa, show_dpa_flags(__entry->dpa_flags),
show_event_desc_flags(__entry->descriptor),
@@ -470,7 +520,7 @@ TRACE_EVENT(cxl_general_media,
CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT,
CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
- __entry->hpa, __get_str(region_name), &__entry->region_uuid,
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count
)
);
@@ -529,9 +579,10 @@ TRACE_EVENT(cxl_general_media,
TRACE_EVENT(cxl_dram,
TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
- struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec),
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_dram *rec),
- TP_ARGS(cxlmd, log, cxlr, hpa, rec),
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
TP_STRUCT__entry(
CXL_EVT_TP_entry
@@ -547,6 +598,7 @@ TRACE_EVENT(cxl_dram,
__field(u32, row)
__array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field_struct(uuid_t, region_uuid)
__field(u8, rank) /* Out of order to pack trace record */
__field(u8, bank_group) /* Out of order to pack trace record */
@@ -584,6 +636,7 @@ TRACE_EVENT(cxl_dram,
memcpy(__entry->cor_mask, &rec->correction_mask,
CXL_EVENT_DER_CORRECTION_MASK_SIZE);
__entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
if (cxlr) {
__assign_str(region_name);
uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
@@ -604,7 +657,7 @@ TRACE_EVENT(cxl_dram,
"validity_flags='%s' " \
"comp_id=%s comp_id_pldm_valid_flags='%s' " \
"pldm_entity_id=%s pldm_resource_id=%s " \
- "hpa=%llx region=%s region_uuid=%pUb " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
"sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u",
__entry->dpa, show_dpa_flags(__entry->dpa_flags),
show_event_desc_flags(__entry->descriptor),
@@ -622,7 +675,7 @@ TRACE_EVENT(cxl_dram,
CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT,
CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
- __entry->hpa, __get_str(region_name), &__entry->region_uuid,
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
__entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags),
__entry->cvme_count
)
@@ -870,6 +923,7 @@ TRACE_EVENT(cxl_poison,
__string(region, cxlr ? dev_name(&cxlr->dev) : "")
__field(u64, overflow_ts)
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field(u64, dpa)
__field(u32, dpa_length)
__array(char, uuid, 16)
@@ -892,16 +946,22 @@ TRACE_EVENT(cxl_poison,
memcpy(__entry->uuid, &cxlr->params.uuid, 16);
__entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd,
__entry->dpa);
+ if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size)
+ __entry->hpa_alias0 = __entry->hpa +
+ cxlr->params.cache_size;
+ else
+ __entry->hpa_alias0 = ULLONG_MAX;
} else {
__assign_str(region);
memset(__entry->uuid, 0, 16);
__entry->hpa = ULLONG_MAX;
+ __entry->hpa_alias0 = ULLONG_MAX;
}
),
TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \
- "region_uuid=%pU hpa=0x%llx dpa=0x%llx dpa_length=0x%x " \
- "source=%s flags=%s overflow_time=%llu",
+ "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \
+ "dpa_length=0x%x source=%s flags=%s overflow_time=%llu",
__get_str(memdev),
__get_str(host),
__entry->serial,
@@ -909,6 +969,7 @@ TRACE_EVENT(cxl_poison,
__get_str(region),
__entry->uuid,
__entry->hpa,
+ __entry->hpa_alias0,
__entry->dpa,
__entry->dpa_length,
show_poison_source(__entry->source),
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index bbbaa0d0a670..be8a7dc77719 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -373,32 +373,6 @@ struct cxl_decoder {
};
/*
- * CXL_DECODER_DEAD prevents endpoints from being reattached to regions
- * while cxld_unregister() is running
- */
-enum cxl_decoder_mode {
- CXL_DECODER_NONE,
- CXL_DECODER_RAM,
- CXL_DECODER_PMEM,
- CXL_DECODER_MIXED,
- CXL_DECODER_DEAD,
-};
-
-static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
-{
- static const char * const names[] = {
- [CXL_DECODER_NONE] = "none",
- [CXL_DECODER_RAM] = "ram",
- [CXL_DECODER_PMEM] = "pmem",
- [CXL_DECODER_MIXED] = "mixed",
- };
-
- if (mode >= CXL_DECODER_NONE && mode <= CXL_DECODER_MIXED)
- return names[mode];
- return "mixed";
-}
-
-/*
* Track whether this decoder is reserved for region autodiscovery, or
* free for userspace provisioning.
*/
@@ -412,16 +386,16 @@ enum cxl_decoder_state {
* @cxld: base cxl_decoder_object
* @dpa_res: actively claimed DPA span of this decoder
* @skip: offset into @dpa_res where @cxld.hpa_range maps
- * @mode: which memory type / access-mode-partition this decoder targets
* @state: autodiscovery state
+ * @part: partition index this decoder maps
* @pos: interleave position in @cxld.region
*/
struct cxl_endpoint_decoder {
struct cxl_decoder cxld;
struct resource *dpa_res;
resource_size_t skip;
- enum cxl_decoder_mode mode;
enum cxl_decoder_state state;
+ int part;
int pos;
};
@@ -493,6 +467,7 @@ enum cxl_config_state {
* @res: allocated iomem capacity for this region
* @targets: active ordered targets in current decoder configuration
* @nr_targets: number of targets
+ * @cache_size: extended linear cache size if exists, otherwise zero.
*
* State transitions are protected by the cxl_region_rwsem
*/
@@ -504,6 +479,12 @@ struct cxl_region_params {
struct resource *res;
struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE];
int nr_targets;
+ resource_size_t cache_size;
+};
+
+enum cxl_partition_mode {
+ CXL_PARTMODE_RAM,
+ CXL_PARTMODE_PMEM,
};
/*
@@ -525,7 +506,7 @@ struct cxl_region_params {
* struct cxl_region - CXL region
* @dev: This region's device
* @id: This region's id. Id is globally unique across all regions
- * @mode: Endpoint decoder allocation / access mode
+ * @mode: Operational mode of the mapped capacity
* @type: Endpoint decoder target type
* @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
* @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
@@ -538,7 +519,7 @@ struct cxl_region_params {
struct cxl_region {
struct device dev;
int id;
- enum cxl_decoder_mode mode;
+ enum cxl_partition_mode mode;
enum cxl_decoder_type type;
struct cxl_nvdimm_bridge *cxl_nvb;
struct cxl_pmem_region *cxlr_pmem;
@@ -563,6 +544,7 @@ struct cxl_nvdimm {
struct device dev;
struct cxl_memdev *cxlmd;
u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */
+ u64 dirty_shutdowns;
};
struct cxl_pmem_region_mapping {
@@ -610,6 +592,7 @@ struct cxl_dax_region {
* @cdat: Cached CDAT data
* @cdat_available: Should a CDAT attribute be available in sysfs
* @pci_latency: Upstream latency in picoseconds
+ * @gpf_dvsec: Cached GPF port DVSEC
*/
struct cxl_port {
struct device dev;
@@ -633,6 +616,7 @@ struct cxl_port {
} cdat;
bool cdat_available;
long pci_latency;
+ int gpf_dvsec;
};
/**
@@ -875,6 +859,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
int cxl_add_to_region(struct cxl_port *root,
struct cxl_endpoint_decoder *cxled);
struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
#else
static inline bool is_cxl_pmem_region(struct device *dev)
{
@@ -893,6 +878,11 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
{
return NULL;
}
+static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint,
+ u64 spa)
+{
+ return 0;
+}
#endif
void cxl_endpoint_parse_cdat(struct cxl_port *port);
@@ -920,4 +910,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
#define __mock static
#endif
+u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port);
+
#endif /* __CXL_H__ */
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index dd2b7060d501..3ec6b906371b 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -97,6 +97,19 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped);
+#define CXL_NR_PARTITIONS_MAX 2
+
+struct cxl_dpa_info {
+ u64 size;
+ struct cxl_dpa_part_info {
+ struct range range;
+ enum cxl_partition_mode mode;
+ } part[CXL_NR_PARTITIONS_MAX];
+ int nr_partitions;
+};
+
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info);
+
static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
struct cxl_memdev *cxlmd)
{
@@ -373,6 +386,18 @@ struct cxl_dpa_perf {
};
/**
+ * struct cxl_dpa_partition - DPA partition descriptor
+ * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res)
+ * @perf: performance attributes of the partition from CDAT
+ * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic...
+ */
+struct cxl_dpa_partition {
+ struct resource res;
+ struct cxl_dpa_perf perf;
+ enum cxl_partition_mode mode;
+};
+
+/**
* struct cxl_dev_state - The driver device state
*
* cxl_dev_state represents the CXL driver/device state. It provides an
@@ -387,8 +412,8 @@ struct cxl_dpa_perf {
* @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH)
* @media_ready: Indicate whether the device media is usable
* @dpa_res: Overall DPA resource tree for the device
- * @pmem_res: Active Persistent memory capacity configuration
- * @ram_res: Active Volatile memory capacity configuration
+ * @part: DPA partition array
+ * @nr_partitions: Number of DPA partitions
* @serial: PCIe Device Serial Number
* @type: Generic Memory Class device or Vendor Specific Memory device
* @cxl_mbox: CXL mailbox context
@@ -403,8 +428,8 @@ struct cxl_dev_state {
bool rcd;
bool media_ready;
struct resource dpa_res;
- struct resource pmem_res;
- struct resource ram_res;
+ struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX];
+ unsigned int nr_partitions;
u64 serial;
enum cxl_devtype type;
struct cxl_mailbox cxl_mbox;
@@ -413,6 +438,18 @@ struct cxl_dev_state {
#endif
};
+static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
+{
+ /*
+ * Static PMEM may be at partition index 0 when there is no static RAM
+ * capacity.
+ */
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return resource_size(&cxlds->part[i].res);
+ return 0;
+}
+
static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
{
return dev_get_drvdata(cxl_mbox->host);
@@ -435,14 +472,11 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
* @partition_align_bytes: alignment size for partition-able capacity
* @active_volatile_bytes: sum of hard + soft volatile
* @active_persistent_bytes: sum of hard + soft persistent
- * @next_volatile_bytes: volatile capacity change pending device reset
- * @next_persistent_bytes: persistent capacity change pending device reset
- * @ram_perf: performance data entry matched to RAM partition
- * @pmem_perf: performance data entry matched to PMEM partition
* @event: event log driver state
* @poison: poison driver state info
* @security: security driver state info
* @fw: firmware upload / activation state
+ * @mce_notifier: MCE notifier
*
* See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
* details on capacity parameters.
@@ -457,16 +491,12 @@ struct cxl_memdev_state {
u64 partition_align_bytes;
u64 active_volatile_bytes;
u64 active_persistent_bytes;
- u64 next_volatile_bytes;
- u64 next_persistent_bytes;
-
- struct cxl_dpa_perf ram_perf;
- struct cxl_dpa_perf pmem_perf;
struct cxl_event_state event;
struct cxl_poison_state poison;
struct cxl_security_state security;
struct cxl_fw_state fw;
+ struct notifier_block mce_notifier;
};
static inline struct cxl_memdev_state *
@@ -660,6 +690,23 @@ struct cxl_mbox_set_partition_info {
#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0)
+/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */
+struct cxl_mbox_get_health_info_out {
+ u8 health_status;
+ u8 media_status;
+ u8 additional_status;
+ u8 life_used;
+ __le16 device_temperature;
+ __le32 dirty_shutdown_cnt;
+ __le32 corrected_volatile_error_cnt;
+ __le32 corrected_persistent_error_cnt;
+} __packed;
+
+/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */
+struct cxl_mbox_set_shutdown_state_in {
+ u8 state;
+} __packed;
+
/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
struct cxl_mbox_set_timestamp_in {
__le64 timestamp;
@@ -785,7 +832,7 @@ int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
int cxl_dev_state_identify(struct cxl_memdev_state *mds);
int cxl_await_media_ready(struct cxl_dev_state *cxlds);
int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
-int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info);
struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev);
void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
unsigned long *cmds);
@@ -796,6 +843,8 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
enum cxl_event_log_type type,
enum cxl_event_type event_type,
const uuid_t *uuid, union cxl_event *evt);
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count);
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds);
int cxl_set_timestamp(struct cxl_memdev_state *mds);
int cxl_poison_state_init(struct cxl_memdev_state *mds);
int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..54e219b0049e 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -40,6 +40,12 @@
/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
#define CXL_DVSEC_PORT_GPF 4
+#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8)
/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
#define CXL_DVSEC_DEVICE_GPF 5
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 2f03a4d5606e..9675243bd05b 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -152,7 +152,7 @@ static int cxl_mem_probe(struct device *dev)
return -ENXIO;
}
- if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) {
+ if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) {
rc = devm_cxl_add_nvdimm(parent_port, cxlmd);
if (rc) {
if (rc == -ENODEV)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 993fa60fe453..7b14a154463c 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -903,6 +903,7 @@ __ATTRIBUTE_GROUPS(cxl_rcd);
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
+ struct cxl_dpa_info range_info = { 0 };
struct cxl_memdev_state *mds;
struct cxl_dev_state *cxlds;
struct cxl_register_map map;
@@ -993,7 +994,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (rc)
return rc;
- rc = cxl_mem_create_range_info(mds);
+ rc = cxl_mem_dpa_fetch(mds, &range_info);
+ if (rc)
+ return rc;
+
+ rc = cxl_dpa_setup(cxlds, &range_info);
if (rc)
return rc;
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index f9c95996e937..d061fe3d2b86 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -42,15 +42,44 @@ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *
}
static DEVICE_ATTR_RO(id);
+static ssize_t dirty_shutdown_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns);
+}
+static DEVICE_ATTR_RO(dirty_shutdown);
+
static struct attribute *cxl_dimm_attributes[] = {
&dev_attr_id.attr,
&dev_attr_provider.attr,
+ &dev_attr_dirty_shutdown.attr,
NULL
};
+#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX
+static umode_t cxl_dimm_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ if (a == &dev_attr_dirty_shutdown.attr) {
+ struct device *dev = kobj_to_dev(kobj);
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ if (cxl_nvd->dirty_shutdowns ==
+ CXL_INVALID_DIRTY_SHUTDOWN_COUNT)
+ return 0;
+ }
+
+ return a->mode;
+}
+
static const struct attribute_group cxl_dimm_attribute_group = {
.name = "cxl",
.attrs = cxl_dimm_attributes,
+ .is_visible = cxl_dimm_visible
};
static const struct attribute_group *cxl_dimm_attribute_groups[] = {
@@ -58,6 +87,38 @@ static const struct attribute_group *cxl_dimm_attribute_groups[] = {
NULL
};
+static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd)
+{
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ struct device *dev = &cxl_nvd->dev;
+ u32 count;
+
+ /*
+ * Dirty tracking is enabled and exposed to the user, only when:
+ * - dirty shutdown on the device can be set, and,
+ * - the device has a Device GPF DVSEC (albeit unused), and,
+ * - the Get Health Info cmd can retrieve the device's dirty count.
+ */
+ cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT;
+
+ if (cxl_arm_dirty_shutdown(mds)) {
+ dev_warn(dev, "GPF: could not set dirty shutdown state\n");
+ return;
+ }
+
+ if (!cxl_gpf_get_dvsec(cxlds->dev, false))
+ return;
+
+ if (cxl_get_dirty_count(mds, &count)) {
+ dev_warn(dev, "GPF: could not retrieve dirty count\n");
+ return;
+ }
+
+ cxl_nvd->dirty_shutdowns = count;
+}
+
static int cxl_nvdimm_probe(struct device *dev)
{
struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
@@ -78,6 +139,14 @@ static int cxl_nvdimm_probe(struct device *dev)
set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
+
+ /*
+ * Set dirty shutdown now, with the expectation that the device
+ * clear it upon a successful GPF flow. The exception to this
+ * is upon Viral detection, per CXL 3.2 section 12.4.2.
+ */
+ cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd);
+
nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd,
cxl_dimm_attribute_groups, flags,
cmd_mask, 0, NULL, cxl_nvd->dev_id,
@@ -375,6 +444,16 @@ static int cxl_pmem_region_probe(struct device *dev)
goto out_nvd;
}
+ if (cxlds->serial == 0) {
+ /* include missing alongside invalid in this error message. */
+ dev_err(dev, "%s: invalid or missing serial number\n",
+ dev_name(&cxlmd->dev));
+ rc = -ENXIO;
+ goto out_nvd;
+ }
+ info[i].serial = cxlds->serial;
+ info[i].offset = m->start;
+
m->cxl_nvd = cxl_nvd;
mappings[i] = (struct nd_mapping_desc) {
.nvdimm = nvdimm,
@@ -382,8 +461,6 @@ static int cxl_pmem_region_probe(struct device *dev)
.size = m->size,
.position = i,
};
- info[i].offset = m->start;
- info[i].serial = cxlds->serial;
}
ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
ndr_desc.mapping = mappings;
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index b360dca2c69e..bd04980009a4 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1137,10 +1137,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
unsigned long payload, buffer_end, transmit_header_bytes = 0;
u32 control;
int count;
- struct {
- struct fw_iso_packet packet;
- u8 header[256];
- } u;
+ DEFINE_RAW_FLEX(struct fw_iso_packet, u, header, 64);
if (ctx == NULL || a->handle != 0)
return -EINVAL;
@@ -1172,29 +1169,29 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
while (p < end) {
if (get_user(control, &p->control))
return -EFAULT;
- u.packet.payload_length = GET_PAYLOAD_LENGTH(control);
- u.packet.interrupt = GET_INTERRUPT(control);
- u.packet.skip = GET_SKIP(control);
- u.packet.tag = GET_TAG(control);
- u.packet.sy = GET_SY(control);
- u.packet.header_length = GET_HEADER_LENGTH(control);
+ u->payload_length = GET_PAYLOAD_LENGTH(control);
+ u->interrupt = GET_INTERRUPT(control);
+ u->skip = GET_SKIP(control);
+ u->tag = GET_TAG(control);
+ u->sy = GET_SY(control);
+ u->header_length = GET_HEADER_LENGTH(control);
switch (ctx->type) {
case FW_ISO_CONTEXT_TRANSMIT:
- if (u.packet.header_length & 3)
+ if (u->header_length & 3)
return -EINVAL;
- transmit_header_bytes = u.packet.header_length;
+ transmit_header_bytes = u->header_length;
break;
case FW_ISO_CONTEXT_RECEIVE:
- if (u.packet.header_length == 0 ||
- u.packet.header_length % ctx->header_size != 0)
+ if (u->header_length == 0 ||
+ u->header_length % ctx->header_size != 0)
return -EINVAL;
break;
case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
- if (u.packet.payload_length == 0 ||
- u.packet.payload_length & 3)
+ if (u->payload_length == 0 ||
+ u->payload_length & 3)
return -EINVAL;
break;
}
@@ -1204,20 +1201,19 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
if (next > end)
return -EINVAL;
if (copy_from_user
- (u.packet.header, p->header, transmit_header_bytes))
+ (u->header, p->header, transmit_header_bytes))
return -EFAULT;
- if (u.packet.skip && ctx->type == FW_ISO_CONTEXT_TRANSMIT &&
- u.packet.header_length + u.packet.payload_length > 0)
+ if (u->skip && ctx->type == FW_ISO_CONTEXT_TRANSMIT &&
+ u->header_length + u->payload_length > 0)
return -EINVAL;
- if (payload + u.packet.payload_length > buffer_end)
+ if (payload + u->payload_length > buffer_end)
return -EINVAL;
- if (fw_iso_context_queue(ctx, &u.packet,
- &client->buffer, payload))
+ if (fw_iso_context_queue(ctx, u, &client->buffer, payload))
break;
p = next;
- payload += u.packet.payload_length;
+ payload += u->payload_length;
count++;
}
fw_iso_context_queue_flush(ctx);
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index b69e68ef3f02..928409199a1a 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -24,7 +24,7 @@
#include <linux/bcd.h>
#include <acpi/ghes.h>
#include <ras/ras_event.h>
-#include "cper_cxl.h"
+#include <cxl/event.h>
/*
* CPER record ID need to be unique even after reboot, because record
@@ -624,11 +624,11 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
else
goto err_section_too_small;
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
- struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
printk("%ssection_type: CXL Protocol Error\n", newpfx);
if (gdata->error_data_length >= sizeof(*prot_err))
- cper_print_prot_err(newpfx, prot_err);
+ cxl_cper_print_prot_err(newpfx, prot_err);
else
goto err_section_too_small;
} else {
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..8a7667faf953 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,26 +8,7 @@
*/
#include <linux/cper.h>
-#include "cper_cxl.h"
-
-#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
-#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1)
-#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2)
-#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
-#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
-#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
-#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
-
-/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
-struct cxl_ras_capability_regs {
- u32 uncor_status;
- u32 uncor_mask;
- u32 uncor_severity;
- u32 cor_status;
- u32 cor_mask;
- u32 cap_control;
- u32 header_log[16];
-};
+#include <cxl/event.h>
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
@@ -40,22 +21,8 @@ static const char * const prot_err_agent_type_strs[] = {
"CXL Upstream Switch Port",
};
-/*
- * The layout of the enumeration and the values matches CXL Agent Type
- * field in the UEFI 2.10 Section N.2.13,
- */
-enum {
- RCD, /* Restricted CXL Device */
- RCH_DP, /* Restricted CXL Host Downstream Port */
- DEVICE, /* CXL Device */
- LD, /* CXL Logical Device */
- FMLD, /* CXL Fabric Manager managed Logical Device */
- RP, /* CXL Root Port */
- DSP, /* CXL Downstream Switch Port */
- USP, /* CXL Upstream Switch Port */
-};
-
-void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
+void cxl_cper_print_prot_err(const char *pfx,
+ const struct cxl_cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
deleted file mode 100644
index 86bfcf7909ec..000000000000
--- a/drivers/firmware/efi/cper_cxl.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * UEFI Common Platform Error Record (CPER) support for CXL Section.
- *
- * Copyright (C) 2022 Advanced Micro Devices, Inc.
- *
- * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
- */
-
-#ifndef LINUX_CPER_CXL_H
-#define LINUX_CPER_CXL_H
-
-/* CXL Protocol Error Section */
-#define CPER_SEC_CXL_PROT_ERR \
- GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
- 0x4B, 0x77, 0x10, 0x48)
-
-#pragma pack(1)
-
-/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
-struct cper_sec_prot_err {
- u64 valid_bits;
- u8 agent_type;
- u8 reserved[7];
-
- /*
- * Except for RCH Downstream Port, all the remaining CXL Agent
- * types are uniquely identified by the PCIe compatible SBDF number.
- */
- union {
- u64 rcrb_base_addr;
- struct {
- u8 function;
- u8 device;
- u8 bus;
- u16 segment;
- u8 reserved_1[3];
- };
- } agent_addr;
-
- struct {
- u16 vendor_id;
- u16 device_id;
- u16 subsystem_vendor_id;
- u16 subsystem_id;
- u8 class_code[2];
- u16 slot;
- u8 reserved_1[4];
- } device_id;
-
- struct {
- u32 lower_dw;
- u32 upper_dw;
- } dev_serial_num;
-
- u8 capability[60];
- u16 dvsec_len;
- u16 err_len;
- u8 reserved_2[4];
-};
-
-#pragma pack()
-
-void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
-
-#endif //__CPER_CXL_
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 0b1870a09e1f..06f809e70f15 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -267,6 +267,7 @@ config DM_CRYPT
depends on BLK_DEV_DM
depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
depends on (TRUSTED_KEYS || TRUSTED_KEYS=n)
+ select CRC32
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index aab8240429b0..9c8ed65cd87e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
-static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
+static void forget_buffer(struct dm_bufio_client *c, sector_t block)
{
struct dm_buffer *b;
@@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
cache_put_and_wake(c, b);
}
}
-
- return b ? true : false;
}
/*
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9cb797a561d6..a10d75a562db 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -406,6 +406,12 @@ struct cache {
mempool_t migration_pool;
struct bio_set bs;
+
+ /*
+ * Cache_size entries. Set bits indicate blocks mapped beyond the
+ * target length, which are marked for invalidation.
+ */
+ unsigned long *invalid_bitset;
};
struct per_bio_data {
@@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache)
if (cache->discard_bitset)
free_bitset(cache->discard_bitset);
+ if (cache->invalid_bitset)
+ free_bitset(cache->invalid_bitset);
+
if (cache->copier)
dm_kcopyd_client_destroy(cache->copier);
@@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+ cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size));
+ if (!cache->invalid_bitset) {
+ *error = "could not allocate bitset for invalid blocks";
+ goto bad;
+ }
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(cache->copier)) {
*error = "could not create kcopyd client";
@@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
}
+static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ struct cache *cache = context;
+
+ if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) {
+ if (dirty) {
+ DMERR("%s: unable to shrink origin; cache block %u is dirty",
+ cache_device_name(cache), from_cblock(cblock));
+ return -EFBIG;
+ }
+ set_bit(from_cblock(cblock), cache->invalid_bitset);
+ return 0;
+ }
+
+ return load_mapping(context, oblock, cblock, dirty, hint, hint_valid);
+}
+
/*
* The discard block size in the on disk metadata is not
* necessarily the same as we're currently using. So we have to
@@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
return to_cblock(size);
}
+static bool can_resume(struct cache *cache)
+{
+ /*
+ * Disallow retrying the resume operation for devices that failed the
+ * first resume attempt, as the failure leaves the policy object partially
+ * initialized. Retrying could trigger BUG_ON when loading cache mappings
+ * into the incomplete policy object.
+ */
+ if (cache->sized && !cache->loaded_mappings) {
+ if (get_cache_mode(cache) != CM_WRITE)
+ DMERR("%s: unable to resume a failed-loaded cache, please check metadata.",
+ cache_device_name(cache));
+ else
+ DMERR("%s: unable to resume cache due to missing proper cache table reload",
+ cache_device_name(cache));
+ return false;
+ }
+
+ return true;
+}
+
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
@@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
return 0;
}
+static int truncate_oblocks(struct cache *cache)
+{
+ uint32_t nr_blocks = from_cblock(cache->cache_size);
+ uint32_t i;
+ int r;
+
+ for_each_set_bit(i, cache->invalid_bitset, nr_blocks) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(i));
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ return r;
+ }
+ }
+
+ return 0;
+}
+
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
dm_cblock_t csize = get_cache_dev_size(cache);
+ if (!can_resume(cache))
+ return -EINVAL;
+
/*
* Check to see if the cache has resized.
*/
@@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
+ /*
+ * The fast device could have been resized since the last
+ * failed preresume attempt. To be safe we start by a blank
+ * bitset for cache blocks.
+ */
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
r = dm_cache_load_mappings(cache->cmd, cache->policy,
- load_mapping, cache);
+ load_filtered_mapping, cache);
if (r) {
DMERR("%s: could not load cache mappings", cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ if (r != -EFBIG)
+ metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ return r;
+ }
+
+ r = truncate_oblocks(cache);
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
return r;
}
@@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {2, 2, 0},
+ .version = {2, 3, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 02a2919f4e5a..9dfdb63220d7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include <linux/crc32.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/crypto.h>
@@ -125,7 +126,6 @@ struct iv_lmk_private {
#define TCW_WHITENING_SIZE 16
struct iv_tcw_private {
- struct crypto_shash *crc32_tfm;
u8 *iv_seed;
u8 *whitening;
};
@@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
tcw->iv_seed = NULL;
kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
-
- if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
- crypto_free_shash(tcw->crc32_tfm);
- tcw->crc32_tfm = NULL;
}
static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- tcw->crc32_tfm = crypto_alloc_shash("crc32", 0,
- CRYPTO_ALG_ALLOCATES_MEMORY);
- if (IS_ERR(tcw->crc32_tfm)) {
- ti->error = "Error initializing CRC32 in TCW";
- return PTR_ERR(tcw->crc32_tfm);
- }
-
tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
if (!tcw->iv_seed || !tcw->whitening) {
@@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc)
return 0;
}
-static int crypt_iv_tcw_whitening(struct crypt_config *cc,
- struct dm_crypt_request *dmreq,
- u8 *data)
+static void crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq, u8 *data)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 buf[TCW_WHITENING_SIZE];
- SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
- int i, r;
+ int i;
/* xor whitening with sector number */
crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
/* calculate crc32 for every 32bit part and xor it */
- desc->tfm = tcw->crc32_tfm;
- for (i = 0; i < 4; i++) {
- r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]);
- if (r)
- goto out;
- }
+ for (i = 0; i < 4; i++)
+ put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]);
crypto_xor(&buf[0], &buf[12], 4);
crypto_xor(&buf[4], &buf[8], 4);
/* apply whitening (8 bytes) to whole sector */
for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
crypto_xor(data + i * 8, buf, 8);
-out:
memzero_explicit(buf, sizeof(buf));
- return r;
}
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
@@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
- int r = 0;
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
sg = crypt_get_sg_data(cc, dmreq->sg_in);
src = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_local(src);
}
@@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
cc->iv_size - 8);
- return r;
+ return 0;
}
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
@@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
{
struct scatterlist *sg;
u8 *dst;
- int r;
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
return 0;
@@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
/* Apply whitening on ciphertext */
sg = crypt_get_sg_data(cc, dmreq->sg_out);
dst = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_local(dst);
- return r;
+ return 0;
}
static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 08f6387620c1..d4cf0ac2a7aa 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
return delay_bio(dc, c, bio);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int delay_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct delay_c *dc = ti->private;
+ struct delay_class *c = &dc->read;
+
+ return dm_report_zones(c->dev->bdev, c->start,
+ c->start + dm_target_offset(ti, args->next_sector),
+ args, nr_zones);
+}
+#else
+#define delay_report_zones NULL
+#endif
+
#define DMEMIT_DELAY_CLASS(c) \
DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
@@ -424,11 +439,12 @@ out:
static struct target_type delay_target = {
.name = "delay",
.version = {1, 4, 0},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
.map = delay_map,
+ .report_zones = delay_report_zones,
.presuspend = delay_presuspend,
.resume = delay_resume,
.status = delay_status,
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 18ae45dcbfb2..b19b0142a690 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static void ebs_postsuspend(struct dm_target *ti)
+{
+ struct ebs_c *ec = ti->private;
+ dm_bufio_client_reset(ec->bufio);
+}
+
static void ebs_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
@@ -447,6 +453,7 @@ static struct target_type ebs_target = {
.ctr = ebs_ctr,
.dtr = ebs_dtr,
.map = ebs_map,
+ .postsuspend = ebs_postsuspend,
.status = ebs_status,
.io_hints = ebs_io_hints,
.prepare_ioctl = ebs_prepare_ioctl,
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index c8c1a00e7d80..8b219b1199b4 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <linux/async_tx.h>
#include <linux/dm-bufio.h>
@@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
- if (memcmp(mac, actual_mac, mac_size)) {
+ if (crypto_memneq(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool
if (likely(wr))
memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
else {
- if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
+ if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
dm_integrity_io_error(ic, "journal mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
}
@@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned int *metadata_offset, unsigned int total_size, int op)
{
-#define MAY_BE_FILLER 1
-#define MAY_BE_HASH 2
unsigned int hash_offset = 0;
- unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ unsigned char mismatch_hash = 0;
+ unsigned char mismatch_filler = !ic->discard;
do {
unsigned char *data, *dp;
@@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (op == TAG_READ) {
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
- if (memcmp(dp, tag, to_copy)) {
+ if (crypto_memneq(dp, tag, to_copy)) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
}
@@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
/* e.g.: op == TAG_CMP */
if (likely(is_power_of_2(ic->tag_size))) {
- if (unlikely(memcmp(dp, tag, to_copy)))
- if (unlikely(!ic->discard) ||
- unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
- goto thorough_test;
- }
+ if (unlikely(crypto_memneq(dp, tag, to_copy)))
+ goto thorough_test;
} else {
unsigned int i, ts;
thorough_test:
ts = total_size;
for (i = 0; i < to_copy; i++, ts--) {
- if (unlikely(dp[i] != tag[i]))
- may_be &= ~MAY_BE_HASH;
- if (likely(dp[i] != DISCARD_FILLER))
- may_be &= ~MAY_BE_FILLER;
+ /*
+ * Warning: the control flow must not be
+ * dependent on match/mismatch of
+ * individual bytes.
+ */
+ mismatch_hash |= dp[i] ^ tag[i];
+ mismatch_filler |= dp[i] ^ DISCARD_FILLER;
hash_offset++;
if (unlikely(hash_offset == ic->tag_size)) {
- if (unlikely(!may_be)) {
+ if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) {
dm_bufio_release(b);
return ts;
}
hash_offset = 0;
- may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ mismatch_hash = 0;
+ mismatch_filler = !ic->discard;
}
}
}
@@ -1476,8 +1477,6 @@ thorough_test:
} while (unlikely(total_size));
return 0;
-#undef MAY_BE_FILLER
-#undef MAY_BE_HASH
}
struct flush_request {
@@ -2076,7 +2075,7 @@ retry_kmap:
char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
- if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
+ if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
logical_sector);
dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
@@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
bio_put(outgoing_bio);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
atomic64_inc(&ic->number_of_mismatches);
@@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
char *mem = bvec_kmap_local(&bv);
//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload + pos,
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
kunmap_local(mem);
dm_integrity_free_payload(dio);
@@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
- if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
+ if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
}
@@ -5072,16 +5071,19 @@ try_smaller_buffer:
ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->recalc_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->may_write_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
if (!ic->bbs) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3786ac67cefe..a1b7535c508a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -467,7 +467,7 @@ static struct target_type stripe_target = {
.name = "striped",
.version = {1, 7, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
- DM_TARGET_ATOMIC_WRITES,
+ DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 453803f1edf5..35100a435c88 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
DMERR("%s: zero-length target", dm_device_name(t->md));
return -EINVAL;
}
+ if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) {
+ DMERR("%s: too large device", dm_device_name(t->md));
+ return -EINVAL;
+ }
ti->type = dm_get_target_type(type);
if (!ti->type) {
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
index 89cb7942ec5c..baf683cabb1b 100644
--- a/drivers/md/dm-vdo/block-map.c
+++ b/drivers/md/dm-vdo/block-map.c
@@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
* select_lru_page() - Determine which page is least recently used.
*
* Picks the least recently used from among the non-busy entries at the front of each of the lru
- * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely
+ * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely
* that the entries at the front are busy unless the queue is very short, but not impossible.
*
* Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
@@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
{
- return_vio_to_pool(zone->vio_pool, vio);
+ return_vio_to_pool(vio);
check_for_drain_complete(zone);
}
@@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion)
if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
vdo_format_block_map_page(page, nonce, pbn, false);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/* Release our claim to the load and wake any waiters */
release_page_lock(data_vio, "load");
@@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
struct data_vio *data_vio = completion->parent;
- struct block_map_zone *zone = pooled->context;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
abort_load(data_vio, result);
}
@@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor)
struct cursors *cursors = cursor->parent;
struct vdo_completion *completion = cursors->completion;
- return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
+ return_vio_to_pool(vdo_forget(cursor->vio));
if (--cursors->active_roots > 0)
return;
@@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map,
if (result != VDO_SUCCESS)
return result;
- result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
+ result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1,
zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h
index a8c4d6e24b38..2a8b03779f87 100644
--- a/drivers/md/dm-vdo/constants.h
+++ b/drivers/md/dm-vdo/constants.h
@@ -44,9 +44,6 @@ enum {
/* The default size of each slab journal, in blocks */
DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
- /* Unit test minimum */
- MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2,
-
/*
* The initial size of lbn_operations and pbn_operations, which is based upon the expected
* maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index 3f3d29af1be4..5c49d49e023c 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -226,7 +226,7 @@ struct hash_lock {
* A list containing the data VIOs sharing this lock, all having the same record name and
* data block contents, linked by their hash_lock_node fields.
*/
- struct list_head duplicate_ring;
+ struct list_head duplicate_vios;
/* The number of data_vios sharing this lock instance */
data_vio_count_t reference_count;
@@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l
{
memset(lock, 0, sizeof(*lock));
INIT_LIST_HEAD(&lock->pool_node);
- INIT_LIST_HEAD(&lock->duplicate_ring);
+ INIT_LIST_HEAD(&lock->duplicate_vios);
vdo_waitq_init(&lock->waiters);
list_add_tail(&lock->pool_node, &zone->lock_pool);
}
@@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
"must have a hash zone when holding a hash lock");
VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
- "must be on a hash lock ring when holding a hash lock");
+ "must be on a hash lock list when holding a hash lock");
VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
"hash lock reference must be counted");
@@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
if (new_lock != NULL) {
/*
- * Keep all data_vios sharing the lock on a ring since they can complete in any
+ * Keep all data_vios sharing the lock on a list since they can complete in any
* order and we'll always need a pointer to one to compare data.
*/
- list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
+ list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios);
new_lock->reference_count += 1;
if (new_lock->max_references < new_lock->reference_count)
new_lock->max_references = new_lock->reference_count;
@@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate
struct hash_zone *zone;
bool collides;
- if (list_empty(&lock->duplicate_ring))
+ if (list_empty(&lock->duplicate_vios))
return false;
- lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio,
+ lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio,
hash_lock_entry);
zone = candidate->hash_zone;
collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
@@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
return result;
result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
- "must not already be a member of a hash lock ring");
+ "must not already be a member of a hash lock list");
if (result != VDO_SUCCESS)
return result;
@@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio)
"returned hash lock must not be in use with state %s",
get_hash_lock_state_name(lock->state));
VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
- "hash lock returned to zone must not be in a pool ring");
- VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+ "hash lock returned to zone must not be in a pool list");
+ VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios),
"hash lock returned to zone must not reference DataVIOs");
return_hash_lock_to_pool(zone, lock);
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
index 100e92f8f866..b7cc0f41caca 100644
--- a/drivers/md/dm-vdo/encodings.c
+++ b/drivers/md/dm-vdo/encodings.c
@@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block
ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
meta_blocks = (ref_blocks + slab_journal_blocks);
- /* Make sure test code hasn't configured slabs to be too small. */
+ /* Make sure configured slabs are not too small. */
if (meta_blocks >= slab_size)
return VDO_BAD_CONFIGURATION;
- /*
- * If the slab size is very small, assume this must be a unit test and override the number
- * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their
- * data_blocks fields to be the exact capacity of the configured volume, and that used to
- * fall out since they use a power of two for the number of data blocks, the slab size was
- * a power of two, and every block in a slab was a data block.
- *
- * TODO: Try to figure out some way of structuring testParameters and unit tests so this
- * hack isn't needed without having to edit several unit tests every time the metadata size
- * changes by one block.
- */
data_blocks = slab_size - meta_blocks;
- if ((slab_size < 1024) && !is_power_of_2(data_blocks))
- data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
/*
* Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in
@@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config,
if (result != VDO_SUCCESS)
return result;
- result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
- "slab journal size meets minimum size");
- if (result != VDO_SUCCESS)
- return result;
-
result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
"slab journal size is within expected bound");
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
index af8fab83b0f3..61edf2b72427 100644
--- a/drivers/md/dm-vdo/indexer/index-layout.c
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -54,7 +54,6 @@
* Each save also has a unique nonce.
*/
-#define MAGIC_SIZE 32
#define NONCE_INFO_SIZE 32
#define MAX_SAVES 2
@@ -98,9 +97,11 @@ enum region_type {
#define SUPER_VERSION_CURRENT 3
#define SUPER_VERSION_MAXIMUM 7
-static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1)
+
struct region_header {
u64 magic;
u64 region_blocks;
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
index aee0914d604a..aa575a24e0b2 100644
--- a/drivers/md/dm-vdo/indexer/index-session.c
+++ b/drivers/md/dm-vdo/indexer/index-session.c
@@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session)
int uds_launch_request(struct uds_request *request)
{
- size_t internal_size;
int result;
if (request->callback == NULL) {
@@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request)
}
/* Reset all internal fields before processing. */
- internal_size =
- sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
- // FIXME should be using struct_group for this instead
- memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
+ memset(&request->internal, 0, sizeof(request->internal));
result = get_index_session(request->session);
if (result != UDS_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
index 183a94eb7e92..7c1fc4577f5b 100644
--- a/drivers/md/dm-vdo/indexer/indexer.h
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/wait.h>
@@ -73,7 +74,7 @@ enum uds_request_type {
/* Remove any mapping for a name. */
UDS_DELETE,
-};
+} __packed;
enum uds_open_index_type {
/* Create a new index. */
@@ -226,7 +227,7 @@ struct uds_zone_message {
enum uds_zone_message_type type;
/* The virtual chapter number to which the message applies */
u64 virtual_chapter;
-};
+} __packed;
struct uds_index_session;
struct uds_index;
@@ -253,34 +254,32 @@ struct uds_request {
/* The existing data associated with the request name, if any */
struct uds_record_data old_metadata;
- /* Either UDS_SUCCESS or an error code for the request */
- int status;
/* True if the record name had an existing entry in the index */
bool found;
+ /* Either UDS_SUCCESS or an error code for the request */
+ int status;
- /*
- * The remaining fields are used internally and should not be altered by clients. The index
- * relies on zone_number being the first field in this section.
- */
-
- /* The number of the zone which will process this request*/
- unsigned int zone_number;
- /* A link for adding a request to a lock-free queue */
- struct funnel_queue_entry queue_link;
- /* A link for adding a request to a standard linked list */
- struct uds_request *next_request;
- /* A pointer to the index processing this request */
- struct uds_index *index;
- /* Control message for coordinating between zones */
- struct uds_zone_message zone_message;
- /* If true, process request immediately by waking the worker thread */
- bool unbatched;
- /* If true, continue this request before processing newer requests */
- bool requeued;
- /* The virtual chapter containing the record name, if known */
- u64 virtual_chapter;
- /* The region of the index containing the record name */
- enum uds_index_region location;
+ /* The remaining fields are used internally and should not be altered by clients. */
+ struct_group(internal,
+ /* The virtual chapter containing the record name, if known */
+ u64 virtual_chapter;
+ /* The region of the index containing the record name */
+ enum uds_index_region location;
+ /* If true, process request immediately by waking the worker thread */
+ bool unbatched;
+ /* If true, continue this request before processing newer requests */
+ bool requeued;
+ /* Control message for coordinating between zones */
+ struct uds_zone_message zone_message;
+ /* The number of the zone which will process this request*/
+ unsigned int zone_number;
+ /* A link for adding a request to a lock-free queue */
+ struct funnel_queue_entry queue_link;
+ /* A link for adding a request to a standard linked list */
+ struct uds_request *next_request;
+ /* A pointer to the index processing this request */
+ struct uds_index *index;
+ );
};
/* A session is required for most index operations. */
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index 421e5436c32c..11d47770b54d 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
* @error_handler: the handler for submission or I/O errors (may be NULL)
* @operation: the type of I/O to perform
* @data: the buffer to read or write (may be NULL)
+ * @size: the I/O amount in bytes
*
* The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
* other vdo threads.
@@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
*/
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data)
+ blk_opf_t operation, char *data, int size)
{
int result;
struct vdo_completion *completion = &vio->completion;
@@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
vdo_reset_completion(completion);
completion->error_handler = error_handler;
- result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
+ result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META,
+ physical);
if (result != VDO_SUCCESS) {
continue_vio(vio, result);
return;
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h
index 80748699496f..3088f11055fd 100644
--- a/drivers/md/dm-vdo/io-submitter.h
+++ b/drivers/md/dm-vdo/io-submitter.h
@@ -8,6 +8,7 @@
#include <linux/bio.h>
+#include "constants.h"
#include "types.h"
struct io_submitter;
@@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio);
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data);
+ blk_opf_t operation, char *data, int size);
static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation)
{
__submit_metadata_vio(vio, physical, callback, error_handler,
- operation, vio->data);
+ operation, vio->data, vio->block_count * VDO_BLOCK_SIZE);
+}
+
+static inline void vdo_submit_metadata_vio_with_size(struct vio *vio,
+ physical_block_number_t physical,
+ bio_end_io_t callback,
+ vdo_action_fn error_handler,
+ blk_opf_t operation,
+ int size)
+{
+ __submit_metadata_vio(vio, physical, callback, error_handler,
+ operation, vio->data, size);
}
static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
@@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
{
/* FIXME: Can we just use REQ_OP_FLUSH? */
__submit_metadata_vio(vio, 0, callback, error_handler,
- REQ_OP_WRITE | REQ_PREFLUSH, NULL);
+ REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0);
}
#endif /* VDO_IO_SUBMITTER_H */
diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h
index 0f3be44710b5..8c8d6892582d 100644
--- a/drivers/md/dm-vdo/packer.h
+++ b/drivers/md/dm-vdo/packer.h
@@ -46,7 +46,7 @@ struct compressed_block {
/*
* Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
- * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
+ * block. The bins are kept in a list sorted by the amount of unused space so the first bin with
* enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
* is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
* Upon entering the packer, each data_vio already has its compressed data in the first slot of the
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
index 42d3d8d0e4b5..9bae8256ba4e 100644
--- a/drivers/md/dm-vdo/priority-table.c
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e
/*
* Remove the entry from the bucket list, remembering a pointer to another entry in the
- * ring.
+ * list.
*/
next_entry = entry->next;
list_del_init(entry);
diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h
index 899071173015..25e7ec6d19f6 100644
--- a/drivers/md/dm-vdo/recovery-journal.h
+++ b/drivers/md/dm-vdo/recovery-journal.h
@@ -43,9 +43,9 @@
* has a vio which is used to commit that block to disk. The vio's data is the on-disk
* representation of the journal block. In addition each in-memory block has a buffer which is used
* to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
- * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
- * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
- * moved back to the 'free_tail_blocks' ring.
+ * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active
+ * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is
+ * moved back to the 'free_tail_blocks' list.
*
* When entries are added to the journal, they are added to the active in-memory block, as
* indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 8f0a35c63af6..f3d80ff7bef5 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab)
}
/**
- * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct
* order.
* @journal: The journal to be marked dirty.
* @lock: The recovery journal lock held by the slab journal.
@@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion)
{
struct slab_journal *journal = completion->parent;
- return_vio_to_pool(journal->slab->allocator->vio_pool,
- vio_as_pooled_vio(as_vio(vdo_forget(completion))));
+ return_vio_to_pool(vio_as_pooled_vio(as_vio(completion)));
finish_reaping(journal);
reap_slab_journal(journal);
}
@@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion)
sequence_number_t committed = get_committing_sequence_number(pooled);
list_del_init(&pooled->list_entry);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
+ return_vio_to_pool(pooled);
if (result != VDO_SUCCESS) {
vio_record_metadata_io_error(as_vio(completion));
@@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal)
/*
* Since we are about to commit the tail block, this journal no longer needs to be on the
- * ring of journals which the recovery journal might ask to commit.
+ * list of journals which the recovery journal might ask to commit.
*/
mark_slab_journal_clean(journal);
@@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion)
/* Release the slab journal lock. */
adjust_slab_journal_block_reference(&slab->journal,
block->slab_journal_lock_to_release, -1);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/*
* We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
@@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion)
struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
- slab->active_count--;
+ return_vio_to_pool(vio_as_pooled_vio(vio));
+ slab->active_count -= vio->io_size / VDO_BLOCK_SIZE;
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
check_if_slab_drained(slab);
}
@@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab)
static void prioritize_slab(struct vdo_slab *slab)
{
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a slab must not already be on a ring when prioritizing");
+ "a slab must not already be on a list when prioritizing");
slab->priority = calculate_slab_priority(slab);
vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
slab->priority, &slab->allocq_entry);
@@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab)
dirty_block(&slab->reference_blocks[i]);
}
+static inline bool journal_points_equal(struct journal_point first,
+ struct journal_point second)
+{
+ return ((first.sequence_number == second.sequence_number) &&
+ (first.entry_count == second.entry_count));
+}
+
/**
- * clear_provisional_references() - Clear the provisional reference counts from a reference block.
- * @block: The block to clear.
+ * match_bytes() - Check an 8-byte word for bytes matching the value specified
+ * @input: A word to examine the bytes of
+ * @match: The byte value sought
+ *
+ * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise
*/
-static void clear_provisional_references(struct reference_block *block)
+static inline u64 match_bytes(u64 input, u8 match)
{
- vdo_refcount_t *counters = get_reference_counters_for_block(block);
- block_count_t j;
+ u64 temp = input ^ (match * 0x0101010101010101ULL);
+ /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */
+ u64 test_top_bits = ~temp & 0x8080808080808080ULL;
+ /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */
+ u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);
+ /* return 1 when both tests indicate temp byte is 0 */
+ return (test_top_bits & test_low_bits) >> 7;
+}
+
+/**
+ * count_valid_references() - Process a newly loaded refcount array
+ * @counters: the array of counters from a metadata block
+ *
+ * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't
+ * cleaned up at shutdown, changing them internally to "empty".
+ *
+ * Return: the number of blocks that are referenced (counters not "empty")
+ */
+static unsigned int count_valid_references(vdo_refcount_t *counters)
+{
+ u64 *words = (u64 *)counters;
+ /* It's easier to count occurrences of a specific byte than its absences. */
+ unsigned int empty_count = 0;
+ /* For speed, we process 8 bytes at once. */
+ unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);
+
+ /*
+ * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter
+ * array is a multiple of the word size.
+ */
+ BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
+ BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);
+
+ while (words_left > 0) {
+ /*
+ * This is used effectively as 8 byte-size counters. Byte 0 counts how many words
+ * had the target value found in byte 0, etc. We just have to avoid overflow.
+ */
+ u64 split_count = 0;
+ /*
+ * The counter "% 255" trick used below to fold split_count into empty_count
+ * imposes a limit of 254 bytes examined each iteration of the outer loop. We
+ * process a word at a time, so that limit gets rounded down to 31 u64 words.
+ */
+ const unsigned int max_words_per_iteration = 254 / sizeof(u64);
+ unsigned int iter_words_left = min_t(unsigned int, words_left,
+ max_words_per_iteration);
+
+ words_left -= iter_words_left;
+
+ while (iter_words_left--) {
+ u64 word = *words;
+ u64 temp;
+
+ /* First, if we have any provisional refcount values, clear them. */
+ temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
+ if (temp) {
+ /*
+ * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor
+ * will alter just those bytes, changing PROVISIONAL to EMPTY.
+ */
+ word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
+ *words = word;
+ }
- for (j = 0; j < COUNTS_PER_BLOCK; j++) {
- if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
- counters[j] = EMPTY_REFERENCE_COUNT;
- block->allocated_count--;
+ /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */
+ split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
+ words++;
}
+ empty_count += split_count % 255;
}
-}
-static inline bool journal_points_equal(struct journal_point first,
- struct journal_point second)
-{
- return ((first.sequence_number == second.sequence_number) &&
- (first.entry_count == second.entry_count));
+ return COUNTS_PER_BLOCK - empty_count;
}
/**
@@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first,
static void unpack_reference_block(struct packed_reference_block *packed,
struct reference_block *block)
{
- block_count_t index;
sector_count_t i;
struct vdo_slab *slab = block->slab;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
@@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed,
}
}
- block->allocated_count = 0;
- for (index = 0; index < COUNTS_PER_BLOCK; index++) {
- if (counters[index] != EMPTY_REFERENCE_COUNT)
- block->allocated_count++;
- }
+ block->allocated_count = count_valid_references(counters);
}
/**
@@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion)
struct pooled_vio *pooled = vio_as_pooled_vio(vio);
struct reference_block *block = completion->parent;
struct vdo_slab *slab = block->slab;
+ unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE;
+ unsigned int i;
+ char *data = vio->data;
- unpack_reference_block((struct packed_reference_block *) vio->data, block);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
- slab->active_count--;
- clear_provisional_references(block);
+ for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) {
+ struct packed_reference_block *packed = (struct packed_reference_block *) data;
+
+ unpack_reference_block(packed, block);
+ slab->free_blocks -= block->allocated_count;
+ }
+ return_vio_to_pool(pooled);
+ slab->active_count -= block_count;
- slab->free_blocks -= block->allocated_count;
check_if_slab_drained(slab);
}
@@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio)
}
/**
- * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
- * block.
- * @waiter: The waiter of the block to load.
+ * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load
+ * a set of blocks.
+ * @waiter: The waiter of the first block to load.
* @context: The VIO returned by the pool.
*/
-static void load_reference_block(struct vdo_waiter *waiter, void *context)
+static void load_reference_block_group(struct vdo_waiter *waiter, void *context)
{
struct pooled_vio *pooled = context;
struct vio *vio = &pooled->vio;
struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
- size_t block_offset = (block - block->slab->reference_blocks);
+ u32 block_offset = block - block->slab->reference_blocks;
+ u32 max_block_count = block->slab->reference_block_count - block_offset;
+ u32 block_count = min_t(int, vio->block_count, max_block_count);
vio->completion.parent = block;
- vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
- load_reference_block_endio, handle_io_error,
- REQ_OP_READ);
+ vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset,
+ load_reference_block_endio, handle_io_error,
+ REQ_OP_READ, block_count * VDO_BLOCK_SIZE);
}
/**
@@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context)
static void load_reference_blocks(struct vdo_slab *slab)
{
block_count_t i;
+ u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio;
+ struct vio_pool *pool = slab->allocator->refcount_big_vio_pool;
+
+ if (!pool) {
+ pool = slab->allocator->vio_pool;
+ blocks_per_vio = 1;
+ }
slab->free_blocks = slab->block_count;
slab->active_count = slab->reference_block_count;
- for (i = 0; i < slab->reference_block_count; i++) {
+ for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) {
struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
- waiter->callback = load_reference_block;
- acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+ waiter->callback = load_reference_block_group;
+ acquire_vio_from_pool(pool, waiter);
}
}
@@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion)
initialize_journal_state(journal);
}
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}
@@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
vio_record_metadata_io_error(vio);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&journal->slab->state, result);
}
@@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab)
int result;
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a requeued slab must not already be on a ring");
+ "a requeued slab must not already be on a list");
if (vdo_is_read_only(allocator->depot->vdo))
return;
@@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
vdo_log_info("VDO commencing normal operation");
else if (prior_state == VDO_RECOVERING)
vdo_log_info("Exiting recovery mode");
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
}
/*
@@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator,
* This is a min_heap callback function orders slab_status structures using the 'is_clean' field as
* the primary key and the 'emptiness' field as the secondary key.
*
- * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
+ * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping
* should always get the most empty first, so pushing should be from most empty to least empty.
* Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
* before larger ones.
@@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
struct vdo *vdo = depot->vdo;
block_count_t max_free_blocks = depot->slab_config.data_blocks;
unsigned int max_priority = (2 + ilog2(max_free_blocks));
+ u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio;
*allocator = (struct block_allocator) {
.depot = depot,
@@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
return result;
vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
- result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id,
VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
allocator, &allocator->vio_pool);
if (result != VDO_SUCCESS)
return result;
+ /* Initialize the refcount-reading vio pool. */
+ reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks);
+ refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO);
+ refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed);
+ allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio;
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
+ allocator->refcount_blocks_per_big_vio, allocator->thread_id,
+ VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
+ NULL, &allocator->refcount_big_vio_pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
result = initialize_slab_scrubber(allocator);
if (result != VDO_SUCCESS)
return result;
@@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot)
uninitialize_allocator_summary(allocator);
uninitialize_scrubber_vio(&allocator->scrubber);
free_vio_pool(vdo_forget(allocator->vio_pool));
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
}
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
index f234853501ca..fadc0c9d4dc4 100644
--- a/drivers/md/dm-vdo/slab-depot.h
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -45,6 +45,13 @@
enum {
/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
+
+ /*
+ * The number of vios in the vio pool used for loading reference count data. A slab's
+ * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be
+ * plenty.
+ */
+ BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};
/*
@@ -248,7 +255,7 @@ struct vdo_slab {
/* A list of the dirty blocks waiting to be written out */
struct vdo_wait_queue dirty_blocks;
- /* The number of blocks which are currently writing */
+ /* The number of blocks which are currently reading or writing */
size_t active_count;
/* A waiter object for updating the slab summary */
@@ -425,6 +432,10 @@ struct block_allocator {
/* The vio pool for reading and writing block allocator metadata */
struct vio_pool *vio_pool;
+ /* The vio pool for large initial reads of ref count areas */
+ struct vio_pool *refcount_big_vio_pool;
+ /* How many ref count blocks are read per vio at initial load */
+ u32 refcount_blocks_per_big_vio;
/* The dm_kcopyd client for erasing slab journals */
struct dm_kcopyd_client *eraser;
/* Iterator over the slabs to be erased */
diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h
index dbe892b10f26..cdf36e7d7702 100644
--- a/drivers/md/dm-vdo/types.h
+++ b/drivers/md/dm-vdo/types.h
@@ -376,6 +376,9 @@ struct vio {
/* The size of this vio in blocks */
unsigned int block_count;
+ /* The amount of data to be read or written, in bytes */
+ unsigned int io_size;
+
/* The data being read or written. */
char *data;
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
index a7e32baab4af..80b608674022 100644
--- a/drivers/md/dm-vdo/vdo.c
+++ b/drivers/md/dm-vdo/vdo.c
@@ -31,9 +31,7 @@
#include <linux/completion.h>
#include <linux/device-mapper.h>
-#include <linux/kernel.h>
#include <linux/lz4.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
@@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr)
vdo_unregister_allocating_thread();
}
-#ifdef MODULE
-#define MODULE_NAME THIS_MODULE->name
-#else
-#define MODULE_NAME "dm-vdo"
-#endif /* MODULE */
-
static const struct vdo_work_queue_type default_queue_type = {
.start = start_vdo_request_queue,
.finish = finish_vdo_request_queue,
@@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason,
*vdo_ptr = vdo;
snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
- "%s%u", MODULE_NAME, instance);
- BUG_ON(vdo->thread_name_prefix[0] == '\0');
+ "vdo%u", instance);
result = vdo_allocate(vdo->thread_config.thread_count,
struct vdo_thread, __func__, &vdo->threads);
if (result != VDO_SUCCESS) {
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index e710f3c5a972..e7f4153e55e3 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
/*
* Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated
- * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a
- * vio associated with the bio.
+ * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not
+ * have to be a vio associated with the bio.
*/
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn)
{
- int bvec_count, offset, len, i;
+ return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE,
+ callback, bi_opf, pbn);
+}
+
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn)
+{
+ int bvec_count, offset, i;
struct bio *bio = vio->bio;
+ int vio_size = vio->block_count * VDO_BLOCK_SIZE;
+ int remaining;
bio_reset(bio, bio->bi_bdev, bi_opf);
vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
@@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
bio->bi_ioprio = 0;
bio->bi_io_vec = bio->bi_inline_vecs;
bio->bi_max_vecs = vio->block_count + 1;
- len = VDO_BLOCK_SIZE * vio->block_count;
+ if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
+ size, vio_size) != VDO_SUCCESS)
+ size = vio_size;
+ vio->io_size = size;
offset = offset_in_page(data);
- bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+ remaining = size;
- /*
- * If we knew that data was always on one page, or contiguous pages, we wouldn't need the
- * loop. But if we're using vmalloc, it's not impossible that the data is in different
- * pages that can't be merged in bio_add_page...
- */
- for (i = 0; (i < bvec_count) && (len > 0); i++) {
+ for (i = 0; (i < bvec_count) && (remaining > 0); i++) {
struct page *page;
int bytes_added;
int bytes = PAGE_SIZE - offset;
- if (bytes > len)
- bytes = len;
+ if (bytes > remaining)
+ bytes = remaining;
page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
bytes_added = bio_add_page(bio, page, bytes, offset);
@@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
}
data += bytes;
- len -= bytes;
+ remaining -= bytes;
offset = 0;
}
@@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio)
* make_vio_pool() - Create a new vio pool.
* @vdo: The vdo.
* @pool_size: The number of vios in the pool.
+ * @block_count: The number of 4k blocks per vio.
* @thread_id: The ID of the thread using this pool.
* @vio_type: The type of vios in the pool.
* @priority: The priority with which vios from the pool should be enqueued.
@@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio)
*
* Return: A success or error code.
*/
-int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
+int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id,
enum vio_type vio_type, enum vio_priority priority, void *context,
struct vio_pool **pool_ptr)
{
struct vio_pool *pool;
char *ptr;
int result;
+ size_t per_vio_size = VDO_BLOCK_SIZE * block_count;
result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio,
__func__, &pool);
@@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
INIT_LIST_HEAD(&pool->available);
INIT_LIST_HEAD(&pool->busy);
- result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char,
+ result = vdo_allocate(pool_size * per_vio_size, char,
"VIO pool buffer", &pool->buffer);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
ptr = pool->buffer;
- for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) {
+ for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) {
struct pooled_vio *pooled = &pool->vios[pool->size];
- result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr,
+ result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr,
&pooled->vio);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
pooled->context = context;
+ pooled->pool = pool;
list_add_tail(&pooled->pool_entry, &pool->available);
}
@@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter)
}
/**
- * return_vio_to_pool() - Return a vio to the pool
- * @pool: The vio pool.
+ * return_vio_to_pool() - Return a vio to its pool
* @vio: The pooled vio to return.
*/
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
+void return_vio_to_pool(struct pooled_vio *vio)
{
+ struct vio_pool *pool = vio->pool;
+
VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
"vio pool entry returned on same thread as it was acquired");
diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h
index 3490e9f59b04..4bfcb21901f1 100644
--- a/drivers/md/dm-vdo/vio.h
+++ b/drivers/md/dm-vdo/vio.h
@@ -30,6 +30,8 @@ struct pooled_vio {
void *context;
/* The list entry used by the pool */
struct list_head pool_entry;
+ /* The pool this vio is allocated from */
+ struct vio_pool *pool;
};
/**
@@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn);
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn);
void update_vio_error_stats(struct vio *vio, const char *format, ...)
__printf(2, 3);
@@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
struct vio_pool;
-int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
- enum vio_type vio_type, enum vio_priority priority,
- void *context, struct vio_pool **pool_ptr);
+int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count,
+ thread_id_t thread_id, enum vio_type vio_type,
+ enum vio_priority priority, void *context,
+ struct vio_pool **pool_ptr);
void free_vio_pool(struct vio_pool *pool);
bool __must_check is_vio_pool_busy(struct vio_pool *pool);
void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter);
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
+void return_vio_to_pool(struct pooled_vio *vio);
#endif /* VIO_H */
diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c
index 6e1e739277ef..f81ed0cee2bf 100644
--- a/drivers/md/dm-vdo/wait-queue.c
+++ b/drivers/md/dm-vdo/wait-queue.c
@@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w
waitq->last_waiter->next_waiter = waiter;
}
- /* In both cases, the waiter we added to the ring becomes the last waiter. */
+ /* In both cases, the waiter we added to the list becomes the last waiter. */
waitq->last_waiter = waiter;
waitq->length += 1;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index e86c1431b108..3c427f18a04b 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -30,6 +30,7 @@
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
@@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
+static unsigned int dm_verity_use_bh_bytes[4] = {
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE
+ 0 // IOPRIO_CLASS_IDLE
+};
+
+module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644);
+
static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */
@@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
data = dm_bufio_get(v->bufio, hash_block, &buf);
- if (data == NULL) {
+ if (IS_ERR_OR_NULL(data)) {
/*
* In tasklet and the hash was not in the bufio cache.
* Return early and resume execution from a work-queue
@@ -324,8 +334,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
&buf, bio->bi_ioprio);
}
- if (IS_ERR(data))
- return PTR_ERR(data);
+ if (IS_ERR(data)) {
+ if (skip_unverified)
+ return 1;
+ r = PTR_ERR(data);
+ data = dm_bufio_new(v->bufio, hash_block, &buf);
+ if (IS_ERR(data))
+ return r;
+ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block, data) == 0) {
+ aux = dm_bufio_get_aux_data(buf);
+ aux->hash_verified = 1;
+ goto release_ok;
+ } else {
+ dm_bufio_release(buf);
+ dm_bufio_forget(v->bufio, hash_block);
+ return r;
+ }
+ }
aux = dm_bufio_get_aux_data(buf);
@@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
}
+release_ok:
data += offset;
memcpy(want_digest, data, v->digest_size);
r = 0;
@@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w)
verity_finish_io(io, errno_to_blk_status(err));
}
+static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio)
+{
+ return ioprio <= IOPRIO_CLASS_IDLE &&
+ bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]);
+}
+
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
+ unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+ unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits;
if (bio->bi_status &&
(!verity_fec_is_enabled(io->v) ||
@@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio)
return;
}
- if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) {
- INIT_WORK(&io->bh_work, verity_bh_work);
- queue_work(system_bh_wq, &io->bh_work);
+ if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq &&
+ verity_use_bh(bytes, ioprio)) {
+ if (in_hardirq() || irqs_disabled()) {
+ INIT_WORK(&io->bh_work, verity_bh_work);
+ queue_work(system_bh_wq, &io->bh_work);
+ } else {
+ verity_bh_work(&io->bh_work);
+ }
} else {
INIT_WORK(&io->work, verity_work);
queue_work(io->v->verify_wq, &io->work);
@@ -796,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+static void verity_postsuspend(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+ flush_workqueue(v->verify_wq);
+ dm_bufio_client_reset(v->bufio);
+}
+
/*
* Status: V (valid) or C (corruption found)
*/
@@ -1761,11 +1808,12 @@ static struct target_type verity_target = {
.name = "verity",
/* Note: the LSMs depend on the singleton and immutable features */
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
.map = verity_map,
+ .postsuspend = verity_postsuspend,
.status = verity_status,
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d1e42891d24..5ab7574c0c76 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+
+ if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) ==
+ (REQ_IDLE | REQ_SYNC))
+ opf |= REQ_IDLE;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
- bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
- REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
+ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
ci->bio = &flush_bio;
ci->sector_count = 0;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 9e84ab411564..51614651d2e7 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -56,17 +56,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
return true;
}
-bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
- struct nd_namespace_common **_ndns)
-{
- bool claimed;
-
- nvdimm_bus_lock(&attach->dev);
- claimed = __nd_attach_ndns(dev, attach, _ndns);
- nvdimm_bus_unlock(&attach->dev);
- return claimed;
-}
-
static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 082253a3a956..04f4a049599a 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -442,7 +442,8 @@ int nd_label_data_init(struct nvdimm_drvdata *ndd)
if (ndd->data)
return 0;
- if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) {
+ if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0 ||
+ ndd->nsarea.config_size == 0) {
dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n",
ndd->nsarea.max_xfer, ndd->nsarea.config_size);
return -ENXIO;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 86976a9e8a15..bfc6bfeb6e24 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -127,8 +127,6 @@ resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region);
resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
struct nd_mapping *nd_mapping);
resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
-int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
- resource_size_t size);
resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
struct nd_label_id *label_id);
int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
@@ -136,8 +134,6 @@ void get_ndd(struct nvdimm_drvdata *ndd);
resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
-bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
- struct nd_namespace_common **_ndns);
bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
struct nd_namespace_common **_ndns);
ssize_t nd_namespace_store(struct device *dev,
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 37417ce5ec7b..de1ee5ebc851 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1229,45 +1229,4 @@ bool is_nvdimm_sync(struct nd_region *nd_region)
}
EXPORT_SYMBOL_GPL(is_nvdimm_sync);
-struct conflict_context {
- struct nd_region *nd_region;
- resource_size_t start, size;
-};
-
-static int region_conflict(struct device *dev, void *data)
-{
- struct nd_region *nd_region;
- struct conflict_context *ctx = data;
- resource_size_t res_end, region_end, region_start;
-
- if (!is_memory(dev))
- return 0;
-
- nd_region = to_nd_region(dev);
- if (nd_region == ctx->nd_region)
- return 0;
-
- res_end = ctx->start + ctx->size;
- region_start = nd_region->ndr_start;
- region_end = region_start + nd_region->ndr_size;
- if (ctx->start >= region_start && ctx->start < region_end)
- return -EBUSY;
- if (res_end > region_start && res_end <= region_end)
- return -EBUSY;
- return 0;
-}
-
-int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
- resource_size_t size)
-{
- struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
- struct conflict_context ctx = {
- .nd_region = nd_region,
- .start = start,
- .size = size,
- };
-
- return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict);
-}
-
MODULE_IMPORT_NS("DEVMEM");
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 97c4d71115d8..d80f94346199 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index d1adfba8387e..88a42973fa47 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -227,10 +227,10 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page);
*/
static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
- struct page *page = (struct page *)table;
+ struct ptdesc *ptdesc = (struct ptdesc *)table;
- pagetable_dtor(page_ptdesc(page));
- tlb_remove_page(tlb, page);
+ pagetable_dtor(ptdesc);
+ tlb_remove_page(tlb, ptdesc_page(ptdesc));
}
#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
@@ -493,17 +493,11 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
}
-static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
+static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
{
tlb_remove_table(tlb, pt);
}
-/* Like tlb_remove_ptdesc, but for page-like page directories. */
-static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
-{
- tlb_remove_page(tlb, ptdesc_page(pt));
-}
-
static inline void tlb_change_page_size(struct mmu_gather *tlb,
unsigned int page_size)
{
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 04edd44bd26f..f9ae1796da85 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -164,10 +164,99 @@ struct cxl_cper_work_data {
struct cxl_cper_event_rec rec;
};
+#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
+#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1)
+#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2)
+#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
+#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
+#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
+#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
+
+/*
+ * The layout of the enumeration and the values matches CXL Agent Type
+ * field in the UEFI 2.10 Section N.2.13,
+ */
+enum {
+ RCD, /* Restricted CXL Device */
+ RCH_DP, /* Restricted CXL Host Downstream Port */
+ DEVICE, /* CXL Device */
+ LD, /* CXL Logical Device */
+ FMLD, /* CXL Fabric Manager managed Logical Device */
+ RP, /* CXL Root Port */
+ DSP, /* CXL Downstream Switch Port */
+ USP, /* CXL Upstream Switch Port */
+};
+
+#pragma pack(1)
+
+/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
+struct cxl_cper_sec_prot_err {
+ u64 valid_bits;
+ u8 agent_type;
+ u8 reserved[7];
+
+ /*
+ * Except for RCH Downstream Port, all the remaining CXL Agent
+ * types are uniquely identified by the PCIe compatible SBDF number.
+ */
+ union {
+ u64 rcrb_base_addr;
+ struct {
+ u8 function;
+ u8 device;
+ u8 bus;
+ u16 segment;
+ u8 reserved_1[3];
+ };
+ } agent_addr;
+
+ struct {
+ u16 vendor_id;
+ u16 device_id;
+ u16 subsystem_vendor_id;
+ u16 subsystem_id;
+ u8 class_code[2];
+ u16 slot;
+ u8 reserved_1[4];
+ } device_id;
+
+ struct {
+ u32 lower_dw;
+ u32 upper_dw;
+ } dev_serial_num;
+
+ u8 capability[60];
+ u16 dvsec_len;
+ u16 err_len;
+ u8 reserved_2[4];
+};
+
+#pragma pack()
+
+/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
+struct cxl_ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
+struct cxl_cper_prot_err_work_data {
+ struct cxl_cper_sec_prot_err prot_err;
+ struct cxl_ras_capability_regs ras_cap;
+ int severity;
+};
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
#else
static inline int cxl_cper_register_work(struct work_struct *work)
{
@@ -182,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
{
return 0;
}
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return 0;
+}
#endif
#endif /* _LINUX_CXL_EVENT_H */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a70e62d69dc7..3f2e93ed9730 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1094,6 +1094,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu)
#endif /* !CONFIG_ACPI */
+#ifdef CONFIG_ACPI_HMAT
+int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
+ resource_size_t *size);
+#else
+static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
extern void arch_post_acpi_subsys_init(void);
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 265b0f8fc0b3..0ed60a91eca9 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -89,6 +89,10 @@ enum {
#define CPER_NOTIFY_DMAR \
GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
0x72, 0x2D, 0xEB, 0x41)
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR \
+ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
+ 0x4B, 0x77, 0x10, 0x48)
/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
/*
@@ -601,4 +605,8 @@ void cper_estatus_print(const char *pfx,
int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus);
int cper_estatus_check(const struct acpi_hest_generic_status *estatus);
+struct cxl_cper_sec_prot_err;
+void cxl_cper_print_prot_err(const char *pfx,
+ const struct cxl_cper_sec_prot_err *prot_err);
+
#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d66bc0e97632..b7f13f087954 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4238,4 +4238,14 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
+/*
+ * mseal of userspace process's system mappings.
+ */
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
+#else
+#define VM_SEALED_SYSMAP VM_NONE
+#endif
+
#endif /* _LINUX_MM_H */
diff --git a/include/linux/node.h b/include/linux/node.h
index 9a881c2208b3..2b7517892230 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -57,6 +57,11 @@ enum cache_write_policy {
NODE_CACHE_WRITE_OTHER,
};
+enum cache_mode {
+ NODE_CACHE_ADDR_MODE_RESERVED,
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
+};
+
/**
* struct node_cache_attrs - system memory caching attributes
*
@@ -65,6 +70,7 @@ enum cache_write_policy {
* @size: Total size of cache in bytes
* @line_size: Number of bytes fetched on a cache miss
* @level: The cache hierarchy level
+ * @address_mode: The address mode
*/
struct node_cache_attrs {
enum cache_indexing indexing;
@@ -72,6 +78,7 @@ struct node_cache_attrs {
u64 size;
u16 line_size;
u8 level;
+ u16 address_mode;
};
#ifdef CONFIG_HMEM_REPORTING
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5bd9492a66ee..e6a21b62dcce 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -226,11 +226,48 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page
}
return page;
}
+
+static __always_inline bool page_count_writable(const struct page *page, int u)
+{
+ if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
+ return true;
+
+ /*
+ * The refcount check is ordered before the fake-head check to prevent
+ * the following race:
+ * CPU 1 (HVO) CPU 2 (speculative PFN walker)
+ *
+ * page_ref_freeze()
+ * synchronize_rcu()
+ * rcu_read_lock()
+ * page_is_fake_head() is false
+ * vmemmap_remap_pte()
+ * XXX: struct page[] becomes r/o
+ *
+ * page_ref_unfreeze()
+ * page_ref_count() is not zero
+ *
+ * atomic_add_unless(&page->_refcount)
+ * XXX: try to modify r/o struct page[]
+ *
+ * The refcount check also prevents modification attempts to other (r/o)
+ * tail pages that are not fake heads.
+ */
+ if (atomic_read_acquire(&page->_refcount) == u)
+ return false;
+
+ return page_fixed_fake_head(page) == page;
+}
#else
static inline const struct page *page_fixed_fake_head(const struct page *page)
{
return page;
}
+
+static inline bool page_count_writable(const struct page *page, int u)
+{
+ return true;
+}
#endif
static __always_inline int page_is_fake_head(const struct page *page)
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 8c236c651d1d..544150d1d5fd 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -234,7 +234,7 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u)
rcu_read_lock();
/* avoid writing to the vmemmap area being remapped */
- if (!page_is_fake_head(page) && page_ref_count(page) != u)
+ if (page_count_writable(page, u))
ret = atomic_add_unless(&page->_refcount, nr, u);
rcu_read_unlock();
diff --git a/include/linux/sort.h b/include/linux/sort.h
index e163287ac6c1..8e5603b10941 100644
--- a/include/linux/sort.h
+++ b/include/linux/sort.h
@@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func);
+/* Versions that periodically call cond_resched(): */
+
+void sort_r_nonatomic(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv);
+
+void sort_nonatomic(void *base, size_t num, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func);
+
#endif
diff --git a/init/Kconfig b/init/Kconfig
index 681f38ee68db..18717967fc8c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1888,6 +1888,28 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS
config ARCH_HAS_MEMBARRIER_SYNC_CORE
bool
+config ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
+ bool
+ help
+ Control MSEAL_SYSTEM_MAPPINGS access based on architecture.
+
+ A 64-bit kernel is required for the memory sealing feature.
+ No specific hardware features from the CPU are needed.
+
+ To enable this feature, the architecture needs to update their
+ special mappings calls to include the sealing flag and confirm
+ that it doesn't unmap/remap system mappings during the life
+ time of the process. The existence of this flag for an architecture
+ implies that it does not require the remapping of the system
+ mappings during process lifetime, so sealing these mappings is safe
+ from a kernel perspective.
+
+ After the architecture enables this, a distribution can set
+ CONFIG_MSEAL_SYSTEM_MAPPING to manage access to the feature.
+
+ For complete descriptions of memory sealing, please see
+ Documentation/userspace-api/mseal.rst
+
config HAVE_PERF_EVENTS
bool
help
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2746791ce1e2..615b4e6d22c7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1703,7 +1703,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
+ VM_SEALED_SYSMAP,
&xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 21575d39c376..66bcd40a28ca 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4171,8 +4171,8 @@ static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
init_dsq(dsq, dsq_id);
- ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
+ ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
if (ret) {
kfree(dsq);
return ERR_PTR(ret);
@@ -5361,6 +5361,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ scx_idle_enable(ops);
+
if (scx_ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
if (ret) {
@@ -5427,8 +5429,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- scx_idle_enable(ops);
-
/*
* Lock out forks, cgroup on/offlining and moves before opening the
* floodgate so that they don't wander into the operations prematurely.
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 52c36a70a3d0..cb343ca889e0 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -544,7 +544,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* core.
*/
if (flags & SCX_PICK_IDLE_CORE) {
- cpu = prev_cpu;
+ cpu = -EBUSY;
goto out_unlock;
}
}
@@ -584,8 +584,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* increasing distance.
*/
cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
- if (cpu >= 0)
- goto out_unlock;
out_unlock:
rcu_read_unlock();
@@ -723,14 +721,14 @@ static void reset_idle_masks(struct sched_ext_ops *ops)
void scx_idle_enable(struct sched_ext_ops *ops)
{
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
- static_branch_enable(&scx_builtin_idle_enabled);
+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
else
- static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
- static_branch_enable(&scx_builtin_idle_per_node);
+ static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
else
- static_branch_disable(&scx_builtin_idle_per_node);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
#ifdef CONFIG_SMP
reset_idle_masks(ops);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 033fba0633cf..a3f35c7d83b6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -265,8 +265,7 @@ config FUNCTION_GRAPH_RETADDR
config FUNCTION_TRACE_ARGS
bool
- depends on HAVE_FUNCTION_ARG_ACCESS_API
- depends on DEBUG_INFO_BTF
+ depends on PROBE_EVENTS_BTF_ARGS
default y
help
If supported with function argument access API and BTF, then
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 92015de6203d..1a48aedb5255 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6855,6 +6855,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
}
+ cond_resched();
} while_for_each_ftrace_rec();
return fail ? -EINVAL : 0;
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 50344aa9f7f9..968c5c3b0246 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -809,7 +809,8 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
if (p && rv_is_nested_monitor(p)) {
pr_info("Parent monitor %s is already nested, cannot nest further\n",
parent->name);
- return -EINVAL;
+ retval = -EINVAL;
+ goto out_unlock;
}
r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bc957a2507e2..d058b519d502 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9604,6 +9604,7 @@ static void free_trace_buffers(struct trace_array *tr)
return;
free_trace_buffer(&tr->array_buffer);
+ kfree(tr->module_delta);
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8638b7f7ff85..069e92856bda 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
diff --git a/lib/sg_split.c b/lib/sg_split.c
index 60a0babebf2e..0f89aab5c671 100644
--- a/lib/sg_split.c
+++ b/lib/sg_split.c
@@ -88,8 +88,6 @@ static void sg_split_phys(struct sg_splitter *splitters, const int nb_splits)
if (!j) {
out_sg->offset += split->skip_sg0;
out_sg->length -= split->skip_sg0;
- } else {
- out_sg->offset = 0;
}
sg_dma_address(out_sg) = 0;
sg_dma_len(out_sg) = 0;
diff --git a/lib/sort.c b/lib/sort.c
index 8e73dc55476b..52363995ccc5 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -186,36 +186,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
return i / 2;
}
-/**
- * sort_r - sort an array of elements
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp_func: pointer to comparison function
- * @swap_func: pointer to swap function or NULL
- * @priv: third argument passed to comparison function
- *
- * This function does a heapsort on the given array. You may provide
- * a swap_func function if you need to do something more than a memory
- * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
- * avoids a slow retpoline and so is significantly faster.
- *
- * The comparison function must adhere to specific mathematical
- * properties to ensure correct and stable sorting:
- * - Antisymmetry: cmp_func(a, b) must return the opposite sign of
- * cmp_func(b, a).
- * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then
- * cmp_func(a, c) <= 0.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * quicksort is slightly faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
- */
-void sort_r(void *base, size_t num, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
+#include <linux/sched.h>
+
+static void __sort_r(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv,
+ bool may_schedule)
{
/* pre-scale counters for performance */
size_t n = num * size, a = (num/2) * size;
@@ -286,6 +263,9 @@ void sort_r(void *base, size_t num, size_t size,
b = parent(b, lsbit, size);
do_swap(base + b, base + c, size, swap_func, priv);
}
+
+ if (may_schedule)
+ cond_resched();
}
n -= size;
@@ -293,8 +273,63 @@ void sort_r(void *base, size_t num, size_t size,
if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0)
do_swap(base, base + size, size, swap_func, priv);
}
+
+/**
+ * sort_r - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
+ *
+ * This function does a heapsort on the given array. You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
+ *
+ * The comparison function must adhere to specific mathematical
+ * properties to ensure correct and stable sorting:
+ * - Antisymmetry: cmp_func(a, b) must return the opposite sign of
+ * cmp_func(b, a).
+ * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then
+ * cmp_func(a, c) <= 0.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * quicksort is slightly faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+void sort_r(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ __sort_r(base, num, size, cmp_func, swap_func, priv, false);
+}
EXPORT_SYMBOL(sort_r);
+/**
+ * sort_r_nonatomic - sort an array of elements, with cond_resched
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
+ *
+ * Same as sort_r, but preferred for larger arrays as it does a periodic
+ * cond_resched().
+ */
+void sort_r_nonatomic(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ __sort_r(base, num, size, cmp_func, swap_func, priv, true);
+}
+EXPORT_SYMBOL(sort_r_nonatomic);
+
void sort(void *base, size_t num, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func)
@@ -304,6 +339,19 @@ void sort(void *base, size_t num, size_t size,
.swap = swap_func,
};
- return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false);
}
EXPORT_SYMBOL(sort);
+
+void sort_nonatomic(void *base, size_t num, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap = swap_func,
+ };
+
+ return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true);
+}
+EXPORT_SYMBOL(sort_nonatomic);
diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c
index c715e217ec65..3693c6caf2c4 100644
--- a/lib/vdso/datastore.c
+++ b/lib/vdso/datastore.c
@@ -99,7 +99,8 @@ const struct vm_special_mapping vdso_vvar_mapping = {
struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr)
{
return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE,
- VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP,
+ VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP |
+ VM_PFNMAP | VM_SEALED_SYSMAP,
&vdso_vvar_mapping);
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index fc1eba3da419..f0c1676f0599 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -76,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops)
if (ops->id >= NR_DAMON_OPS)
return -EINVAL;
+
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (__damon_is_registered_ops(ops->id)) {
+ if (__damon_is_registered_ops(ops->id))
err = -EINVAL;
- goto out;
- }
- damon_registered_ops[ops->id] = *ops;
-out:
+ else
+ damon_registered_ops[ops->id] = *ops;
mutex_unlock(&damon_ops_lock);
return err;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6fccfe6d046c..39f92aad7bd1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5179,7 +5179,7 @@ static const struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 59d673400085..3ea317837c2d 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
-static void empty_cache_ctor(void *object) { }
-
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- /* Provide a constructor to prevent cache merging. */
- cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
diff --git a/mm/memblock.c b/mm/memblock.c
index 284154445409..0a53db4d9f7b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2167,6 +2167,9 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = PFN_DOWN(end);
+ if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
if (start_pfn >= end_pfn)
return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 75401866fb76..8305483de38b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1813,21 +1813,15 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page = pfn_to_page(pfn);
folio = page_folio(page);
- /*
- * No reference or lock is held on the folio, so it might
- * be modified concurrently (e.g. split). As such,
- * folio_nr_pages() may read garbage. This is fine as the outer
- * loop will revisit the split folio later.
- */
- if (folio_test_large(folio))
- pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
-
if (!folio_try_get(folio))
continue;
if (unlikely(page_folio(page) != folio))
goto put_folio;
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
+
if (folio_contain_hwpoisoned_page(folio)) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a38a1909b407..84f14fa12d0d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -984,19 +984,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 0865387531ed..7db9da609c84 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1561,11 +1561,12 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
* adjacent to the expanded vma and otherwise
* compatible.
*/
- vma = vrm->vma = vma_merge_extend(&vmi, vma, vrm->delta);
+ vma = vma_merge_extend(&vmi, vma, vrm->delta);
if (!vma) {
vrm_uncharge(vrm);
return -ENOMEM;
}
+ vrm->vma = vma;
vrm_stat_account(vrm, vrm->delta);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b173c2da641..fd6b865cb1ab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1593,7 +1593,7 @@ static __always_inline void page_del_and_expand(struct zone *zone,
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
if (PageBuddy(page))
__ClearPageBuddy(page);
@@ -4604,8 +4604,8 @@ retry:
goto retry;
/* Reclaim/compaction failed to prevent the fallback */
- if (defrag_mode) {
- alloc_flags &= ALLOC_NOFRAGMENT;
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a051a29e95ad..b2fc5266e3d2 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(folio_hstate(folio)))
+ struct hstate *h;
+
+ /*
+ * The huge page may be freed so can not
+ * use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
return page;
} else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
return page;
diff --git a/mm/zswap.c b/mm/zswap.c
index 0dcc54eab58b..204fb59da33c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -883,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
+
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
mutex_lock(&acomp_ctx->mutex);
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- acomp_ctx->req = NULL;
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
mutex_unlock(&acomp_ctx->mutex);
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
+
return 0;
}
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 999f78d380ae..1a05fc153353 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -319,7 +319,8 @@ TRACE_EVENT(foo_bar,
__assign_cpumask(cpum, cpumask_bits(mask));
),
- TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
+ TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl",
+ __entry->foo, __entry->bar,
/*
* Notice here the use of some helper functions. This includes:
@@ -370,7 +371,10 @@ TRACE_EVENT(foo_bar,
__get_str(str), __get_str(lstr),
__get_bitmask(cpus), __get_cpumask(cpum),
- __get_str(vstr))
+ __get_str(vstr),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array(cpus))
);
/*
diff --git a/scripts/sorttable.c b/scripts/sorttable.c
index 7b4b3714b1af..deed676bfe38 100644
--- a/scripts/sorttable.c
+++ b/scripts/sorttable.c
@@ -857,7 +857,7 @@ static void *sort_mcount_loc(void *arg)
for (void *ptr = vals; ptr < vals + size; ptr += long_size) {
uint64_t key;
- key = long_size == 4 ? r((uint32_t *)ptr) : r8((uint64_t *)ptr);
+ key = long_size == 4 ? *(uint32_t *)ptr : *(uint64_t *)ptr;
if (!find_func(key)) {
if (long_size == 4)
*(uint32_t *)ptr = 0;
diff --git a/security/Kconfig b/security/Kconfig
index 536061cf33a9..4816fc74f81e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -51,6 +51,27 @@ config PROC_MEM_NO_FORCE
endchoice
+config MSEAL_SYSTEM_MAPPINGS
+ bool "mseal system mappings"
+ depends on 64BIT
+ depends on ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
+ depends on !CHECKPOINT_RESTORE
+ help
+ Apply mseal on system mappings.
+ The system mappings includes vdso, vvar, vvar_vclock,
+ vectors (arm compat-mode), sigpage (arm compat-mode), uprobes.
+
+ A 64-bit kernel is required for the memory sealing feature.
+ No specific hardware features from the CPU are needed.
+
+ WARNING: This feature breaks programs which rely on relocating
+ or unmapping system mappings. Known broken software at the time
+ of writing includes CHECKPOINT_RESTORE, UML, gVisor, rr. Therefore
+ this config can't be enabled universally.
+
+ For complete descriptions of memory sealing, please see
+ Documentation/userspace-api/mseal.rst
+
config SECURITY
bool "Enable different security models"
depends on SYSFS
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index dc4333d23189..8787048c6762 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -586,36 +586,48 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
}
}
-#define READ_ONCE(x) \
-({ \
- union { typeof(x) __val; char __c[1]; } __u = \
- { .__c = { 0 } }; \
- __read_once_size(&(x), __u.__c, sizeof(x)); \
- __u.__val; \
-})
-
-#define WRITE_ONCE(x, val) \
-({ \
- union { typeof(x) __val; char __c[1]; } __u = \
- { .__val = (val) }; \
- __write_once_size(&(x), __u.__c, sizeof(x)); \
- __u.__val; \
-})
-
-#define READ_ONCE_ARENA(type, x) \
-({ \
- union { type __val; char __c[1]; } __u = \
- { .__c = { 0 } }; \
- __read_once_size((void *)&(x), __u.__c, sizeof(x)); \
- __u.__val; \
+/*
+ * __unqual_typeof(x) - Declare an unqualified scalar type, leaving
+ * non-scalar types unchanged,
+ *
+ * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char'
+ * is not type-compatible with 'signed char', and we define a separate case.
+ *
+ * This is copied verbatim from kernel's include/linux/compiler_types.h, but
+ * with default expression (for pointers) changed from (x) to (typeof(x)0).
+ *
+ * This is because LLVM has a bug where for lvalue (x), it does not get rid of
+ * an extra address_space qualifier, but does in case of rvalue (typeof(x)0).
+ * Hence, for pointers, we need to create an rvalue expression to get the
+ * desired type. See https://github.com/llvm/llvm-project/issues/53400.
+ */
+#define __scalar_type_to_expr_cases(type) \
+ unsigned type : (unsigned type)0, signed type : (signed type)0
+
+#define __unqual_typeof(x) \
+ typeof(_Generic((x), \
+ char: (char)0, \
+ __scalar_type_to_expr_cases(char), \
+ __scalar_type_to_expr_cases(short), \
+ __scalar_type_to_expr_cases(int), \
+ __scalar_type_to_expr_cases(long), \
+ __scalar_type_to_expr_cases(long long), \
+ default: (typeof(x))0))
+
+#define READ_ONCE(x) \
+({ \
+ union { __unqual_typeof(x) __val; char __c[1]; } __u = \
+ { .__c = { 0 } }; \
+ __read_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
})
-#define WRITE_ONCE_ARENA(type, x, val) \
-({ \
- union { type __val; char __c[1]; } __u = \
- { .__val = (val) }; \
- __write_once_size((void *)&(x), __u.__c, sizeof(x)); \
- __u.__val; \
+#define WRITE_ONCE(x, val) \
+({ \
+ union { __unqual_typeof(x) __val; char __c[1]; } __u = \
+ { .__val = (val) }; \
+ __write_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
})
/*
@@ -648,6 +660,23 @@ static inline u32 log2_u64(u64 v)
return log2_u32(v) + 1;
}
+/*
+ * Return a value proportionally scaled to the task's weight.
+ */
+static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
+{
+ return (value * p->scx.weight) / 100;
+}
+
+/*
+ * Return a value inversely proportional to the task's weight.
+ */
+static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64 value)
+{
+ return value * 100 / p->scx.weight;
+}
+
+
#include "compat.bpf.h"
#include "enums.bpf.h"
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index 6e6c45f14fe1..c2c33df9292c 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -88,6 +88,8 @@
#define HAVE_SCX_OPS_ENQ_LAST
#define HAVE_SCX_OPS_ENQ_EXITING
#define HAVE_SCX_OPS_SWITCH_PARTIAL
+#define HAVE_SCX_OPS_ENQ_MIGRATION_DISABLED
+#define HAVE_SCX_OPS_ALLOW_QUEUED_WAKEUP
#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
#define HAVE_SCX_OPS_ALL_FLAGS
#define HAVE_SCX_OPSS_NONE
@@ -104,6 +106,7 @@
#define HAVE_SCX_RQ_BAL_PENDING
#define HAVE_SCX_RQ_BAL_KEEP
#define HAVE_SCX_RQ_BYPASSING
+#define HAVE_SCX_RQ_CLK_VALID
#define HAVE_SCX_RQ_IN_WAKEUP
#define HAVE_SCX_RQ_IN_BALANCE
#define HAVE_SCX_TASK_NONE
diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h
index 0e941a0d6f88..2f8002bcc19a 100644
--- a/tools/sched_ext/include/scx/enums.autogen.bpf.h
+++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h
@@ -13,6 +13,30 @@ const volatile u64 __SCX_SLICE_DFL __weak;
const volatile u64 __SCX_SLICE_INF __weak;
#define SCX_SLICE_INF __SCX_SLICE_INF
+const volatile u64 __SCX_RQ_ONLINE __weak;
+#define SCX_RQ_ONLINE __SCX_RQ_ONLINE
+
+const volatile u64 __SCX_RQ_CAN_STOP_TICK __weak;
+#define SCX_RQ_CAN_STOP_TICK __SCX_RQ_CAN_STOP_TICK
+
+const volatile u64 __SCX_RQ_BAL_PENDING __weak;
+#define SCX_RQ_BAL_PENDING __SCX_RQ_BAL_PENDING
+
+const volatile u64 __SCX_RQ_BAL_KEEP __weak;
+#define SCX_RQ_BAL_KEEP __SCX_RQ_BAL_KEEP
+
+const volatile u64 __SCX_RQ_BYPASSING __weak;
+#define SCX_RQ_BYPASSING __SCX_RQ_BYPASSING
+
+const volatile u64 __SCX_RQ_CLK_VALID __weak;
+#define SCX_RQ_CLK_VALID __SCX_RQ_CLK_VALID
+
+const volatile u64 __SCX_RQ_IN_WAKEUP __weak;
+#define SCX_RQ_IN_WAKEUP __SCX_RQ_IN_WAKEUP
+
+const volatile u64 __SCX_RQ_IN_BALANCE __weak;
+#define SCX_RQ_IN_BALANCE __SCX_RQ_IN_BALANCE
+
const volatile u64 __SCX_DSQ_FLAG_BUILTIN __weak;
#define SCX_DSQ_FLAG_BUILTIN __SCX_DSQ_FLAG_BUILTIN
diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h
index 88137a140e72..fedec938584b 100644
--- a/tools/sched_ext/include/scx/enums.autogen.h
+++ b/tools/sched_ext/include/scx/enums.autogen.h
@@ -8,6 +8,14 @@
SCX_ENUM_SET(skel, scx_public_consts, SCX_OPS_NAME_LEN); \
SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_DFL); \
SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_INF); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_ONLINE); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_CAN_STOP_TICK); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BAL_PENDING); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BAL_KEEP); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_BYPASSING); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_CLK_VALID); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_IN_WAKEUP); \
+ SCX_ENUM_SET(skel, scx_rq_flags, SCX_RQ_IN_BALANCE); \
SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_BUILTIN); \
SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_LOCAL_ON); \
SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_INVALID); \
diff --git a/tools/sched_ext/include/scx/enums.h b/tools/sched_ext/include/scx/enums.h
index 34cbebe974b7..8e7c91575f0b 100644
--- a/tools/sched_ext/include/scx/enums.h
+++ b/tools/sched_ext/include/scx/enums.h
@@ -14,7 +14,8 @@ static inline void __ENUM_set(u64 *val, char *type, char *name)
bool res;
res = __COMPAT_read_enum(type, name, val);
- SCX_BUG_ON(!res, "enum not found(%s)", name);
+ if (!res)
+ *val = 0;
}
#define SCX_ENUM_SET(skel, type, name) do { \
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0a6572ab6f37..387f3df8b988 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -61,8 +61,11 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
cxl_core-y += $(CXL_CORE_SRC)/hdm.o
cxl_core-y += $(CXL_CORE_SRC)/pmu.o
cxl_core-y += $(CXL_CORE_SRC)/cdat.o
+cxl_core-y += $(CXL_CORE_SRC)/ras.o
+cxl_core-y += $(CXL_CORE_SRC)/acpi.o
cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
+cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
cxl_core-y += config_check.o
cxl_core-y += cxl_core_test.o
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index cc8948f49117..1c3336095923 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -155,7 +155,7 @@ static struct {
} cfmws7;
struct {
struct acpi_cedt_cfmws cfmws;
- u32 target[4];
+ u32 target[3];
} cfmws8;
struct {
struct acpi_cedt_cxims cxims;
@@ -331,14 +331,14 @@ static struct {
.length = sizeof(mock_cedt.cfmws8),
},
.interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR,
- .interleave_ways = 2,
- .granularity = 0,
+ .interleave_ways = 8,
+ .granularity = 1,
.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
ACPI_CEDT_CFMWS_RESTRICT_PMEM,
.qtg_id = FAKE_QTG_ID,
- .window_size = SZ_256M * 16UL,
+ .window_size = SZ_512M * 6UL,
},
- .target = { 0, 1, 0, 1, },
+ .target = { 0, 1, 2, },
},
.cxims0 = {
.cxims = {
@@ -1000,25 +1000,21 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port)
find_cxl_root(port);
struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct access_coordinate ep_c[ACCESS_COORDINATE_MAX];
- struct range pmem_range = {
- .start = cxlds->pmem_res.start,
- .end = cxlds->pmem_res.end,
- };
- struct range ram_range = {
- .start = cxlds->ram_res.start,
- .end = cxlds->ram_res.end,
- };
if (!cxl_root)
return;
- if (range_len(&ram_range))
- dpa_perf_setup(port, &ram_range, &mds->ram_perf);
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct resource *res = &cxlds->part[i].res;
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+ struct range range = {
+ .start = res->start,
+ .end = res->end,
+ };
- if (range_len(&pmem_range))
- dpa_perf_setup(port, &pmem_range, &mds->pmem_perf);
+ dpa_perf_setup(port, &range, perf);
+ }
cxl_memdev_update_perf(cxlmd);
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 9495dbcc03a7..f2957a3e36fe 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -78,6 +78,10 @@ static struct cxl_cel_entry mock_cel[] = {
.effect = CXL_CMD_EFFECT_NONE,
},
{
+ .opcode = cpu_to_le16(CXL_MBOX_OP_SET_SHUTDOWN_STATE),
+ .effect = POLICY_CHANGE_IMMEDIATE,
+ },
+ {
.opcode = cpu_to_le16(CXL_MBOX_OP_GET_POISON),
.effect = CXL_CMD_EFFECT_NONE,
},
@@ -178,6 +182,7 @@ struct cxl_mockmem_data {
u64 timestamp;
unsigned long sanitize_timeout;
struct vendor_test_feat test_feat;
+ u8 shutdown_state;
};
static struct mock_event_log *event_find_log(struct device *dev, int log_type)
@@ -1105,6 +1110,21 @@ static int mock_health_info(struct cxl_mbox_cmd *cmd)
return 0;
}
+static int mock_set_shutdown_state(struct cxl_mockmem_data *mdata,
+ struct cxl_mbox_cmd *cmd)
+{
+ struct cxl_mbox_set_shutdown_state_in *ss = cmd->payload_in;
+
+ if (cmd->size_in != sizeof(*ss))
+ return -EINVAL;
+
+ if (cmd->size_out != 0)
+ return -EINVAL;
+
+ mdata->shutdown_state = ss->state;
+ return 0;
+}
+
static struct mock_poison {
struct cxl_dev_state *cxlds;
u64 dpa;
@@ -1583,6 +1603,9 @@ static int cxl_mock_mbox_send(struct cxl_mailbox *cxl_mbox,
case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE:
rc = mock_passphrase_secure_erase(mdata, cmd);
break;
+ case CXL_MBOX_OP_SET_SHUTDOWN_STATE:
+ rc = mock_set_shutdown_state(mdata, cmd);
+ break;
case CXL_MBOX_OP_GET_POISON:
rc = mock_get_poison(cxlds, cmd);
break;
@@ -1670,6 +1693,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
struct cxl_dev_state *cxlds;
struct cxl_mockmem_data *mdata;
struct cxl_mailbox *cxl_mbox;
+ struct cxl_dpa_info range_info = { 0 };
int rc;
mdata = devm_kzalloc(dev, sizeof(*mdata), GFP_KERNEL);
@@ -1709,7 +1733,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf;
INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work);
- cxlds->serial = pdev->id;
+ cxlds->serial = pdev->id + 1;
if (is_rcd(pdev))
cxlds->rcd = true;
@@ -1730,7 +1754,11 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
if (rc)
return rc;
- rc = cxl_mem_create_range_info(mds);
+ rc = cxl_mem_dpa_fetch(mds, &range_info);
+ if (rc)
+ return rc;
+
+ rc = cxl_dpa_setup(cxlds, &range_info);
if (rc)
return rc;
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 2694344274bf..c77c8c8e3d9b 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -62,6 +62,7 @@ TARGETS += mount
TARGETS += mount_setattr
TARGETS += move_mount_set_group
TARGETS += mqueue
+TARGETS += mseal_system_mappings
TARGETS += nci
TARGETS += net
TARGETS += net/af_unix
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index 2c725773cd79..1f92e8caceac 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -41,6 +41,31 @@ check_supported_x86_64()
fi
}
+check_supported_ppc64()
+{
+ local config="/proc/config.gz"
+ [[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
+ [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
+
+ local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+ if [[ "${pg_table_levels}" -lt 5 ]]; then
+ echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
+ exit $ksft_skip
+ fi
+
+ local mmu_support=$(grep -m1 "mmu" /proc/cpuinfo | awk '{print $3}')
+ if [[ "$mmu_support" != "radix" ]]; then
+ echo "$0: System does not use Radix MMU, required for 5-level paging"
+ exit $ksft_skip
+ fi
+
+ local hugepages_total=$(awk '/HugePages_Total/ {print $2}' /proc/meminfo)
+ if [[ "${hugepages_total}" -eq 0 ]]; then
+ echo "$0: HugePages are not enabled, required for some tests"
+ exit $ksft_skip
+ fi
+}
+
check_test_requirements()
{
# The test supports x86_64 and powerpc64. We currently have no useful
@@ -50,6 +75,9 @@ check_test_requirements()
"x86_64")
check_supported_x86_64
;;
+ "ppc64le"|"ppc64")
+ check_supported_ppc64
+ ;;
*)
return 0
;;
diff --git a/tools/testing/selftests/mseal_system_mappings/.gitignore b/tools/testing/selftests/mseal_system_mappings/.gitignore
new file mode 100644
index 000000000000..319c497a595e
--- /dev/null
+++ b/tools/testing/selftests/mseal_system_mappings/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sysmap_is_sealed
diff --git a/tools/testing/selftests/mseal_system_mappings/Makefile b/tools/testing/selftests/mseal_system_mappings/Makefile
new file mode 100644
index 000000000000..2b4504e2f52f
--- /dev/null
+++ b/tools/testing/selftests/mseal_system_mappings/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES)
+
+TEST_GEN_PROGS := sysmap_is_sealed
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mseal_system_mappings/config b/tools/testing/selftests/mseal_system_mappings/config
new file mode 100644
index 000000000000..675cb9f37b86
--- /dev/null
+++ b/tools/testing/selftests/mseal_system_mappings/config
@@ -0,0 +1 @@
+CONFIG_MSEAL_SYSTEM_MAPPINGS=y
diff --git a/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c
new file mode 100644
index 000000000000..0d2af30c3bf5
--- /dev/null
+++ b/tools/testing/selftests/mseal_system_mappings/sysmap_is_sealed.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * test system mappings are sealed when
+ * KCONFIG_MSEAL_SYSTEM_MAPPINGS=y
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+#define VMFLAGS "VmFlags:"
+#define MSEAL_FLAGS "sl"
+#define MAX_LINE_LEN 512
+
+bool has_mapping(char *name, FILE *maps)
+{
+ char line[MAX_LINE_LEN];
+
+ while (fgets(line, sizeof(line), maps)) {
+ if (strstr(line, name))
+ return true;
+ }
+
+ return false;
+}
+
+bool mapping_is_sealed(char *name, FILE *maps)
+{
+ char line[MAX_LINE_LEN];
+
+ while (fgets(line, sizeof(line), maps)) {
+ if (!strncmp(line, VMFLAGS, strlen(VMFLAGS))) {
+ if (strstr(line, MSEAL_FLAGS))
+ return true;
+
+ return false;
+ }
+ }
+
+ return false;
+}
+
+FIXTURE(basic) {
+ FILE *maps;
+};
+
+FIXTURE_SETUP(basic)
+{
+ self->maps = fopen("/proc/self/smaps", "r");
+ if (!self->maps)
+ SKIP(return, "Could not open /proc/self/smap, errno=%d",
+ errno);
+};
+
+FIXTURE_TEARDOWN(basic)
+{
+ if (self->maps)
+ fclose(self->maps);
+};
+
+FIXTURE_VARIANT(basic)
+{
+ char *name;
+ bool sealed;
+};
+
+FIXTURE_VARIANT_ADD(basic, vdso) {
+ .name = "[vdso]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, vvar) {
+ .name = "[vvar]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, vvar_vclock) {
+ .name = "[vvar_vclock]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, sigpage) {
+ .name = "[sigpage]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, vectors) {
+ .name = "[vectors]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, uprobes) {
+ .name = "[uprobes]",
+ .sealed = true,
+};
+
+FIXTURE_VARIANT_ADD(basic, stack) {
+ .name = "[stack]",
+ .sealed = false,
+};
+
+TEST_F(basic, check_sealed)
+{
+ if (!has_mapping(variant->name, self->maps)) {
+ SKIP(return, "could not find the mapping, %s",
+ variant->name);
+ }
+
+ EXPECT_EQ(variant->sealed,
+ mapping_is_sealed(variant->name, self->maps));
+};
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
index d53959e03593..94bee6e0c813 100644
--- a/tools/testing/selftests/x86/test_mremap_vdso.c
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -14,6 +14,7 @@
#include <errno.h>
#include <unistd.h>
#include <string.h>
+#include <stdbool.h>
#include <sys/mman.h>
#include <sys/auxv.h>
@@ -55,13 +56,55 @@ static int try_to_remap(void *vdso_addr, unsigned long size)
}
+#define VDSO_NAME "[vdso]"
+#define VMFLAGS "VmFlags:"
+#define MSEAL_FLAGS "sl"
+#define MAX_LINE_LEN 512
+
+bool vdso_sealed(FILE *maps)
+{
+ char line[MAX_LINE_LEN];
+ bool has_vdso = false;
+
+ while (fgets(line, sizeof(line), maps)) {
+ if (strstr(line, VDSO_NAME))
+ has_vdso = true;
+
+ if (has_vdso && !strncmp(line, VMFLAGS, strlen(VMFLAGS))) {
+ if (strstr(line, MSEAL_FLAGS))
+ return true;
+
+ return false;
+ }
+ }
+
+ return false;
+}
+
int main(int argc, char **argv, char **envp)
{
pid_t child;
+ FILE *maps;
ksft_print_header();
ksft_set_plan(1);
+ maps = fopen("/proc/self/smaps", "r");
+ if (!maps) {
+ ksft_test_result_skip(
+ "Could not open /proc/self/smaps, errno=%d\n",
+ errno);
+
+ return 0;
+ }
+
+ if (vdso_sealed(maps)) {
+ ksft_test_result_skip("vdso is sealed\n");
+ return 0;
+ }
+
+ fclose(maps);
+
child = fork();
if (child == -1)
ksft_exit_fail_msg("failed to fork (%d): %m\n", errno);