diff options
68 files changed, 466 insertions, 292 deletions
diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst index 1f65a3193e776..226c0b008456f 100644 --- a/Documentation/arch/arm64/gcs.rst +++ b/Documentation/arch/arm64/gcs.rst @@ -37,7 +37,7 @@ intended to be exhaustive. shadow stacks rather than GCS. * Support for GCS is reported to userspace via HWCAP_GCS in the aux vector - AT_HWCAP2 entry. + AT_HWCAP entry. * GCS is enabled per thread. While there is support for disabling GCS at runtime this should be done with great care. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fcdd0ed3eca89..940343beb3d4c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -225,7 +225,6 @@ config ARM64 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 358c68565bfd0..2b25d671365f2 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -48,7 +48,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) \ KBUILD_CFLAGS += $(call cc-disable-warning, psabi) KBUILD_AFLAGS += $(compat_vdso) +ifeq ($(call test-ge, $(CONFIG_RUSTC_VERSION), 108500),y) +KBUILD_RUSTFLAGS += --target=aarch64-unknown-none-softfloat +else KBUILD_RUSTFLAGS += --target=aarch64-unknown-none -Ctarget-feature="-neon" +endif KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fdd..309942b06c5bc 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4eb7c6698ae43..d561cf3b8ac7b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3091,6 +3091,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -3180,8 +3181,6 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM8), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM4), HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), @@ -3192,6 +3191,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f80..cb180684d10d5 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -194,12 +194,19 @@ static void amu_fie_setup(const struct cpumask *cpus) int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +244,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 4ec32e86a8da2..47ad6944f9f08 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -41,6 +41,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f84c71f04d9ea..e73326bd3ff7e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -162,6 +162,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . = KIMAGE_VADDR; diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 19c67ed1a21fe..18543b603c77b 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -162,6 +162,13 @@ static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, unsigned long next; unsigned long addr = start; + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_p4dp = trans_alloc(info); + if (!dst_p4dp) + return -ENOMEM; + pgd_populate(NULL, dst_pgdp, dst_p4dp); + } + dst_p4dp = p4d_offset(dst_pgdp, start); src_p4dp = p4d_offset(src_pgdp, start); do { diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c70cd35e72c5..d078de2c952b3 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -111,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. */ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -814,6 +859,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -2203,10 +2251,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - #define VOID_PTE (mfn_pte(0, __pgprot(0))) static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, unsigned long *in_frames, @@ -2323,18 +2367,25 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle) { - unsigned long *in_frames = discontig_frames, out_frame; + unsigned long *in_frames, out_frame; unsigned long flags; int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; + + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); @@ -2358,12 +2409,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2371,6 +2422,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. */ in_frame = virt_to_mfn((void *)vstart); diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d680..b02530d986297 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. + */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, §); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 3561553eff8b5..70f8290b659de 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -163,7 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, { void *platform_timer; struct acpi_table_gtdt *gtdt; - int cnt = 0; + u32 cnt = 0; gtdt = container_of(table, struct acpi_table_gtdt, header); acpi_gtdt_desc.gtdt = gtdt; @@ -188,13 +188,17 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, cnt++; if (cnt != gtdt->platform_timer_count) { + cnt = min(cnt, gtdt->platform_timer_count); + pr_err(FW_BUG "limiting Platform Timer count to %d\n", cnt); + } + + if (!cnt) { acpi_gtdt_desc.platform_timer = NULL; - pr_err(FW_BUG "invalid timer data.\n"); - return -EINVAL; + return 0; } if (platform_timer_count) - *platform_timer_count = gtdt->platform_timer_count; + *platform_timer_count = cnt; return 0; } diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8296bf985d1d1..7309394b8fc98 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -934,13 +934,15 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_CPU_CRYPTO | - EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) + EFI_MEMORY_MORE_RELIABLE | EFI_MEMORY_HOT_PLUGGABLE | + EFI_MEMORY_RUNTIME)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_HOT_PLUGGABLE ? "HP" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", attr & EFI_MEMORY_CPU_CRYPTO ? "CC" : "", attr & EFI_MEMORY_SP ? "SP" : "", diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index e5872e38d9a46..5a732018be36d 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -25,6 +25,9 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (md->attribute & EFI_MEMORY_HOT_PLUGGABLE) + return 0; + if (efi_soft_reserve_enabled() && (md->attribute & EFI_MEMORY_SP)) return 0; diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c index 99b45d1cd6246..d4264bfb6dc17 100644 --- a/drivers/firmware/efi/libstub/relocate.c +++ b/drivers/firmware/efi/libstub/relocate.c @@ -53,6 +53,9 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (desc->attribute & EFI_MEMORY_HOT_PLUGGABLE) + continue; + if (efi_soft_reserve_enabled() && (desc->attribute & EFI_MEMORY_SP)) continue; diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 0bbda60d3cdc7..23caea22f8dcd 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -175,6 +175,7 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 #define CONTROL_IRTCACHEDIS 59 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c5cd92edada06..438848b0682fe 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2653,6 +2653,10 @@ static void iommu_init_flags(struct amd_iommu *iommu) /* Set IOTLB invalidation timeout to 1s */ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + + /* Enable Enhanced Peripheral Page Request Handling */ + if (check_feature(FEATURE_EPHSUP)) + iommu_feature_enable(iommu, CONTROL_EPH_EN); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c666ecab955d2..69e23e017d9e5 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -249,7 +249,7 @@ struct exynos_iommu_domain { struct list_head clients; /* list of sysmmu_drvdata.domain_node */ sysmmu_pte_t *pgtable; /* lv1 page table, 16KB */ short *lv2entcnt; /* free lv2 entry counter for each section */ - spinlock_t lock; /* lock for modyfying list of clients */ + spinlock_t lock; /* lock for modifying list of clients */ spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */ struct iommu_domain domain; /* generic domain data structure */ }; @@ -292,7 +292,7 @@ struct sysmmu_drvdata { struct clk *aclk; /* SYSMMU's aclk clock */ struct clk *pclk; /* SYSMMU's pclk clock */ struct clk *clk_master; /* master's device clock */ - spinlock_t lock; /* lock for modyfying state */ + spinlock_t lock; /* lock for modifying state */ bool active; /* current status */ struct exynos_iommu_domain *domain; /* domain we belong to */ struct list_head domain_node; /* node for domain clients list */ @@ -746,7 +746,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0, dev_name(dev), data); if (ret) { - dev_err(dev, "Unabled to register handler of irq %d\n", irq); + dev_err(dev, "Unable to register handler of irq %d\n", irq); return ret; } diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index c2d792db52c3e..064194399b38b 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -87,7 +87,9 @@ prq_retry: struct page_req_dsc *req; req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { head = (head + sizeof(*req)) & PRQ_RING_MASK; continue; } diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4674e618797c1..8b5926c1452ed 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -478,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) ops->page_response(dev, iopf, &resp); list_del_init(&group->pending_node); + iopf_free_group(group); } mutex_unlock(&fault_param->lock); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 870c3cdbd0f62..60aed01e54f27 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1756,7 +1756,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group, group->id); /* - * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 4b6e913725260..345ea91629e0f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -273,6 +273,7 @@ #define MSDC_PAD_TUNE_CMD2_SEL BIT(21) /* RW */ #define PAD_DS_TUNE_DLY_SEL BIT(0) /* RW */ +#define PAD_DS_TUNE_DLY2_SEL BIT(1) /* RW */ #define PAD_DS_TUNE_DLY1 GENMASK(6, 2) /* RW */ #define PAD_DS_TUNE_DLY2 GENMASK(11, 7) /* RW */ #define PAD_DS_TUNE_DLY3 GENMASK(16, 12) /* RW */ @@ -318,6 +319,7 @@ /* EMMC50_PAD_DS_TUNE mask */ #define PAD_DS_DLY_SEL BIT(16) /* RW */ +#define PAD_DS_DLY2_SEL BIT(15) /* RW */ #define PAD_DS_DLY1 GENMASK(14, 10) /* RW */ #define PAD_DS_DLY3 GENMASK(4, 0) /* RW */ @@ -2504,13 +2506,23 @@ tune_done: static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios) { struct msdc_host *host = mmc_priv(mmc); + host->hs400_mode = true; - if (host->top_base) - writel(host->hs400_ds_delay, - host->top_base + EMMC50_PAD_DS_TUNE); - else - writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + if (host->top_base) { + if (host->hs400_ds_dly3) + sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, + host->top_base + EMMC50_PAD_DS_TUNE); + } else { + if (host->hs400_ds_dly3) + sdr_set_field(host->base + PAD_DS_TUNE, + PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + } /* hs400 mode must set it to 0 */ sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGCRCSTS); /* to improve read performance, set outstanding to 2 */ @@ -2530,14 +2542,11 @@ static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card if (host->top_base) { sdr_set_bits(host->top_base + EMMC50_PAD_DS_TUNE, PAD_DS_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, - PAD_DS_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY2_SEL); } else { sdr_set_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->base + PAD_DS_TUNE, - PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY2_SEL); } host->hs400_tuning = true; diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index b73f673db92bb..f75c31815ab00 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -155,7 +155,6 @@ struct sdhci_am654_data { u32 tuning_loop; #define SDHCI_AM654_QUIRK_FORCE_CDTEST BIT(0) -#define SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA BIT(1) }; struct window { @@ -357,29 +356,6 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host, sdhci_set_clock(host, clock); } -static int sdhci_am654_start_signal_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios) -{ - struct sdhci_host *host = mmc_priv(mmc); - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host); - int ret; - - if ((sdhci_am654->quirks & SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA) && - ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) { - if (!IS_ERR(mmc->supply.vqmmc)) { - ret = mmc_regulator_set_vqmmc(mmc, ios); - if (ret < 0) { - pr_err("%s: Switching to 1.8V signalling voltage failed,\n", - mmc_hostname(mmc)); - return -EIO; - } - } - return 0; - } - - return sdhci_start_signal_voltage_switch(mmc, ios); -} - static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg) { writeb(val, host->ioaddr + reg); @@ -868,11 +844,6 @@ static int sdhci_am654_get_of_property(struct platform_device *pdev, if (device_property_read_bool(dev, "ti,fails-without-test-cd")) sdhci_am654->quirks |= SDHCI_AM654_QUIRK_FORCE_CDTEST; - /* Suppress v1p8 ena for eMMC and SD with vqmmc supply */ - if (!!of_parse_phandle(dev->of_node, "vmmc-supply", 0) == - !!of_parse_phandle(dev->of_node, "vqmmc-supply", 0)) - sdhci_am654->quirks |= SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA; - sdhci_get_of_property(pdev); return 0; @@ -969,7 +940,6 @@ static int sdhci_am654_probe(struct platform_device *pdev) goto err_pltfm_free; } - host->mmc_host_ops.start_signal_voltage_switch = sdhci_am654_start_signal_voltage_switch; host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning; pm_runtime_get_noresume(dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b6536ed599c37..246744d8d268a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -339,13 +339,14 @@ out: return (res->flags & IORESOURCE_MEM_64) ? 1 : 0; } -static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) +static __always_inline void pci_read_bases(struct pci_dev *dev, + unsigned int howmany, int rom) { u32 rombar, stdbars[PCI_STD_NUM_BARS]; unsigned int pos, reg; u16 orig_cmd; - BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS); + BUILD_BUG_ON(statically_true(howmany > PCI_STD_NUM_BARS)); if (dev->non_compliant_bars) return; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b84ff7bade822..82b21e34c545e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5522,7 +5522,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); * AMD Matisse USB 3.0 Host Controller 0x149c * Intel 82579LM Gigabit Ethernet Controller 0x1502 * Intel 82579V Gigabit Ethernet Controller 0x1503 - * + * Mediatek MT7922 802.11ax PCI Express Wireless Network Adapter */ static void quirk_no_flr(struct pci_dev *dev) { @@ -5534,6 +5534,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x149c, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x7901, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_MEDIATEK, 0x0616, quirk_no_flr); /* FLR may cause the SolidRun SNET DPU (rev 0x1) to hang */ static void quirk_no_flr_snet(struct pci_dev *dev) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 280071be30b15..6b7ab1814c12d 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -57,8 +57,6 @@ struct time_in_idle { * @max_level: maximum cooling level. One less than total number of valid * cpufreq frequencies. * @em: Reference on the Energy Model of the device - * @cdev: thermal_cooling_device pointer to keep track of the - * registered cooling device. * @policy: cpufreq policy. * @cooling_ops: cpufreq callbacks to thermal cooling device ops * @idle_time: idle time stats diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index a337edcf8faf7..1f65795cf5d7a 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -74,19 +74,21 @@ static inline phys_addr_t xen_dma_to_phys(struct device *dev, return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); } +static inline bool range_requires_alignment(phys_addr_t p, size_t size) +{ + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); + phys_addr_t bus_addr = pfn_to_bfn(XEN_PFN_DOWN(p)) << XEN_PAGE_SHIFT; + + return IS_ALIGNED(p, algn) && !IS_ALIGNED(bus_addr, algn); +} + static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); - phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); - /* If buffer is physically aligned, ensure DMA alignment. */ - if (IS_ALIGNED(p, algn) && - !IS_ALIGNED((phys_addr_t)next_bfn << XEN_PAGE_SHIFT, algn)) - return 1; - for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; @@ -111,7 +113,7 @@ static struct io_tlb_pool *xen_swiotlb_find_pool(struct device *dev, } #ifdef CONFIG_X86 -int xen_swiotlb_fixup(void *buf, unsigned long nslabs) +int __init xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int rc; unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); @@ -156,7 +158,8 @@ xen_swiotlb_alloc_coherent(struct device *dev, size_t size, *dma_handle = xen_phys_to_dma(dev, phys); if (*dma_handle + size - 1 > dma_mask || - range_straddles_page_boundary(phys, size)) { + range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size)) { if (xen_create_contiguous_region(phys, order, fls64(dma_mask), dma_handle) != 0) goto out_free_pages; @@ -182,7 +185,8 @@ xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, size = ALIGN(size, XEN_PAGE_SIZE); if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || - WARN_ON_ONCE(range_straddles_page_boundary(phys, size))) + WARN_ON_ONCE(range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size))) return; if (TestClearPageXenRemapped(virt_to_page(vaddr))) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ac1f890a0d543..4bdd6a43e5215 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1508,7 +1508,6 @@ struct cifs_io_parms { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server; pid_t pid; }; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 79de2f2f9c41a..8582cf61242c6 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t size; int rc = 0; @@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) rdata->xid = get_xid(); rdata->have_xid = true; } + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server = rdata->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", @@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cdb..fa2a76cc2f73d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req, void (*complete)(struct io_comp_batch *)) { /* - * blk_mq_end_request_batch() can't end request allocated from - * sched tags + * Check various conditions that exclude batch processing: + * 1) No batch container + * 2) Has scheduler data attached + * 3) Not a passthrough request and end_io set + * 4) Not a passthrough request and an ioerror */ - if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || - (req->end_io && !blk_rq_is_passthrough(req))) + if (!iob) return false; + if (req->rq_flags & RQF_SCHED_TAGS) + return false; + if (!blk_rq_is_passthrough(req)) { + if (req->end_io) + return false; + if (ioerror < 0) + return false; + } if (!iob->complete) iob->complete = complete; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7cc..17960a1e858db 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,9 +71,6 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, - - /* Control group has to be killed. */ - CGRP_KILL, }; /* cgroup_root->flags */ @@ -461,6 +458,9 @@ struct cgroup { int nr_threaded_children; /* # of live threaded child cgroups */ + /* sequence number for cgroup.kill, serialized by css_set_lock. */ + unsigned int kill_seq; + struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 053c57e618698..7d63d1d75f22f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,21 +114,22 @@ typedef struct { #define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ -#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -#define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ -#define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ -#define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ -#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ -#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ -#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ -#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -#define EFI_MEMORY_NV ((u64)0x0000000000008000ULL) /* non-volatile */ -#define EFI_MEMORY_MORE_RELIABLE \ - ((u64)0x0000000000010000ULL) /* higher reliability */ -#define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ -#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ -#define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ -#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ +#define EFI_MEMORY_UC BIT_ULL(0) /* uncached */ +#define EFI_MEMORY_WC BIT_ULL(1) /* write-coalescing */ +#define EFI_MEMORY_WT BIT_ULL(2) /* write-through */ +#define EFI_MEMORY_WB BIT_ULL(3) /* write-back */ +#define EFI_MEMORY_UCE BIT_ULL(4) /* uncached, exported */ +#define EFI_MEMORY_WP BIT_ULL(12) /* write-protect */ +#define EFI_MEMORY_RP BIT_ULL(13) /* read-protect */ +#define EFI_MEMORY_XP BIT_ULL(14) /* execute-protect */ +#define EFI_MEMORY_NV BIT_ULL(15) /* non-volatile */ +#define EFI_MEMORY_MORE_RELIABLE BIT_ULL(16) /* higher reliability */ +#define EFI_MEMORY_RO BIT_ULL(17) /* read-only */ +#define EFI_MEMORY_SP BIT_ULL(18) /* soft reserved */ +#define EFI_MEMORY_CPU_CRYPTO BIT_ULL(19) /* supports encryption */ +#define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ +#define EFI_MEMORY_RUNTIME BIT_ULL(63) /* range requires runtime mapping */ + #define EFI_MEMORY_DESCRIPTOR_VERSION 1 #define EFI_PAGE_SHIFT 12 diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0f2aeb37bbb04..ca1db4b92c324 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 349718c271ebf..46a2633d33aaa 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -30,7 +30,6 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_TZ, THERMAL_GENL_ATTR_TZ_ID, THERMAL_GENL_ATTR_TZ_TEMP, - THERMAL_GENL_ATTR_TZ_PREV_TEMP, THERMAL_GENL_ATTR_TZ_TRIP, THERMAL_GENL_ATTR_TZ_TRIP_ID, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, @@ -54,6 +53,7 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_THRESHOLD, THERMAL_GENL_ATTR_THRESHOLD_TEMP, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, + THERMAL_GENL_ATTR_TZ_PREV_TEMP, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 04bf493eecae0..8e72de7712ac9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -415,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) } } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); +} + int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); @@ -636,12 +643,13 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) /* if mapped buffer ring OR classic exists, don't allow */ if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; ring_size = flex_array_size(br, bufs, reg.ring_entries); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 1f6a82128b475..e6701b7aa1474 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -54,9 +54,6 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | IO_URING_F_COMPLETE_DEFER); ret = true; @@ -179,12 +176,13 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return -ENOMEM; cache->op_data = NULL; - if (!(req->flags & REQ_F_FORCE_ASYNC)) { - /* defer memcpy until we need it */ - ioucmd->sqe = sqe; - return 0; - } - + /* + * Unconditionally cache the SQE for now - this is only needed for + * requests that go async, but prep handlers must ensure that any + * sqe data is stable beyond prep. Since uring_cmd is special in + * that it doesn't read in per-op data, play it safe and ensure that + * any SQE data is stable beyond prep. This can later get relaxed. + */ memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); ioucmd->sqe = cache->sqes; return 0; @@ -249,16 +247,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - struct io_uring_cmd_data *cache = req->async_data; - - if (ioucmd->sqe != (void *) cache) - memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); - return -EAGAIN; - } else if (ret == -EIOCBQUEUED) { - return -EIOCBQUEUED; - } - + if (ret == -EAGAIN || ret == -EIOCBQUEUED) + return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 853e97a7b0ecb..15a7daf3ff4f3 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,6 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); @@ -131,7 +130,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -153,6 +151,7 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); + io_req_queue_tw_complete(req, -ECANCELED); return true; } @@ -258,6 +257,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); + io_req_task_complete(req, ts); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, @@ -285,10 +285,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; + iwa = io_uring_alloc_async_data(NULL, req); + if (!unlikely(iwa)) + return -ENOMEM; + iwa->req = req; + iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); @@ -299,16 +305,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; - struct io_waitid_async *iwa; int ret; - iwa = io_uring_alloc_async_data(NULL, req); - if (!iwa) - return -ENOMEM; - - iwa->req = req; - ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436b..afc665b7b1fe5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92c..aac91466279f1 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 83d46b9b8ec8c..2b331822c7e77 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 165c90ba64ea9..5404a303199d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9050,7 +9050,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -9079,7 +9079,8 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); - scx_move_task(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9180,7 +9181,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdde..5a81d9a1e31f2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -123,6 +123,19 @@ enum scx_ops_flags { SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* * CPU cgroup support flags */ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ @@ -130,6 +143,7 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -416,7 +430,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -882,6 +896,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) + goto local; + if (!SCX_HAS_OP(enqueue)) goto global; @@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * - The BPF scheduler is bypassed while the rq is offline and we can always say * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { int cpu = cpu_of(rq); + SCHED_WARN_ON(task_cpu(p) == cpu); + + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - if (unlikely(is_migration_disabled(p))) - return false; - if (!scx_rq_online(rq)) return false; @@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. As @p can't @@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; @@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ @@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. * * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -4323,25 +4371,12 @@ err: return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. - */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ @@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); @@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&scx_has_op[i]); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7dd..1079b56b0f7ae 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda26..b93c8c3dc05a5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c2c45313c886..97152f2250fe7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3517,12 +3517,6 @@ repeat: } /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ @@ -3532,6 +3526,12 @@ repeat: worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f3e15e9efa76b..7849405614b15 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -270,8 +270,16 @@ void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0) + +int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0) + struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, @@ -404,6 +412,17 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) return (const struct cpumask *)mask; } +/* + * Return true if task @p cannot migrate to a different CPU, false + * otherwise. + */ +static inline bool is_migration_disabled(const struct task_struct *p) +{ + if (bpf_core_field_exists(p->migration_disabled)) + return p->migration_disabled; + return false; +} + /* rcu */ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; @@ -421,7 +440,7 @@ void bpf_rcu_read_unlock(void) __ksym; */ static inline s64 time_delta(u64 after, u64 before) { - return (s64)(after - before) > 0 ? : 0; + return (s64)(after - before) > 0 ? (s64)(after - before) : 0; } /** diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh index 3f45512fb512e..7406c24be1ac9 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # Test the special cpuset v1 hotplug case where a cpuset become empty of diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c index fa946d9146d4d..d67431f57ac65 100644 --- a/tools/testing/selftests/sched_ext/create_dsq.c +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -14,11 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct create_dsq *skel; - skel = create_dsq__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = create_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(create_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c index e65d22f23f3bc..b6d13496b24e8 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; - skel = ddsp_bogus_dsq_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_bogus_dsq_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_bogus_dsq_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c index abafee587cd60..af9ce4ee8baac 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_vtimelocal_fail *skel; - skel = ddsp_vtimelocal_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_vtimelocal_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_vtimelocal_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index fbda6bf546712..c02b2aa6fc641 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus) + if (p->nr_cpus_allowed == nr_cpus && !is_migration_disabled(p)) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 0ff27e57fe430..e1f2ce4abfe64 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -15,6 +15,7 @@ static enum scx_test_status setup(void **ctx) skel = dsp_local_on__open(); SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c index 73e679953e27a..d3387ae036794 100644 --- a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_last_no_enq_fails *skel; - skel = enq_last_no_enq_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_last_no_enq_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_last_no_enq_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c index dd1350e5f002d..a80e3a3b3698c 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_select_cpu_fails *skel; - skel = enq_select_cpu_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_select_cpu_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c index 31bcd06e21cd3..9451782689de1 100644 --- a/tools/testing/selftests/sched_ext/exit.c +++ b/tools/testing/selftests/sched_ext/exit.c @@ -23,6 +23,7 @@ static enum scx_test_status run(void *ctx) char buf[16]; skel = exit__open(); + SCX_ENUM_INIT(skel); skel->rodata->exit_point = tc; exit__load(skel); link = bpf_map__attach_struct_ops(skel->maps.exit_ops); diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 87bf220b1bcee..1c9ceb661c43e 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -49,8 +49,10 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) SCX_ASSERT(is_cpu_online()); - skel = hotplug__open_and_load(); - SCX_ASSERT(skel); + skel = hotplug__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(hotplug__load(skel), "Failed to load skel"); /* Testing the offline -> online path, so go offline before starting */ if (onlining) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 97d45f1e5597e..eddf9e0e26e7f 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -15,22 +15,6 @@ #define SCHED_EXT 7 -static struct init_enable_count * -open_load_prog(bool global) -{ - struct init_enable_count *skel; - - skel = init_enable_count__open(); - SCX_BUG_ON(!skel, "Failed to open skel"); - - if (!global) - skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; - - SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); - - return skel; -} - static enum scx_test_status run_test(bool global) { struct init_enable_count *skel; @@ -40,7 +24,14 @@ static enum scx_test_status run_test(bool global) struct sched_param param = {}; pid_t pids[num_pre_forks]; - skel = open_load_prog(global); + skel = init_enable_count__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_FAIL_IF(init_enable_count__load(skel), "Failed to load skel"); /* * Fork a bunch of children before we attach the scheduler so that we @@ -159,7 +150,7 @@ static enum scx_test_status run(void *ctx) struct scx_test init_enable_count = { .name = "init_enable_count", - .description = "Verify we do the correct amount of counting of init, " + .description = "Verify we correctly count the occurrences of init, " "enable, etc callbacks.", .run = run, }; diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c index f38fc973c3800..c6be50a9941d5 100644 --- a/tools/testing/selftests/sched_ext/maximal.c +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct maximal *skel; - skel = maximal__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c index 31cfafb0cf65a..aacf0c58ca4fa 100644 --- a/tools/testing/selftests/sched_ext/maybe_null.c +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -43,7 +43,7 @@ static enum scx_test_status run(void *ctx) struct scx_test maybe_null = { .name = "maybe_null", - .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .description = "Verify if PTR_MAYBE_NULL works for .dispatch", .run = run, }; REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c index 6c5db8ebbf8ac..89f7261757ffb 100644 --- a/tools/testing/selftests/sched_ext/minimal.c +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct minimal *skel; - skel = minimal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = minimal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(minimal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c index 3cd57ef8daaa5..05974820ca69d 100644 --- a/tools/testing/selftests/sched_ext/prog_run.c +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct prog_run *skel; - skel = prog_run__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = prog_run__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(prog_run__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c index 5cfba2d6e0568..308211d804364 100644 --- a/tools/testing/selftests/sched_ext/reload_loop.c +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -18,11 +18,10 @@ bool force_exit = false; static enum scx_test_status setup(void **ctx) { - skel = maximal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); return SCX_TEST_PASS; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index a53a40c2d2f0f..5b6e045e1109b 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl *skel; - skel = select_cpu_dfl__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c index 1d85bf4bf3a39..9b5d232efb7f6 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl_nodispatch *skel; - skel = select_cpu_dfl_nodispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl_nodispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl_nodispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c index 0309ca8785b36..80283dbc41b7e 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch *skel; - skel = select_cpu_dispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c index 47eb6ed7627d9..5e72ebbc90a5a 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; - skel = select_cpu_dispatch_bad_dsq__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_bad_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_bad_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c index 48ff028a3c46d..aa85949478bcf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; - skel = select_cpu_dispatch_dbl_dsp__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_dbl_dsp__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_dbl_dsp__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c index b4629c2364f5d..1e9b5c9bfff1d 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_vtime *skel; - skel = select_cpu_vtime__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_vtime__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_vtime__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; |