diff options
-rw-r--r-- | include/linux/gfp.h | 2 | ||||
-rw-r--r-- | include/linux/kasan.h | 13 | ||||
-rw-r--r-- | include/linux/local_lock.h | 2 | ||||
-rw-r--r-- | include/linux/local_lock_internal.h | 7 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 12 | ||||
-rw-r--r-- | include/linux/rtmutex.h | 10 | ||||
-rw-r--r-- | include/linux/slab.h | 4 | ||||
-rw-r--r-- | kernel/bpf/stream.c | 2 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 2 | ||||
-rw-r--r-- | kernel/locking/rtmutex_common.h | 9 | ||||
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/kasan/common.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 55 | ||||
-rw-r--r-- | mm/slab.h | 7 | ||||
-rw-r--r-- | mm/slab_common.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 526 |
17 files changed, 571 insertions, 93 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5ebf26fcdcfa..0ceb4e09306c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..acdc8cb0152e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, - bool still_accessible); + bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. @@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, * @Return true if KASAN took ownership of the object; false otherwise. 
*/ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init, - bool still_accessible) + void *object, bool init, + bool still_accessible, + bool no_quarantine) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init, still_accessible); + return __kasan_slab_free(s, object, init, still_accessible, + no_quarantine); return false; } @@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - bool init, bool still_accessible) + bool init, bool still_accessible, + bool no_quarantine) { return false; } diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 949de37700db..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -165,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -285,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..82563236f35c 100644 --- a/include/linux/memcontrol.h +++ 
b/include/linux/memcontrol.h @@ -341,17 +341,25 @@ enum page_memcg_data_flags { __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; +#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ +#define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { - /* slabobj_ext vector failed to allocate */ - OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* + * Use bit 0 with zero other bits to signal that slabobj_ext vector + * failed to allocate. The same bit 0 with valid upper bits means + * MEMCG_DATA_OBJEXTS. + */ + OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} +#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/include/linux/slab.h b/include/linux/slab.h index 680193356ac7..561597dd2164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -501,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size, #define krealloc(...) 
alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -957,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ab592db4a4bf..eb6c5a21c2ef 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fbfa8532c39..dbf86f8014de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -581,7 +581,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = 
(unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..202e044f2b4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -194,6 +194,7 @@ menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..9904421cabc1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -842,6 +842,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); +#define alloc_frozen_pages_nolock(...) \ + alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..3264900b942f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; @@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. 
The object's metadata is kept until the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..5a40e2b7d148 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7478,22 +7478,7 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ -/** - * alloc_pages_nolock - opportunistic reentrant allocation from any context - * @nid: node to allocate from - * @order: allocation order size - * - * Allocates pages of a given order from the given node. This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). - * Allocation is best effort and to be expected to fail easily so nobody should - * rely on the success. Failures are not reported via warn_alloc(). - * See always fail conditions below. - * - * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. - * It means ENOMEM. There is no reason to call it again and expect !NULL. - */ -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7515,12 +7500,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. */ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC - | __GFP_ACCOUNT; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; + VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. 
If spin_trylock() is called from hard IRQ the current @@ -7555,15 +7541,38 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ - if (page) - set_page_refcounted(page); - - if (memcg_kmem_online() && page && + if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { - free_pages_nolock(page, order); + __free_frozen_pages(page, order, FPI_TRYLOCK); page = NULL; } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; } +/** + * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. 
+ */ +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) +{ + struct page *page; + + page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); + if (page) + set_page_refcounted(page); + return page; +} +EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); diff --git a/mm/slab.h b/mm/slab.h index 2b78a717461c..d63cc9b5e313 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,6 +57,10 @@ struct slab { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -234,6 +238,7 @@ struct kmem_cache_order_objects { struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; + struct lock_class_key lock_key; #endif struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ @@ -665,6 +670,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index b6601e0fe598..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index c2c6b350766e..a585d0ac45d4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include <kunit/test.h> #include <kunit/test-bug.h> #include <linux/sort.h> - +#include <linux/irq_work.h> +#include <linux/kprobes.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -426,7 +427,7 @@ struct kmem_cache_cpu { #ifdef 
CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -2056,7 +2057,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) @@ -2089,6 +2090,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2097,8 +2099,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. 
+ */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ mark_failed_objexts_alloc(slab); @@ -2107,6 +2123,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2127,7 +2145,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2151,7 +2172,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. */ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2485,7 +2509,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. 
*/ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2990,13 +3014,17 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -3146,6 +3174,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -3165,7 +3194,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -3173,7 +3206,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. 
* Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -3350,33 +3383,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. 
*/ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -3417,7 +3464,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3602,12 +3652,29 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; struct kmem_cache_cpu *c; + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -3698,6 +3765,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. 
+ * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * will block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. + * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3782,7 +3890,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3807,7 +3915,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) {
__put_partials(s, slab_to_put); @@ -4322,6 +4430,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -4347,9 +4456,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. */ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -4362,13 +4483,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -4380,7 +4502,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -4399,34 
+4521,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -4435,7 +4557,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -4443,7 +4566,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next = NULL; __put_partials(s, slab); @@ -4465,8 +4588,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + 
pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -4510,7 +4638,7 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; @@ -4533,7 +4661,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -4544,7 +4672,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -4553,9 +4681,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -4565,6 +4698,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. 
+ */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -4584,8 +4730,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4709,7 +4866,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4788,8 +4945,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -5456,6 +5614,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. 
+ */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. 
+ */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -6117,6 +6365,93 @@ flush_remote: } } +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
+ */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -6137,6 +6472,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 
signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -6155,10 +6492,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -6169,11 +6525,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -6183,7 +6541,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -6414,6 +6772,71 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. 
+ */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or are not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take the chance and report the kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and is not supported from NMI. 
+ */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -7236,6 +7659,9 @@ void __kmem_cache_release(struct kmem_cache *s) if (s->cpu_sheaves) pcs_destroy(s); #ifndef CONFIG_SLUB_TINY +#ifdef CONFIG_PREEMPT_RT + lockdep_unregister_key(&s->lock_key); +#endif free_percpu(s->cpu_slab); #endif free_kmem_cache_nodes(s); |