diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-27 15:26:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-01-27 15:26:06 -0800 |
commit | deee7487f5d495d0d9e5ab40d866d69ad524c46a (patch) | |
tree | 771e875dcbc8fa7d5a74af283e194ea08c09dfb8 | |
parent | 805ba04cb7ccfc7d72e834ebd796e043142156ba (diff) | |
parent | 5820a3b08987951e3e4a89fca8ab6e1448f672e1 (diff) |
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio updates from Michael Tsirkin:
"A small number of improvements all over the place:
- vdpa/octeon support for multiple interrupts
- virtio-pci support for error recovery
- vp_vdpa support for notification with data
- vhost/net fix to set num_buffers for spec compliance
- virtio-mem now works with kdump on s390
And small cleanups all over the place"
* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (23 commits)
virtio_blk: Add support for transport error recovery
virtio_pci: Add support for PCIe Function Level Reset
vhost/net: Set num_buffers for virtio 1.0
vdpa/octeon_ep: read vendor-specific PCI capability
virtio-pci: define type and header for PCI vendor data
vdpa/octeon_ep: handle device config change events
vdpa/octeon_ep: enable support for multiple interrupts per device
vdpa: solidrun: Replace deprecated PCI functions
s390/kdump: virtio-mem kdump support (CONFIG_PROC_VMCORE_DEVICE_RAM)
virtio-mem: support CONFIG_PROC_VMCORE_DEVICE_RAM
virtio-mem: remember usable region size
virtio-mem: mark device ready before registering callbacks in kdump mode
fs/proc/vmcore: introduce PROC_VMCORE_DEVICE_RAM to detect device RAM ranges in 2nd kernel
fs/proc/vmcore: factor out freeing a list of vmcore ranges
fs/proc/vmcore: factor out allocating a vmcore range and adding it to a list
fs/proc/vmcore: move vmcore definitions out of kcore.h
fs/proc/vmcore: prefix all pr_* with "vmcore:"
fs/proc/vmcore: disallow vmcore modifications while the vmcore is open
fs/proc/vmcore: replace vmcoredd_mutex by vmcore_mutex
fs/proc/vmcore: convert vmcore_cb_lock into vmcore_mutex
...
-rw-r--r-- | arch/s390/Kconfig | 1 | ||||
-rw-r--r-- | arch/s390/kernel/crash_dump.c | 39 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 28 | ||||
-rw-r--r-- | drivers/vdpa/octeon_ep/octep_vdpa.h | 32 | ||||
-rw-r--r-- | drivers/vdpa/octeon_ep/octep_vdpa_hw.c | 38 | ||||
-rw-r--r-- | drivers/vdpa/octeon_ep/octep_vdpa_main.c | 99 | ||||
-rw-r--r-- | drivers/vdpa/solidrun/snet_main.c | 57 | ||||
-rw-r--r-- | drivers/vdpa/virtio_pci/vp_vdpa.c | 9 | ||||
-rw-r--r-- | drivers/vhost/net.c | 5 | ||||
-rw-r--r-- | drivers/virtio/virtio.c | 94 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 2 | ||||
-rw-r--r-- | drivers/virtio/virtio_mem.c | 103 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_common.c | 41 | ||||
-rw-r--r-- | fs/proc/Kconfig | 19 | ||||
-rw-r--r-- | fs/proc/vmcore.c | 283 | ||||
-rw-r--r-- | include/linux/crash_dump.h | 41 | ||||
-rw-r--r-- | include/linux/kcore.h | 13 | ||||
-rw-r--r-- | include/linux/virtio.h | 8 | ||||
-rw-r--r-- | include/uapi/linux/vduse.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/virtio_pci.h | 14 |
20 files changed, 735 insertions, 193 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 570558a99359d..6e9545d8b0c72 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -244,6 +244,7 @@ config S390 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PROC_VMCORE_DEVICE_RAM if PROC_VMCORE select NEED_SG_DMA_LENGTH if PCI select OLD_SIGACTION select OLD_SIGSUSPEND3 diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index dc7328fd2ec4d..276cb4c1e11be 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -506,6 +506,19 @@ static int get_mem_chunk_cnt(void) return cnt; } +static void fill_ptload(Elf64_Phdr *phdr, unsigned long paddr, + unsigned long vaddr, unsigned long size) +{ + phdr->p_type = PT_LOAD; + phdr->p_vaddr = vaddr; + phdr->p_offset = paddr; + phdr->p_paddr = paddr; + phdr->p_filesz = size; + phdr->p_memsz = size; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize ELF loads (new kernel) */ @@ -518,14 +531,8 @@ static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm) if (os_info_has_vm) old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); for_each_physmem_range(idx, &oldmem_type, &start, &end) { - phdr->p_type = PT_LOAD; - phdr->p_vaddr = old_identity_base + start; - phdr->p_offset = start; - phdr->p_paddr = start; - phdr->p_filesz = end - start; - phdr->p_memsz = end - start; - phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_align = PAGE_SIZE; + fill_ptload(phdr, start, old_identity_base + start, + end - start); phdr++; } } @@ -535,6 +542,22 @@ static bool os_info_has_vm(void) return os_info_old_value(OS_INFO_KASLR_OFFSET); } +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +/* + * Fill PT_LOAD for a physical memory range owned by a device and detected by + * its device driver. 
+ */ +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size) +{ + unsigned long old_identity_base = 0; + + if (os_info_has_vm()) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); + fill_ptload(phdr, paddr, old_identity_base + paddr, size); +} +#endif + /* * Prepare PT_LOAD type program header for kernel image region */ diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index bbaa26b523b8d..bfbe391c20fee 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1579,8 +1579,7 @@ static void virtblk_remove(struct virtio_device *vdev) put_disk(vblk->disk); } -#ifdef CONFIG_PM_SLEEP -static int virtblk_freeze(struct virtio_device *vdev) +static int virtblk_freeze_priv(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; struct request_queue *q = vblk->disk->queue; @@ -1602,7 +1601,7 @@ static int virtblk_freeze(struct virtio_device *vdev) return 0; } -static int virtblk_restore(struct virtio_device *vdev) +static int virtblk_restore_priv(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; int ret; @@ -1616,8 +1615,29 @@ static int virtblk_restore(struct virtio_device *vdev) return 0; } + +#ifdef CONFIG_PM_SLEEP +static int virtblk_freeze(struct virtio_device *vdev) +{ + return virtblk_freeze_priv(vdev); +} + +static int virtblk_restore(struct virtio_device *vdev) +{ + return virtblk_restore_priv(vdev); +} #endif +static int virtblk_reset_prepare(struct virtio_device *vdev) +{ + return virtblk_freeze_priv(vdev); +} + +static int virtblk_reset_done(struct virtio_device *vdev) +{ + return virtblk_restore_priv(vdev); +} + static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, { 0 }, @@ -1653,6 +1673,8 @@ static struct virtio_driver virtio_blk = { .freeze = virtblk_freeze, .restore = virtblk_restore, #endif + .reset_prepare = virtblk_reset_prepare, + .reset_done = virtblk_reset_done, }; static int 
__init virtio_blk_init(void) diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h index 046710ec4d424..53b020b019f73 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa.h +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h @@ -8,6 +8,7 @@ #include <linux/pci_regs.h> #include <linux/vdpa.h> #include <linux/virtio_pci_modern.h> +#include <uapi/linux/virtio_crypto.h> #include <uapi/linux/virtio_net.h> #include <uapi/linux/virtio_blk.h> #include <uapi/linux/virtio_config.h> @@ -29,12 +30,12 @@ #define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25)) #define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17)) #define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4)) - -#define OCTEP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF) -#define OCTEP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0x7F) +#define OCTEP_VF_IN_CTRL(x) (0x00010000 | ((x) << 17)) +#define OCTEP_VF_IN_CTRL_RPVF(val) (((val) >> 48) & 0xF) #define OCTEP_FW_READY_SIGNATURE0 0xFEEDFEED #define OCTEP_FW_READY_SIGNATURE1 0x3355ffaa +#define OCTEP_MAX_CB_INTR 8 enum octep_vdpa_dev_status { OCTEP_VDPA_DEV_STATUS_INVALID, @@ -48,9 +49,26 @@ enum octep_vdpa_dev_status { struct octep_vring_info { struct vdpa_callback cb; void __iomem *notify_addr; - u32 __iomem *cb_notify_addr; + void __iomem *cb_notify_addr; phys_addr_t notify_pa; - char msix_name[256]; +}; + +enum octep_pci_vndr_cfg_type { + OCTEP_PCI_VNDR_CFG_TYPE_VIRTIO_ID, + OCTEP_PCI_VNDR_CFG_TYPE_MAX, +}; + +struct octep_pci_vndr_data { + struct virtio_pci_vndr_data hdr; + u8 id; + u8 bar; + union { + u64 data; + struct { + u32 offset; + u32 length; + }; + }; }; struct octep_hw { @@ -68,7 +86,9 @@ struct octep_hw { u64 features; u16 nr_vring; u32 config_size; - int irq; + int nb_irqs; + int *irqs; + u8 dev_id; }; u8 octep_hw_get_status(struct octep_hw *oct_hw); diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c index 1d4767b33315e..74240101c5052 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c +++ 
b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c @@ -2,6 +2,7 @@ /* Copyright (C) 2024 Marvell. */ #include <linux/iopoll.h> +#include <linux/build_bug.h> #include "octep_vdpa.h" @@ -358,7 +359,14 @@ u16 octep_get_vq_size(struct octep_hw *oct_hw) static u32 octep_get_config_size(struct octep_hw *oct_hw) { - return sizeof(struct virtio_net_config); + switch (oct_hw->dev_id) { + case VIRTIO_ID_NET: + return sizeof(struct virtio_net_config); + case VIRTIO_ID_CRYPTO: + return sizeof(struct virtio_crypto_config); + default: + return 0; + } } static void __iomem *octep_get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap) @@ -416,8 +424,25 @@ static int octep_pci_signature_verify(struct octep_hw *oct_hw) return 0; } +static void octep_vndr_data_process(struct octep_hw *oct_hw, + struct octep_pci_vndr_data *vndr_data) +{ + BUILD_BUG_ON(sizeof(struct octep_pci_vndr_data) % 4 != 0); + + switch (vndr_data->id) { + case OCTEP_PCI_VNDR_CFG_TYPE_VIRTIO_ID: + oct_hw->dev_id = (u8)vndr_data->data; + break; + default: + dev_err(&oct_hw->pdev->dev, "Invalid vendor data id %u\n", + vndr_data->id); + break; + } +} + int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev) { + struct octep_pci_vndr_data vndr_data; struct octep_mbox __iomem *mbox; struct device *dev = &pdev->dev; struct virtio_pci_cap cap; @@ -466,6 +491,15 @@ int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev) case VIRTIO_PCI_CAP_ISR_CFG: oct_hw->isr = octep_get_cap_addr(oct_hw, &cap); break; + case VIRTIO_PCI_CAP_VENDOR_CFG: + octep_pci_caps_read(oct_hw, &vndr_data, sizeof(vndr_data), pos); + if (vndr_data.hdr.vendor_id != PCI_VENDOR_ID_CAVIUM) { + dev_err(dev, "Invalid vendor data\n"); + return -EINVAL; + } + + octep_vndr_data_process(oct_hw, &vndr_data); + break; } pos = cap.cap_next; @@ -495,8 +529,6 @@ int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev) if (!oct_hw->vqs) return -ENOMEM; - oct_hw->irq = -1; - dev_info(&pdev->dev, "Device features : %llx\n", 
oct_hw->features); dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring); diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index cd55b1aac1512..f3d4dda4e04cd 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -49,58 +49,89 @@ static irqreturn_t octep_vdpa_intr_handler(int irq, void *data) struct octep_hw *oct_hw = data; int i; - for (i = 0; i < oct_hw->nr_vring; i++) { - if (oct_hw->vqs[i].cb.callback && ioread32(oct_hw->vqs[i].cb_notify_addr)) { - /* Acknowledge the per queue notification to the device */ - iowrite32(0, oct_hw->vqs[i].cb_notify_addr); - oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private); + /* Each device has multiple interrupts (nb_irqs) shared among rings + * (nr_vring). Device interrupts are mapped to the rings in a + * round-robin fashion. + * + * For example, if nb_irqs = 8 and nr_vring = 64: + * 0 -> 0, 8, 16, 24, 32, 40, 48, 56; + * 1 -> 1, 9, 17, 25, 33, 41, 49, 57; + * ... + * 7 -> 7, 15, 23, 31, 39, 47, 55, 63; + */ + + for (i = irq - oct_hw->irqs[0]; i < oct_hw->nr_vring; i += oct_hw->nb_irqs) { + if (ioread8(oct_hw->vqs[i].cb_notify_addr)) { + /* Acknowledge the per ring notification to the device */ + iowrite8(0, oct_hw->vqs[i].cb_notify_addr); + + if (likely(oct_hw->vqs[i].cb.callback)) + oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private); + break; } } + /* Check for config interrupt. 
Config uses the first interrupt */ + if (unlikely(irq == oct_hw->irqs[0] && ioread8(oct_hw->isr))) { + iowrite8(0, oct_hw->isr); + + if (oct_hw->config_cb.callback) + oct_hw->config_cb.callback(oct_hw->config_cb.private); + } + return IRQ_HANDLED; } static void octep_free_irqs(struct octep_hw *oct_hw) { struct pci_dev *pdev = oct_hw->pdev; + int irq; - if (oct_hw->irq != -1) { - devm_free_irq(&pdev->dev, oct_hw->irq, oct_hw); - oct_hw->irq = -1; + if (!oct_hw->irqs) + return; + + for (irq = 0; irq < oct_hw->nb_irqs; irq++) { + if (!oct_hw->irqs[irq]) + break; + + devm_free_irq(&pdev->dev, oct_hw->irqs[irq], oct_hw); } + pci_free_irq_vectors(pdev); + devm_kfree(&pdev->dev, oct_hw->irqs); + oct_hw->irqs = NULL; } static int octep_request_irqs(struct octep_hw *oct_hw) { struct pci_dev *pdev = oct_hw->pdev; - int ret, irq; + int ret, irq, idx; - /* Currently HW device provisions one IRQ per VF, hence - * allocate one IRQ for all virtqueues call interface. - */ - ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX); + oct_hw->irqs = devm_kcalloc(&pdev->dev, oct_hw->nb_irqs, sizeof(int), GFP_KERNEL); + if (!oct_hw->irqs) + return -ENOMEM; + + ret = pci_alloc_irq_vectors(pdev, 1, oct_hw->nb_irqs, PCI_IRQ_MSIX); if (ret < 0) { dev_err(&pdev->dev, "Failed to alloc msix vector"); return ret; } - snprintf(oct_hw->vqs->msix_name, sizeof(oct_hw->vqs->msix_name), - OCTEP_VDPA_DRIVER_NAME "-vf-%d", pci_iov_vf_id(pdev)); - - irq = pci_irq_vector(pdev, 0); - ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0, - oct_hw->vqs->msix_name, oct_hw); - if (ret) { - dev_err(&pdev->dev, "Failed to register interrupt handler\n"); - goto free_irq_vec; + for (idx = 0; idx < oct_hw->nb_irqs; idx++) { + irq = pci_irq_vector(pdev, idx); + ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0, + dev_name(&pdev->dev), oct_hw); + if (ret) { + dev_err(&pdev->dev, "Failed to register interrupt handler\n"); + goto free_irqs; + } + oct_hw->irqs[idx] = irq; } - oct_hw->irq = 
irq; return 0; -free_irq_vec: - pci_free_irq_vectors(pdev); +free_irqs: + octep_free_irqs(oct_hw); return ret; } @@ -271,7 +302,9 @@ static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev) static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev) { - return VIRTIO_ID_NET; + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return oct_hw->dev_id; } static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) @@ -559,6 +592,7 @@ static void octep_vdpa_setup_task(struct work_struct *work) struct device *dev = &pdev->dev; struct octep_hw *oct_hw; unsigned long timeout; + u64 val; int ret; oct_hw = &mgmt_dev->oct_hw; @@ -590,6 +624,13 @@ static void octep_vdpa_setup_task(struct work_struct *work) if (ret) return; + val = readq(oct_hw->base[OCTEP_HW_MBOX_BAR] + OCTEP_VF_IN_CTRL(0)); + oct_hw->nb_irqs = OCTEP_VF_IN_CTRL_RPVF(val); + if (!oct_hw->nb_irqs || oct_hw->nb_irqs > OCTEP_MAX_CB_INTR) { + dev_err(dev, "Invalid number of interrupts %d\n", oct_hw->nb_irqs); + goto unmap_region; + } + ret = octep_hw_caps_read(oct_hw, pdev); if (ret < 0) goto unmap_region; @@ -768,12 +809,6 @@ static int octep_vdpa_pf_setup(struct octep_pf *octpf) return -EINVAL; } - if (OCTEP_EPF_RINFO_RPVF(val) != BIT_ULL(0)) { - val &= ~GENMASK_ULL(35, 32); - val |= BIT_ULL(32); - writeq(val, addr + OCTEP_EPF_RINFO(0)); - } - len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR); octpf->vf_stride = len / totalvfs; diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index c8b74980dbd17..55ec51c17ab35 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -556,36 +556,38 @@ static const struct vdpa_config_ops snet_config_ops = { static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) { char *name; - int ret, i, mask = 0; + unsigned short i; + bool bars_found = false; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev)); + if (!name) + return -ENOMEM; + /* We don't 
know which BAR will be used to communicate.. * We will map every bar with len > 0. * * Later, we will discover the BAR and unmap all other BARs. */ for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (pci_resource_len(pdev, i)) - mask |= (1 << i); - } + void __iomem *io; - /* No BAR can be used.. */ - if (!mask) { - SNET_ERR(pdev, "Failed to find a PCI BAR\n"); - return -ENODEV; - } + if (pci_resource_len(pdev, i) == 0) + continue; - name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev)); - if (!name) - return -ENOMEM; + io = pcim_iomap_region(pdev, i, name); + if (IS_ERR(io)) { + SNET_ERR(pdev, "Failed to request and map PCI BARs\n"); + return PTR_ERR(io); + } - ret = pcim_iomap_regions(pdev, mask, name); - if (ret) { - SNET_ERR(pdev, "Failed to request and map PCI BARs\n"); - return ret; + psnet->bars[i] = io; + bars_found = true; } - for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (mask & (1 << i)) - psnet->bars[i] = pcim_iomap_table(pdev)[i]; + /* No BAR can be used.. 
*/ + if (!bars_found) { + SNET_ERR(pdev, "Failed to find a PCI BAR\n"); + return -ENODEV; } return 0; @@ -594,20 +596,20 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet) { char *name; - int ret; + void __iomem *io; name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev)); if (!name) return -ENOMEM; /* Request and map BAR */ - ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name); - if (ret) { + io = pcim_iomap_region(pdev, snet->psnet->cfg.vf_bar, name); + if (IS_ERR(io)) { SNET_ERR(pdev, "Failed to request and map PCI BAR for a VF\n"); - return ret; + return PTR_ERR(io); } - snet->bar = pcim_iomap_table(pdev)[snet->psnet->cfg.vf_bar]; + snet->bar = io; return 0; } @@ -656,15 +658,12 @@ static int psnet_detect_bar(struct psnet *psnet, u32 off) static void psnet_unmap_unused_bars(struct pci_dev *pdev, struct psnet *psnet) { - int i, mask = 0; + unsigned short i; for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (psnet->bars[i] && i != psnet->barno) - mask |= (1 << i); + pcim_iounmap_region(pdev, i); } - - if (mask) - pcim_iounmap_regions(pdev, mask); } /* Read SNET config from PCI BAR */ diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 16380764275ea..8787407f75b06 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -367,6 +367,14 @@ static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid) vp_iowrite16(qid, vp_vdpa->vring[qid].notify); } +static void vp_vdpa_kick_vq_with_data(struct vdpa_device *vdpa, u32 data) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + u16 qid = data & 0xFFFF; + + vp_iowrite32(data, vp_vdpa->vring[qid].notify); +} + static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa) { struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); @@ -472,6 +480,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .get_vq_size = 
vp_vdpa_get_vq_size, .set_vq_address = vp_vdpa_set_vq_address, .kick_vq = vp_vdpa_kick_vq, + .kick_vq_with_data = vp_vdpa_kick_vq_with_data, .get_generation = vp_vdpa_get_generation, .get_device_id = vp_vdpa_get_device_id, .get_vendor_id = vp_vdpa_get_vendor_id, diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9ad37c0121890..b9b9e9d409518 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1107,6 +1107,7 @@ static void handle_rx(struct vhost_net *net) size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; + bool set_num_buffers; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; @@ -1129,6 +1130,8 @@ static void handle_rx(struct vhost_net *net) vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + set_num_buffers = mergeable || + vhost_has_feature(vq, VIRTIO_F_VERSION_1); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, @@ -1205,7 +1208,7 @@ static void handle_rx(struct vhost_net *net) /* TODO: Should check and handle checksum. 
*/ num_buffers = cpu_to_vhost16(vq, headcount); - if (likely(mergeable) && + if (likely(set_num_buffers) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index b10ed9f5b5435..ba37665188b51 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -546,29 +546,7 @@ void unregister_virtio_device(struct virtio_device *dev) } EXPORT_SYMBOL_GPL(unregister_virtio_device); -#ifdef CONFIG_PM_SLEEP -int virtio_device_freeze(struct virtio_device *dev) -{ - struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); - int ret; - - virtio_config_core_disable(dev); - - dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; - - if (drv && drv->freeze) { - ret = drv->freeze(dev); - if (ret) { - virtio_config_core_enable(dev); - return ret; - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_device_freeze); - -int virtio_device_restore(struct virtio_device *dev) +static int virtio_device_restore_priv(struct virtio_device *dev, bool restore) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; @@ -599,8 +577,14 @@ int virtio_device_restore(struct virtio_device *dev) if (ret) goto err; - if (drv->restore) { - ret = drv->restore(dev); + if (restore) { + if (drv->restore) { + ret = drv->restore(dev); + if (ret) + goto err; + } + } else { + ret = drv->reset_done(dev); if (ret) goto err; } @@ -617,9 +601,69 @@ err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; } + +#ifdef CONFIG_PM_SLEEP +int virtio_device_freeze(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + int ret; + + virtio_config_core_disable(dev); + + dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; + + if (drv && drv->freeze) { + ret = drv->freeze(dev); + if (ret) { + virtio_config_core_enable(dev); + return ret; + } + } + + return 0; +} 
+EXPORT_SYMBOL_GPL(virtio_device_freeze); + +int virtio_device_restore(struct virtio_device *dev) +{ + return virtio_device_restore_priv(dev, true); +} EXPORT_SYMBOL_GPL(virtio_device_restore); #endif +int virtio_device_reset_prepare(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + int ret; + + if (!drv || !drv->reset_prepare) + return -EOPNOTSUPP; + + virtio_config_core_disable(dev); + + dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; + + ret = drv->reset_prepare(dev); + if (ret) { + virtio_config_core_enable(dev); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_device_reset_prepare); + +int virtio_device_reset_done(struct virtio_device *dev) +{ + struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + + if (!drv || !drv->reset_done) + return -EOPNOTSUPP; + + return virtio_device_restore_priv(dev, false); +} +EXPORT_SYMBOL_GPL(virtio_device_reset_done); + static int virtio_init(void) { if (bus_register(&virtio_bus) != 0) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b36d2803674ef..89da052f4f687 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -251,7 +251,7 @@ static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) for (num_pfns = 0; num_pfns < num; num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { - struct page *page = balloon_page_alloc(); + page = balloon_page_alloc(); if (!page) { dev_info_ratelimited(&vb->vdev->dev, diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index b0b8714415783..8a294b9cbcf68 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -133,6 +133,8 @@ struct virtio_mem { uint64_t addr; /* Maximum region size in bytes. */ uint64_t region_size; + /* Usable region size in bytes. */ + uint64_t usable_region_size; /* The parent resource for all memory added via this device. 
*/ struct resource *parent_resource; @@ -2368,7 +2370,7 @@ static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) static void virtio_mem_refresh_config(struct virtio_mem *vm) { const struct range pluggable_range = mhp_get_pluggable_range(true); - uint64_t new_plugged_size, usable_region_size, end_addr; + uint64_t new_plugged_size, end_addr; /* the plugged_size is just a reflection of what _we_ did previously */ virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, @@ -2378,8 +2380,8 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) /* calculate the last usable memory block id */ virtio_cread_le(vm->vdev, struct virtio_mem_config, - usable_region_size, &usable_region_size); - end_addr = min(vm->addr + usable_region_size - 1, + usable_region_size, &vm->usable_region_size); + end_addr = min(vm->addr + vm->usable_region_size - 1, pluggable_range.end); if (vm->in_sbm) { @@ -2648,6 +2650,7 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm) if (rc) goto out_unreg_pm; + virtio_device_ready(vm->vdev); return 0; out_unreg_pm: unregister_pm_notifier(&vm->pm_notifier); @@ -2725,13 +2728,103 @@ static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, mutex_unlock(&vm->hotplug_mutex); return is_ram; } + +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int virtio_mem_vmcore_add_device_ram(struct virtio_mem *vm, + struct list_head *list, uint64_t start, uint64_t end) +{ + int rc; + + rc = vmcore_alloc_add_range(list, start, end - start); + if (rc) + dev_err(&vm->vdev->dev, + "Error adding device RAM range: %d\n", rc); + return rc; +} + +static int virtio_mem_vmcore_get_device_ram(struct vmcore_cb *cb, + struct list_head *list) +{ + struct virtio_mem *vm = container_of(cb, struct virtio_mem, + vmcore_cb); + const uint64_t device_start = vm->addr; + const uint64_t device_end = vm->addr + vm->usable_region_size; + uint64_t chunk_size, cur_start, cur_end, plugged_range_start = 0; + LIST_HEAD(tmp_list); + int rc; + + if 
(!vm->plugged_size) + return 0; + + /* Process memory sections, unless the device block size is bigger. */ + chunk_size = max_t(uint64_t, PFN_PHYS(PAGES_PER_SECTION), + vm->device_block_size); + + mutex_lock(&vm->hotplug_mutex); + + /* + * We process larger chunks and indicate the complete chunk if any + * block in there is plugged. This reduces the number of pfn_is_ram() + * callbacks and mimic what is effectively being done when the old + * kernel would add complete memory sections/blocks to the elfcore hdr. + */ + cur_start = device_start; + for (cur_start = device_start; cur_start < device_end; cur_start = cur_end) { + cur_end = ALIGN_DOWN(cur_start + chunk_size, chunk_size); + cur_end = min_t(uint64_t, cur_end, device_end); + + rc = virtio_mem_send_state_request(vm, cur_start, + cur_end - cur_start); + + if (rc < 0) { + dev_err(&vm->vdev->dev, + "Error querying block states: %d\n", rc); + goto out; + } else if (rc != VIRTIO_MEM_STATE_UNPLUGGED) { + /* Merge ranges with plugged memory. */ + if (!plugged_range_start) + plugged_range_start = cur_start; + continue; + } + + /* Flush any plugged range. */ + if (plugged_range_start) { + rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list, + plugged_range_start, + cur_start); + if (rc) + goto out; + plugged_range_start = 0; + } + } + + /* Flush any plugged range. */ + if (plugged_range_start) + rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list, + plugged_range_start, + cur_start); +out: + mutex_unlock(&vm->hotplug_mutex); + if (rc < 0) { + vmcore_free_ranges(&tmp_list); + return rc; + } + list_splice_tail(&tmp_list, list); + return 0; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ #endif /* CONFIG_PROC_VMCORE */ static int virtio_mem_init_kdump(struct virtio_mem *vm) { + /* We must be prepared to receive a callback immediately. 
*/ + virtio_device_ready(vm->vdev); #ifdef CONFIG_PROC_VMCORE dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM + vm->vmcore_cb.get_device_ram = virtio_mem_vmcore_get_device_ram; +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ register_vmcore_cb(&vm->vmcore_cb); return 0; #else /* CONFIG_PROC_VMCORE */ @@ -2760,6 +2853,8 @@ static int virtio_mem_init(struct virtio_mem *vm) virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, &vm->region_size); + virtio_cread_le(vm->vdev, struct virtio_mem_config, usable_region_size, + &vm->usable_region_size); /* Determine the nid for the device based on the lowest address. */ if (vm->nid == NUMA_NO_NODE) @@ -2870,8 +2965,6 @@ static int virtio_mem_probe(struct virtio_device *vdev) if (rc) goto out_del_vq; - virtio_device_ready(vdev); - /* trigger a config update to start processing the requested_size */ if (!vm->in_kdump) { atomic_set(&vm->config_changed, 1); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 88074451dd615..d6d79af44569b 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -794,6 +794,46 @@ static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) return num_vfs; } +static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret = 0; + + ret = virtio_device_reset_prepare(&vp_dev->vdev); + if (ret) { + if (ret != -EOPNOTSUPP) + dev_warn(&pci_dev->dev, "Reset prepare failure: %d", + ret); + return; + } + + if (pci_is_enabled(pci_dev)) + pci_disable_device(pci_dev); +} + +static void virtio_pci_reset_done(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + if (pci_is_enabled(pci_dev)) + return; + + ret 
= pci_enable_device(pci_dev); + if (!ret) { + pci_set_master(pci_dev); + ret = virtio_device_reset_done(&vp_dev->vdev); + } + + if (ret && ret != -EOPNOTSUPP) + dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); +} + +static const struct pci_error_handlers virtio_pci_err_handler = { + .reset_prepare = virtio_pci_reset_prepare, + .reset_done = virtio_pci_reset_done, +}; + static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, @@ -803,6 +843,7 @@ static struct pci_driver virtio_pci_driver = { .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, + .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index d80a1431ef7be..6ae966c561e73 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -61,6 +61,25 @@ config PROC_VMCORE_DEVICE_DUMP as ELF notes to /proc/vmcore. You can still disable device dump using the kernel command line option 'novmcoredd'. +config NEED_PROC_VMCORE_DEVICE_RAM + bool + +config PROC_VMCORE_DEVICE_RAM + def_bool y + depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM + depends on VIRTIO_MEM + help + If the elfcore hdr is allocated and prepared by the dump kernel + ("2nd kernel") instead of the crashed kernel, RAM provided by memory + devices such as virtio-mem will not be included in the dump + image, because only the device driver can properly detect them. + + With this config enabled, these RAM ranges will be queried from the + device drivers once the device gets probed, so they can be included + in the crash dump. + + Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM. 
+ config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 658bf199d4247..a00120a3c0994 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) "vmcore: " fmt + #include <linux/mm.h> #include <linux/kcore.h> #include <linux/user.h> @@ -51,9 +53,14 @@ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +struct vmcoredd_node { + struct list_head list; /* List of dumps */ + void *buf; /* Buffer containing device's dump */ + unsigned int size; /* Size of the buffer */ +}; + /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); -static DEFINE_MUTEX(vmcoredd_mutex); static bool vmcoredd_disabled; core_param(novmcoredd, vmcoredd_disabled, bool, 0); @@ -62,17 +69,22 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DEFINE_SPINLOCK(vmcore_cb_lock); +static DEFINE_MUTEX(vmcore_mutex); + DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ static bool vmcore_opened; +/* Whether the vmcore is currently open. 
*/ +static unsigned int vmcore_open; + +static void vmcore_process_device_ram(struct vmcore_cb *cb); void register_vmcore_cb(struct vmcore_cb *cb) { INIT_LIST_HEAD(&cb->next); - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -80,13 +92,15 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - spin_unlock(&vmcore_cb_lock); + if (!vmcore_open && cb->get_device_ram) + vmcore_process_device_ram(cb); + mutex_unlock(&vmcore_mutex); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is @@ -95,7 +109,7 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - spin_unlock(&vmcore_cb_lock); + mutex_unlock(&vmcore_mutex); synchronize_srcu(&vmcore_cb_srcu); } @@ -120,9 +134,23 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); vmcore_opened = true; - spin_unlock(&vmcore_cb_lock); + if (vmcore_open + 1 == 0) { + mutex_unlock(&vmcore_mutex); + return -EBUSY; + } + vmcore_open++; + mutex_unlock(&vmcore_mutex); + + return 0; +} + +static int release_vmcore(struct inode *inode, struct file *file) +{ + mutex_lock(&vmcore_mutex); + vmcore_open--; + mutex_unlock(&vmcore_mutex); return 0; } @@ -243,33 +271,27 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + 
(u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to_iter(buf, tsz, iter) < tsz) { - ret = -EFAULT; - goto out_unlock; - } + if (copy_to_iter(buf, tsz, iter) < tsz) + return -EFAULT; size -= tsz; start += tsz; /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcoredd_mutex); - return ret; + return 0; } #ifdef CONFIG_MMU @@ -278,20 +300,16 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; if (remap_vmalloc_range_partial(vma, dst, buf, 0, - tsz)) { - ret = -EFAULT; - goto out_unlock; - } + tsz)) + return -EFAULT; size -= tsz; start += tsz; @@ -299,14 +317,12 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcoredd_mutex); - return ret; + return 0; } #endif /* CONFIG_MMU */ #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ @@ -316,10 +332,10 @@ out_unlock: */ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { + struct vmcore_range *m = NULL; ssize_t acc = 0, tmp; size_t tsz; u64 start; - struct vmcore *m = NULL; if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; @@ -576,7 +592,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; u64 start, end, len, tsz; - struct vmcore *m; + struct vmcore_range *m; start = (u64)vma->vm_pgoff << PAGE_SHIFT; end = start + size; @@ -693,21 +709,17 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops 
vmcore_proc_ops = { .proc_open = open_vmcore, + .proc_release = release_vmcore, .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; -static struct vmcore* __init get_new_element(void) -{ - return kzalloc(sizeof(struct vmcore), GFP_KERNEL); -} - static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, struct list_head *vc_list) { + struct vmcore_range *m; u64 size; - struct vmcore *m; size = elfsz + elfnotesegsz; list_for_each_entry(m, vc_list, list) { @@ -1109,7 +1121,6 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, Elf64_Ehdr *ehdr_ptr; Elf64_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ @@ -1128,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset. 
*/ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -1152,7 +1158,6 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, Elf32_Ehdr *ehdr_ptr; Elf32_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ @@ -1171,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset */ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -1190,8 +1190,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, struct list_head *vc_list) { + struct vmcore_range *m; loff_t vmcore_off; - struct vmcore *m; /* Skip ELF header, program headers and ELF note segment. 
*/ vmcore_off = elfsz + elfnotes_sz; @@ -1518,12 +1518,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->buf = buf; dump->size = data_size; - /* Add the dump to driver sysfs list */ - mutex_lock(&vmcoredd_mutex); - list_add_tail(&dump->list, &vmcoredd_list); - mutex_unlock(&vmcoredd_mutex); + /* Add the dump to driver sysfs list and update the elfcore hdr */ + mutex_lock(&vmcore_mutex); + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } + list_add_tail(&dump->list, &vmcoredd_list); vmcoredd_update_size(data_size); + mutex_unlock(&vmcore_mutex); return 0; out_err: @@ -1535,11 +1541,163 @@ out_err: EXPORT_SYMBOL(vmcore_add_device_dump); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size) +{ + char *elfcorebuf_new; + + if (WARN_ON_ONCE(new_size < elfcorebuf_sz)) + return -EINVAL; + if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) { + elfcorebuf_sz_orig = new_size; + return 0; + } + + elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(new_size)); + if (!elfcorebuf_new) + return -ENOMEM; + memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz); + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = elfcorebuf_new; + elfcorebuf_sz_orig = new_size; + return 0; +} + +static void vmcore_reset_offsets_elf64(void) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz; + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + Elf64_Phdr *phdr; + int i; + + for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) { + u64 start, end; + + /* + * After merge_note_headers_elf64() we should only have a single + * PT_NOTE entry that starts immediately after elfcorebuf_sz. 
+ */ + if (phdr->p_type == PT_NOTE) { + phdr->p_offset = elfcorebuf_sz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE); + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off = vmcore_off + end - start; + } + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); +} + +static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + struct vmcore_range *cur; + Elf64_Phdr *phdr; + size_t new_size; + int rc; + + if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) { + pr_err("too many device ram ranges\n"); + return -ENOSPC; + } + + /* elfcorebuf_sz must always cover full pages. */ + new_size = sizeof(Elf64_Ehdr) + + (ehdr->e_phnum + count) * sizeof(Elf64_Phdr); + new_size = roundup(new_size, PAGE_SIZE); + + /* + * Make sure we have sufficient space to include the new PT_LOAD + * entries. + */ + rc = vmcore_realloc_elfcore_buffer_elf64(new_size); + if (rc) { + pr_err("resizing elfcore failed\n"); + return rc; + } + + /* Modify our used elfcore buffer size to cover the new entries. */ + elfcorebuf_sz = new_size; + + /* Fill the added PT_LOAD entries. */ + phdr = phdr_start + ehdr->e_phnum; + list_for_each_entry(cur, list, list) { + WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE)); + elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size); + + /* p_offset will be adjusted later. */ + phdr++; + ehdr->e_phnum++; + } + list_splice_tail(list, &vmcore_list); + + /* We changed elfcorebuf_sz and added new entries; reset all offsets. */ + vmcore_reset_offsets_elf64(); + + /* Finally, recalculate the total vmcore size. 
*/ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; + return 0; +} + +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ + unsigned char *e_ident = (unsigned char *)elfcorebuf; + struct vmcore_range *first, *m; + LIST_HEAD(list); + int count; + + /* We only support Elf64 dumps for now. */ + if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) { + pr_err("device ram ranges only support Elf64\n"); + return; + } + + if (cb->get_device_ram(cb, &list)) { + pr_err("obtaining device ram ranges failed\n"); + return; + } + count = list_count_nodes(&list); + if (!count) + return; + + /* + * For some reason these ranges are already known? Might happen + * with unusual register->unregister->register sequences; we'll simply + * sanity check using the first range. + */ + first = list_first_entry(&list, struct vmcore_range, list); + list_for_each_entry(m, &vmcore_list, list) { + unsigned long long m_end = m->paddr + m->size; + unsigned long long first_end = first->paddr + first->size; + + if (first->paddr < m_end && m->paddr < first_end) + goto out_free; + } + + /* If adding the mem nodes succeeds, they must not be freed. 
*/ + if (!vmcore_add_device_ram_elf64(&list, count)) + return; +out_free: + vmcore_free_ranges(&list); +} +#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */ +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ + /* Free all dumps in vmcore device dump list */ static void vmcore_free_device_dumps(void) { #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); while (!list_empty(&vmcoredd_list)) { struct vmcoredd_node *dump; @@ -1549,7 +1707,7 @@ static void vmcore_free_device_dumps(void) vfree(dump->buf); vfree(dump); } - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ } @@ -1571,7 +1729,7 @@ static int __init vmcore_init(void) rc = parse_crash_elf_headers(); if (rc) { elfcorehdr_free(elfcorehdr_addr); - pr_warn("Kdump: vmcore not initialized\n"); + pr_warn("not initialized\n"); return rc; } elfcorehdr_free(elfcorehdr_addr); @@ -1592,14 +1750,7 @@ void vmcore_cleanup(void) proc_vmcore = NULL; } - /* clear the vmcore list. 
*/ - while (!list_empty(&vmcore_list)) { - struct vmcore *m; - - m = list_first_entry(&vmcore_list, struct vmcore, list); - list_del(&m->list); - kfree(m); - } + vmcore_free_ranges(&vmcore_list); free_elfcorebuf(); /* clear vmcore device dump list */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index acc55626afdcd..2f2555e6407ce 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -20,6 +20,8 @@ extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); @@ -99,6 +101,12 @@ static inline void vmcore_unusable(void) * indicated in the vmcore instead. For example, a ballooned page * contains no data and reading from such a page will cause high * load in the hypervisor. + * @get_device_ram: query RAM ranges that can only be detected by device + * drivers, such as the virtio-mem driver, so they can be included in + * the crash dump on architectures that allocate the elfcore hdr in the dump + * ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the + * total number of ranges; such holes can be detected using the pfn_is_ram + * callback just like for other RAM. * @next: List head to manage registered callbacks internally; initialized by * register_vmcore_cb(). 
* @@ -109,11 +117,44 @@ static inline void vmcore_unusable(void) */ struct vmcore_cb { bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list); struct list_head next; }; extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); +struct vmcore_range { + struct list_head list; + unsigned long long paddr; + unsigned long long size; + loff_t offset; +}; + +/* Allocate a vmcore range and add it to the list. */ +static inline int vmcore_alloc_add_range(struct list_head *list, + unsigned long long paddr, unsigned long long size) +{ + struct vmcore_range *m = kzalloc(sizeof(*m), GFP_KERNEL); + + if (!m) + return -ENOMEM; + m->paddr = paddr; + m->size = size; + list_add_tail(&m->list, list); + return 0; +} + +/* Free a list of vmcore ranges. */ +static inline void vmcore_free_ranges(struct list_head *list) +{ + struct vmcore_range *m, *tmp; + + list_for_each_entry_safe(m, tmp, list, list) { + list_del(&m->list); + kfree(m); + } +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 86c0f1d189988..9a2fa013c91de 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -20,19 +20,6 @@ struct kcore_list { int type; }; -struct vmcore { - struct list_head list; - unsigned long long paddr; - unsigned long long size; - loff_t offset; -}; - -struct vmcoredd_node { - struct list_head list; /* List of dumps */ - void *buf; /* Buffer containing device's dump */ - unsigned int size; /* Size of the buffer */ -}; - #ifdef CONFIG_PROC_KCORE void __init kclist_add(struct kcore_list *, void *, size_t, int type); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dd88682e27e31..4d16c13d0df58 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -190,6 +190,8 @@ int virtio_device_freeze(struct 
virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); #endif void virtio_reset_device(struct virtio_device *dev); +int virtio_device_reset_prepare(struct virtio_device *dev); +int virtio_device_reset_done(struct virtio_device *dev); size_t virtio_max_dma_size(const struct virtio_device *vdev); @@ -214,6 +216,10 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); * changes; may be called in interrupt context. * @freeze: optional function to call during suspend/hibernation. * @restore: optional function to call on resume. + * @reset_prepare: optional function to call when a transport specific reset + * occurs. + * @reset_done: optional function to call after transport specific reset + * operation has finished. */ struct virtio_driver { struct device_driver driver; @@ -229,6 +235,8 @@ struct virtio_driver { void (*config_changed)(struct virtio_device *dev); int (*freeze)(struct virtio_device *dev); int (*restore)(struct virtio_device *dev); + int (*reset_prepare)(struct virtio_device *dev); + int (*reset_done)(struct virtio_device *dev); }; #define drv_to_virtio(__drv) container_of_const(__drv, struct virtio_driver, driver) diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 11bd48c72c6cc..68a627d04afa1 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ #ifndef _UAPI_VDUSE_H_ #define _UAPI_VDUSE_H_ diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 1beb317df1b9b..8549d45712571 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -116,6 +116,8 @@ #define VIRTIO_PCI_CAP_PCI_CFG 5 /* Additional shared memory capability */ #define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 +/* PCI vendor data configuration */ +#define VIRTIO_PCI_CAP_VENDOR_CFG 9 /* This is the PCI capability 
header: */ struct virtio_pci_cap { @@ -130,6 +132,18 @@ struct virtio_pci_cap { __le32 length; /* Length of the structure, in bytes. */ }; +/* This is the PCI vendor data capability header: */ +struct virtio_pci_vndr_data { + __u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ + __u8 cap_next; /* Generic PCI field: next ptr. */ + __u8 cap_len; /* Generic PCI field: capability length */ + __u8 cfg_type; /* Identifies the structure. */ + __u16 vendor_id; /* Identifies the vendor-specific format. */ + /* For Vendor Definition */ + /* Pads structure to a multiple of 4 bytes */ + /* Reads must not have side effects */ +}; + struct virtio_pci_cap64 { struct virtio_pci_cap cap; __le32 offset_hi; /* Most sig 32 bits of offset */ |