Diffstat (limited to 'drivers/iommu')
-rw-r--r--  drivers/iommu/Kconfig              110
-rw-r--r--  drivers/iommu/Makefile               5
-rw-r--r--  drivers/iommu/amd_iommu.c         2824
-rw-r--r--  drivers/iommu/amd_iommu_init.c    1574
-rw-r--r--  drivers/iommu/amd_iommu_proto.h     54
-rw-r--r--  drivers/iommu/amd_iommu_types.h    585
-rw-r--r--  drivers/iommu/dmar.c              1461
-rw-r--r--  drivers/iommu/intel-iommu.c       4016
-rw-r--r--  drivers/iommu/intr_remapping.c     797
-rw-r--r--  drivers/iommu/intr_remapping.h      17
-rw-r--r--  drivers/iommu/iommu.c              124
-rw-r--r--  drivers/iommu/iova.c               435
-rw-r--r--  drivers/iommu/msm_iommu.c          731
-rw-r--r--  drivers/iommu/msm_iommu_dev.c      422
14 files changed, 13155 insertions, 0 deletions
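Note: the new drivers/iommu/Kconfig below states that IOMMU_API is always selected by whoever wants it, with the individual drivers placed under the IOMMU_SUPPORT menu and the objects wired up in drivers/iommu/Makefile. As a minimal sketch of how a further IOMMU driver would hook into this layout (the FOO_IOMMU symbol, ARCH_FOO dependency and foo_iommu.o object are hypothetical, not part of this commit), such an entry would sit inside the "if IOMMU_SUPPORT ... endif" block, mirroring the MSM and AMD entries:

	# Hypothetical example only -- not part of this commit.
	config FOO_IOMMU
		bool "FOO IOMMU Support"
		depends on ARCH_FOO
		select IOMMU_API
		help
		  Support for the (hypothetical) FOO IOMMU. Selecting
		  IOMMU_API pulls in the generic iommu.o via the
		  obj-$(CONFIG_IOMMU_API) += iommu.o rule added below.

The driver object itself would then be added with an "obj-$(CONFIG_FOO_IOMMU) += foo_iommu.o" line in drivers/iommu/Makefile, following the same pattern as the MSM, AMD and DMAR entries in this patch.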
| diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig new file mode 100644 index 000000000000..b57b3fa492f3 --- /dev/null +++ b/drivers/iommu/Kconfig @@ -0,0 +1,110 @@ +# IOMMU_API always gets selected by whoever wants it. +config IOMMU_API +	bool + +menuconfig IOMMU_SUPPORT +	bool "IOMMU Hardware Support" +	default y +	---help--- +	  Say Y here if you want to compile device drivers for IO Memory +	  Management Units into the kernel. These devices usually allow to +	  remap DMA requests and/or remap interrupts from other devices on the +	  system. + +if IOMMU_SUPPORT + +# MSM IOMMU support +config MSM_IOMMU +	bool "MSM IOMMU Support" +	depends on ARCH_MSM8X60 || ARCH_MSM8960 +	select IOMMU_API +	help +	  Support for the IOMMUs found on certain Qualcomm SOCs. +	  These IOMMUs allow virtualization of the address space used by most +	  cores within the multimedia subsystem. + +	  If unsure, say N here. + +config IOMMU_PGTABLES_L2 +	def_bool y +	depends on MSM_IOMMU && MMU && SMP && CPU_DCACHE_DISABLE=n + +# AMD IOMMU support +config AMD_IOMMU +	bool "AMD IOMMU support" +	select SWIOTLB +	select PCI_MSI +	select PCI_IOV +	select IOMMU_API +	depends on X86_64 && PCI && ACPI +	---help--- +	  With this option you can enable support for AMD IOMMU hardware in +	  your system. An IOMMU is a hardware component which provides +	  remapping of DMA memory accesses from devices. With an AMD IOMMU you +	  can isolate the the DMA memory of different devices and protect the +	  system from misbehaving device drivers or hardware. + +	  You can find out if your system has an AMD IOMMU if you look into +	  your BIOS for an option to enable it or if you have an IVRS ACPI +	  table. + +config AMD_IOMMU_STATS +	bool "Export AMD IOMMU statistics to debugfs" +	depends on AMD_IOMMU +	select DEBUG_FS +	---help--- +	  This option enables code in the AMD IOMMU driver to collect various +	  statistics about whats happening in the driver and exports that +	  information to userspace via debugfs. +	  If unsure, say N. + +# Intel IOMMU support +config DMAR +	bool "Support for DMA Remapping Devices" +	depends on PCI_MSI && ACPI && (X86 || IA64_GENERIC) +	select IOMMU_API +	help +	  DMA remapping (DMAR) devices support enables independent address +	  translations for Direct Memory Access (DMA) from devices. +	  These DMA remapping devices are reported via ACPI tables +	  and include PCI device scope covered by these DMA +	  remapping devices. + +config DMAR_DEFAULT_ON +	def_bool y +	prompt "Enable DMA Remapping Devices by default" +	depends on DMAR +	help +	  Selecting this option will enable a DMAR device at boot time if +	  one is found. If this option is not selected, DMAR support can +	  be enabled by passing intel_iommu=on to the kernel. + +config DMAR_BROKEN_GFX_WA +	bool "Workaround broken graphics drivers (going away soon)" +	depends on DMAR && BROKEN && X86 +	---help--- +	  Current Graphics drivers tend to use physical address +	  for DMA and avoid using DMA APIs. Setting this config +	  option permits the IOMMU driver to set a unity map for +	  all the OS-visible memory. Hence the driver can continue +	  to use physical addresses for DMA, at least until this +	  option is removed in the 2.6.32 kernel. + +config DMAR_FLOPPY_WA +	def_bool y +	depends on DMAR && X86 +	---help--- +	  Floppy disk drivers are known to bypass DMA API calls +	  thereby failing to work when IOMMU is enabled. This +	  workaround will setup a 1:1 mapping for the first +	  16MiB to make floppy (an ISA device) work. 
+ +config INTR_REMAP +	bool "Support for Interrupt Remapping (EXPERIMENTAL)" +	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL +	---help--- +	  Supports Interrupt remapping for IO-APIC and MSI devices. +	  To use x2apic mode in the CPU's which support x2APIC enhancements or +	  to support platforms with CPU's having > 8 bit APIC ID, say Y. + +endif # IOMMU_SUPPORT diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile new file mode 100644 index 000000000000..4d4d77df7cac --- /dev/null +++ b/drivers/iommu/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_IOMMU_API) += iommu.o +obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o +obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o +obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o +obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c new file mode 100644 index 000000000000..a14f8dc23462 --- /dev/null +++ b/drivers/iommu/amd_iommu.c @@ -0,0 +1,2824 @@ +/* + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + *         Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/bitmap.h> +#include <linux/slab.h> +#include <linux/debugfs.h> +#include <linux/scatterlist.h> +#include <linux/dma-mapping.h> +#include <linux/iommu-helper.h> +#include <linux/iommu.h> +#include <linux/delay.h> +#include <linux/amd-iommu.h> +#include <asm/msidef.h> +#include <asm/proto.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/dma.h> + +#include "amd_iommu_proto.h" +#include "amd_iommu_types.h" + +#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) + +#define LOOP_TIMEOUT	100000 + +static DEFINE_RWLOCK(amd_iommu_devtable_lock); + +/* A list of preallocated protection domains */ +static LIST_HEAD(iommu_pd_list); +static DEFINE_SPINLOCK(iommu_pd_list_lock); + +/* List of all available dev_data structures */ +static LIST_HEAD(dev_data_list); +static DEFINE_SPINLOCK(dev_data_list_lock); + +/* + * Domain for untranslated devices - only allocated + * if iommu=pt passed on kernel cmd line. 
+ */ +static struct protection_domain *pt_domain; + +static struct iommu_ops amd_iommu_ops; + +/* + * general struct to manage commands send to an IOMMU + */ +struct iommu_cmd { +	u32 data[4]; +}; + +static void update_domain(struct protection_domain *domain); + +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static struct iommu_dev_data *alloc_dev_data(u16 devid) +{ +	struct iommu_dev_data *dev_data; +	unsigned long flags; + +	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); +	if (!dev_data) +		return NULL; + +	dev_data->devid = devid; +	atomic_set(&dev_data->bind, 0); + +	spin_lock_irqsave(&dev_data_list_lock, flags); +	list_add_tail(&dev_data->dev_data_list, &dev_data_list); +	spin_unlock_irqrestore(&dev_data_list_lock, flags); + +	return dev_data; +} + +static void free_dev_data(struct iommu_dev_data *dev_data) +{ +	unsigned long flags; + +	spin_lock_irqsave(&dev_data_list_lock, flags); +	list_del(&dev_data->dev_data_list); +	spin_unlock_irqrestore(&dev_data_list_lock, flags); + +	kfree(dev_data); +} + +static struct iommu_dev_data *search_dev_data(u16 devid) +{ +	struct iommu_dev_data *dev_data; +	unsigned long flags; + +	spin_lock_irqsave(&dev_data_list_lock, flags); +	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) { +		if (dev_data->devid == devid) +			goto out_unlock; +	} + +	dev_data = NULL; + +out_unlock: +	spin_unlock_irqrestore(&dev_data_list_lock, flags); + +	return dev_data; +} + +static struct iommu_dev_data *find_dev_data(u16 devid) +{ +	struct iommu_dev_data *dev_data; + +	dev_data = search_dev_data(devid); + +	if (dev_data == NULL) +		dev_data = alloc_dev_data(devid); + +	return dev_data; +} + +static inline u16 get_device_id(struct device *dev) +{ +	struct pci_dev *pdev = to_pci_dev(dev); + +	return calc_devid(pdev->bus->number, pdev->devfn); +} + +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ +	return dev->archdata.iommu; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ +	struct dma_ops_domain *entry, *ret = NULL; +	unsigned long flags; +	u16 alias = amd_iommu_alias_table[devid]; + +	if (list_empty(&iommu_pd_list)) +		return NULL; + +	spin_lock_irqsave(&iommu_pd_list_lock, flags); + +	list_for_each_entry(entry, &iommu_pd_list, list) { +		if (entry->target_dev == devid || +		    entry->target_dev == alias) { +			ret = entry; +			break; +		} +	} + +	spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + +	return ret; +} + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ +	u16 devid; + +	if (!dev || !dev->dma_mask) +		return false; + +	/* No device or no PCI device */ +	if (dev->bus != &pci_bus_type) +		return false; + +	devid = get_device_id(dev); + +	/* Out of our scope? 
*/ +	if (devid > amd_iommu_last_bdf) +		return false; + +	if (amd_iommu_rlookup_table[devid] == NULL) +		return false; + +	return true; +} + +static int iommu_init_device(struct device *dev) +{ +	struct iommu_dev_data *dev_data; +	u16 alias; + +	if (dev->archdata.iommu) +		return 0; + +	dev_data = find_dev_data(get_device_id(dev)); +	if (!dev_data) +		return -ENOMEM; + +	alias = amd_iommu_alias_table[dev_data->devid]; +	if (alias != dev_data->devid) { +		struct iommu_dev_data *alias_data; + +		alias_data = find_dev_data(alias); +		if (alias_data == NULL) { +			pr_err("AMD-Vi: Warning: Unhandled device %s\n", +					dev_name(dev)); +			free_dev_data(dev_data); +			return -ENOTSUPP; +		} +		dev_data->alias_data = alias_data; +	} + +	dev->archdata.iommu = dev_data; + +	return 0; +} + +static void iommu_ignore_device(struct device *dev) +{ +	u16 devid, alias; + +	devid = get_device_id(dev); +	alias = amd_iommu_alias_table[devid]; + +	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry)); +	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry)); + +	amd_iommu_rlookup_table[devid] = NULL; +	amd_iommu_rlookup_table[alias] = NULL; +} + +static void iommu_uninit_device(struct device *dev) +{ +	/* +	 * Nothing to do here - we keep dev_data around for unplugged devices +	 * and reuse it when the device is re-plugged - not doing so would +	 * introduce a ton of races. +	 */ +} + +void __init amd_iommu_uninit_devices(void) +{ +	struct iommu_dev_data *dev_data, *n; +	struct pci_dev *pdev = NULL; + +	for_each_pci_dev(pdev) { + +		if (!check_device(&pdev->dev)) +			continue; + +		iommu_uninit_device(&pdev->dev); +	} + +	/* Free all of our dev_data structures */ +	list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list) +		free_dev_data(dev_data); +} + +int __init amd_iommu_init_devices(void) +{ +	struct pci_dev *pdev = NULL; +	int ret = 0; + +	for_each_pci_dev(pdev) { + +		if (!check_device(&pdev->dev)) +			continue; + +		ret = iommu_init_device(&pdev->dev); +		if (ret == -ENOTSUPP) +			iommu_ignore_device(&pdev->dev); +		else if (ret) +			goto out_free; +	} + +	return 0; + +out_free: + +	amd_iommu_uninit_devices(); + +	return ret; +} +#ifdef CONFIG_AMD_IOMMU_STATS + +/* + * Initialization code for statistics collection + */ + +DECLARE_STATS_COUNTER(compl_wait); +DECLARE_STATS_COUNTER(cnt_map_single); +DECLARE_STATS_COUNTER(cnt_unmap_single); +DECLARE_STATS_COUNTER(cnt_map_sg); +DECLARE_STATS_COUNTER(cnt_unmap_sg); +DECLARE_STATS_COUNTER(cnt_alloc_coherent); +DECLARE_STATS_COUNTER(cnt_free_coherent); +DECLARE_STATS_COUNTER(cross_page); +DECLARE_STATS_COUNTER(domain_flush_single); +DECLARE_STATS_COUNTER(domain_flush_all); +DECLARE_STATS_COUNTER(alloced_io_mem); +DECLARE_STATS_COUNTER(total_map_requests); + +static struct dentry *stats_dir; +static struct dentry *de_fflush; + +static void amd_iommu_stats_add(struct __iommu_counter *cnt) +{ +	if (stats_dir == NULL) +		return; + +	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, +				       &cnt->value); +} + +static void amd_iommu_stats_init(void) +{ +	stats_dir = debugfs_create_dir("amd-iommu", NULL); +	if (stats_dir == NULL) +		return; + +	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir, +					 (u32 *)&amd_iommu_unmap_flush); + +	amd_iommu_stats_add(&compl_wait); +	amd_iommu_stats_add(&cnt_map_single); +	amd_iommu_stats_add(&cnt_unmap_single); +	amd_iommu_stats_add(&cnt_map_sg); +	amd_iommu_stats_add(&cnt_unmap_sg); +	amd_iommu_stats_add(&cnt_alloc_coherent); +	
amd_iommu_stats_add(&cnt_free_coherent); +	amd_iommu_stats_add(&cross_page); +	amd_iommu_stats_add(&domain_flush_single); +	amd_iommu_stats_add(&domain_flush_all); +	amd_iommu_stats_add(&alloced_io_mem); +	amd_iommu_stats_add(&total_map_requests); +} + +#endif + +/**************************************************************************** + * + * Interrupt handling functions + * + ****************************************************************************/ + +static void dump_dte_entry(u16 devid) +{ +	int i; + +	for (i = 0; i < 8; ++i) +		pr_err("AMD-Vi: DTE[%d]: %08x\n", i, +			amd_iommu_dev_table[devid].data[i]); +} + +static void dump_command(unsigned long phys_addr) +{ +	struct iommu_cmd *cmd = phys_to_virt(phys_addr); +	int i; + +	for (i = 0; i < 4; ++i) +		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); +} + +static void iommu_print_event(struct amd_iommu *iommu, void *__evt) +{ +	u32 *event = __evt; +	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK; +	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; +	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; +	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; +	u64 address = (u64)(((u64)event[3]) << 32) | event[2]; + +	printk(KERN_ERR "AMD-Vi: Event logged ["); + +	switch (type) { +	case EVENT_TYPE_ILL_DEV: +		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " +		       "address=0x%016llx flags=0x%04x]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       address, flags); +		dump_dte_entry(devid); +		break; +	case EVENT_TYPE_IO_FAULT: +		printk("IO_PAGE_FAULT device=%02x:%02x.%x " +		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       domid, address, flags); +		break; +	case EVENT_TYPE_DEV_TAB_ERR: +		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " +		       "address=0x%016llx flags=0x%04x]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       address, flags); +		break; +	case EVENT_TYPE_PAGE_TAB_ERR: +		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " +		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       domid, address, flags); +		break; +	case EVENT_TYPE_ILL_CMD: +		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); +		dump_command(address); +		break; +	case EVENT_TYPE_CMD_HARD_ERR: +		printk("COMMAND_HARDWARE_ERROR address=0x%016llx " +		       "flags=0x%04x]\n", address, flags); +		break; +	case EVENT_TYPE_IOTLB_INV_TO: +		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " +		       "address=0x%016llx]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       address); +		break; +	case EVENT_TYPE_INV_DEV_REQ: +		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " +		       "address=0x%016llx flags=0x%04x]\n", +		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), +		       address, flags); +		break; +	default: +		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); +	} +} + +static void iommu_poll_events(struct amd_iommu *iommu) +{ +	u32 head, tail; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->lock, flags); + +	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); +	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + +	while (head != tail) { +		iommu_print_event(iommu, iommu->evt_buf + head); +		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; +	} + +	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + +	
spin_unlock_irqrestore(&iommu->lock, flags); +} + +irqreturn_t amd_iommu_int_thread(int irq, void *data) +{ +	struct amd_iommu *iommu; + +	for_each_iommu(iommu) +		iommu_poll_events(iommu); + +	return IRQ_HANDLED; +} + +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ +	return IRQ_WAKE_THREAD; +} + +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +static int wait_on_sem(volatile u64 *sem) +{ +	int i = 0; + +	while (*sem == 0 && i < LOOP_TIMEOUT) { +		udelay(1); +		i += 1; +	} + +	if (i == LOOP_TIMEOUT) { +		pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); +		return -EIO; +	} + +	return 0; +} + +static void copy_cmd_to_buffer(struct amd_iommu *iommu, +			       struct iommu_cmd *cmd, +			       u32 tail) +{ +	u8 *target; + +	target = iommu->cmd_buf + tail; +	tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + +	/* Copy command to buffer */ +	memcpy(target, cmd, sizeof(*cmd)); + +	/* Tell the IOMMU about it */ +	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); +} + +static void build_completion_wait(struct iommu_cmd *cmd, u64 address) +{ +	WARN_ON(address & 0x7ULL); + +	memset(cmd, 0, sizeof(*cmd)); +	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; +	cmd->data[1] = upper_32_bits(__pa(address)); +	cmd->data[2] = 1; +	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); +} + +static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) +{ +	memset(cmd, 0, sizeof(*cmd)); +	cmd->data[0] = devid; +	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); +} + +static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, +				  size_t size, u16 domid, int pde) +{ +	u64 pages; +	int s; + +	pages = iommu_num_pages(address, size, PAGE_SIZE); +	s     = 0; + +	if (pages > 1) { +		/* +		 * If we have to flush more than one page, flush all +		 * TLB entries for this domain +		 */ +		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; +		s = 1; +	} + +	address &= PAGE_MASK; + +	memset(cmd, 0, sizeof(*cmd)); +	cmd->data[1] |= domid; +	cmd->data[2]  = lower_32_bits(address); +	cmd->data[3]  = upper_32_bits(address); +	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); +	if (s) /* size bit - we flush more than one 4kb page */ +		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; +	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ +		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + +static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, +				  u64 address, size_t size) +{ +	u64 pages; +	int s; + +	pages = iommu_num_pages(address, size, PAGE_SIZE); +	s     = 0; + +	if (pages > 1) { +		/* +		 * If we have to flush more than one page, flush all +		 * TLB entries for this domain +		 */ +		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; +		s = 1; +	} + +	address &= PAGE_MASK; + +	memset(cmd, 0, sizeof(*cmd)); +	cmd->data[0]  = devid; +	cmd->data[0] |= (qdep & 0xff) << 24; +	cmd->data[1]  = devid; +	cmd->data[2]  = lower_32_bits(address); +	cmd->data[3]  = upper_32_bits(address); +	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); +	if (s) +		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; +} + +static void build_inv_all(struct iommu_cmd *cmd) +{ +	memset(cmd, 0, sizeof(*cmd)); +	CMD_SET_TYPE(cmd, CMD_INV_ALL); +} + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. 
+ */ +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) +{ +	u32 left, tail, head, next_tail; +	unsigned long flags; + +	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + +again: +	spin_lock_irqsave(&iommu->lock, flags); + +	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); +	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); +	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; +	left      = (head - next_tail) % iommu->cmd_buf_size; + +	if (left <= 2) { +		struct iommu_cmd sync_cmd; +		volatile u64 sem = 0; +		int ret; + +		build_completion_wait(&sync_cmd, (u64)&sem); +		copy_cmd_to_buffer(iommu, &sync_cmd, tail); + +		spin_unlock_irqrestore(&iommu->lock, flags); + +		if ((ret = wait_on_sem(&sem)) != 0) +			return ret; + +		goto again; +	} + +	copy_cmd_to_buffer(iommu, cmd, tail); + +	/* We need to sync now to make sure all commands are processed */ +	iommu->need_sync = true; + +	spin_unlock_irqrestore(&iommu->lock, flags); + +	return 0; +} + +/* + * This function queues a completion wait command into the command + * buffer of an IOMMU + */ +static int iommu_completion_wait(struct amd_iommu *iommu) +{ +	struct iommu_cmd cmd; +	volatile u64 sem = 0; +	int ret; + +	if (!iommu->need_sync) +		return 0; + +	build_completion_wait(&cmd, (u64)&sem); + +	ret = iommu_queue_command(iommu, &cmd); +	if (ret) +		return ret; + +	return wait_on_sem(&sem); +} + +static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) +{ +	struct iommu_cmd cmd; + +	build_inv_dte(&cmd, devid); + +	return iommu_queue_command(iommu, &cmd); +} + +static void iommu_flush_dte_all(struct amd_iommu *iommu) +{ +	u32 devid; + +	for (devid = 0; devid <= 0xffff; ++devid) +		iommu_flush_dte(iommu, devid); + +	iommu_completion_wait(iommu); +} + +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. 
+ */ +static void iommu_flush_tlb_all(struct amd_iommu *iommu) +{ +	u32 dom_id; + +	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { +		struct iommu_cmd cmd; +		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, +				      dom_id, 1); +		iommu_queue_command(iommu, &cmd); +	} + +	iommu_completion_wait(iommu); +} + +static void iommu_flush_all(struct amd_iommu *iommu) +{ +	struct iommu_cmd cmd; + +	build_inv_all(&cmd); + +	iommu_queue_command(iommu, &cmd); +	iommu_completion_wait(iommu); +} + +void iommu_flush_all_caches(struct amd_iommu *iommu) +{ +	if (iommu_feature(iommu, FEATURE_IA)) { +		iommu_flush_all(iommu); +	} else { +		iommu_flush_dte_all(iommu); +		iommu_flush_tlb_all(iommu); +	} +} + +/* + * Command send function for flushing on-device TLB + */ +static int device_flush_iotlb(struct iommu_dev_data *dev_data, +			      u64 address, size_t size) +{ +	struct amd_iommu *iommu; +	struct iommu_cmd cmd; +	int qdep; + +	qdep     = dev_data->ats.qdep; +	iommu    = amd_iommu_rlookup_table[dev_data->devid]; + +	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); + +	return iommu_queue_command(iommu, &cmd); +} + +/* + * Command send function for invalidating a device table entry + */ +static int device_flush_dte(struct iommu_dev_data *dev_data) +{ +	struct amd_iommu *iommu; +	int ret; + +	iommu = amd_iommu_rlookup_table[dev_data->devid]; + +	ret = iommu_flush_dte(iommu, dev_data->devid); +	if (ret) +		return ret; + +	if (dev_data->ats.enabled) +		ret = device_flush_iotlb(dev_data, 0, ~0UL); + +	return ret; +} + +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU. + */ +static void __domain_flush_pages(struct protection_domain *domain, +				 u64 address, size_t size, int pde) +{ +	struct iommu_dev_data *dev_data; +	struct iommu_cmd cmd; +	int ret = 0, i; + +	build_inv_iommu_pages(&cmd, address, size, domain->id, pde); + +	for (i = 0; i < amd_iommus_present; ++i) { +		if (!domain->dev_iommu[i]) +			continue; + +		/* +		 * Devices of this domain are behind this IOMMU +		 * We need a TLB flush +		 */ +		ret |= iommu_queue_command(amd_iommus[i], &cmd); +	} + +	list_for_each_entry(dev_data, &domain->dev_list, list) { + +		if (!dev_data->ats.enabled) +			continue; + +		ret |= device_flush_iotlb(dev_data, address, size); +	} + +	WARN_ON(ret); +} + +static void domain_flush_pages(struct protection_domain *domain, +			       u64 address, size_t size) +{ +	__domain_flush_pages(domain, address, size, 0); +} + +/* Flush the whole IO/TLB for a given protection domain */ +static void domain_flush_tlb(struct protection_domain *domain) +{ +	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); +} + +/* Flush the whole IO/TLB for a given protection domain - including PDE */ +static void domain_flush_tlb_pde(struct protection_domain *domain) +{ +	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); +} + +static void domain_flush_complete(struct protection_domain *domain) +{ +	int i; + +	for (i = 0; i < amd_iommus_present; ++i) { +		if (!domain->dev_iommu[i]) +			continue; + +		/* +		 * Devices of this domain are behind this IOMMU +		 * We need to wait for completion of all commands. 
+		 */ +		iommu_completion_wait(amd_iommus[i]); +	} +} + + +/* + * This function flushes the DTEs for all devices in domain + */ +static void domain_flush_devices(struct protection_domain *domain) +{ +	struct iommu_dev_data *dev_data; +	unsigned long flags; + +	spin_lock_irqsave(&domain->lock, flags); + +	list_for_each_entry(dev_data, &domain->dev_list, list) +		device_flush_dte(dev_data); + +	spin_unlock_irqrestore(&domain->lock, flags); +} + +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, +				   gfp_t gfp) +{ +	u64 *pte; + +	if (domain->mode == PAGE_MODE_6_LEVEL) +		/* address space already 64 bit large */ +		return false; + +	pte = (void *)get_zeroed_page(gfp); +	if (!pte) +		return false; + +	*pte             = PM_LEVEL_PDE(domain->mode, +					virt_to_phys(domain->pt_root)); +	domain->pt_root  = pte; +	domain->mode    += 1; +	domain->updated  = true; + +	return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, +		      unsigned long address, +		      unsigned long page_size, +		      u64 **pte_page, +		      gfp_t gfp) +{ +	int level, end_lvl; +	u64 *pte, *page; + +	BUG_ON(!is_power_of_2(page_size)); + +	while (address > PM_LEVEL_SIZE(domain->mode)) +		increase_address_space(domain, gfp); + +	level   = domain->mode - 1; +	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; +	address = PAGE_SIZE_ALIGN(address, page_size); +	end_lvl = PAGE_SIZE_LEVEL(page_size); + +	while (level > end_lvl) { +		if (!IOMMU_PTE_PRESENT(*pte)) { +			page = (u64 *)get_zeroed_page(gfp); +			if (!page) +				return NULL; +			*pte = PM_LEVEL_PDE(level, virt_to_phys(page)); +		} + +		/* No level skipping support yet */ +		if (PM_PTE_LEVEL(*pte) != level) +			return NULL; + +		level -= 1; + +		pte = IOMMU_PTE_PAGE(*pte); + +		if (pte_page && level == end_lvl) +			*pte_page = pte; + +		pte = &pte[PM_LEVEL_INDEX(level, address)]; +	} + +	return pte; +} + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, unsigned long address) +{ +	int level; +	u64 *pte; + +	if (address > PM_LEVEL_SIZE(domain->mode)) +		return NULL; + +	level   =  domain->mode - 1; +	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + +	while (level > 0) { + +		/* Not Present */ +		if (!IOMMU_PTE_PRESENT(*pte)) +			return NULL; + +		/* Large PTE */ +		if (PM_PTE_LEVEL(*pte) == 0x07) { +			unsigned long pte_mask, __pte; + +			/* +			 * If we have a series of large PTEs, make +			 * sure to return a pointer to the first one. +			 */ +			pte_mask = PTE_PAGE_SIZE(*pte); +			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); +			__pte    = ((unsigned long)pte) & pte_mask; + +			return (u64 *)__pte; +		} + +		/* No level skipping support yet */ +		if (PM_PTE_LEVEL(*pte) != level) +			return NULL; + +		level -= 1; + +		/* Walk to the next level */ +		pte = IOMMU_PTE_PAGE(*pte); +		pte = &pte[PM_LEVEL_INDEX(level, address)]; +	} + +	return pte; +} + +/* + * Generic mapping functions. 
It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ +static int iommu_map_page(struct protection_domain *dom, +			  unsigned long bus_addr, +			  unsigned long phys_addr, +			  int prot, +			  unsigned long page_size) +{ +	u64 __pte, *pte; +	int i, count; + +	if (!(prot & IOMMU_PROT_MASK)) +		return -EINVAL; + +	bus_addr  = PAGE_ALIGN(bus_addr); +	phys_addr = PAGE_ALIGN(phys_addr); +	count     = PAGE_SIZE_PTE_COUNT(page_size); +	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL); + +	for (i = 0; i < count; ++i) +		if (IOMMU_PTE_PRESENT(pte[i])) +			return -EBUSY; + +	if (page_size > PAGE_SIZE) { +		__pte = PAGE_SIZE_PTE(phys_addr, page_size); +		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; +	} else +		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; + +	if (prot & IOMMU_PROT_IR) +		__pte |= IOMMU_PTE_IR; +	if (prot & IOMMU_PROT_IW) +		__pte |= IOMMU_PTE_IW; + +	for (i = 0; i < count; ++i) +		pte[i] = __pte; + +	update_domain(dom); + +	return 0; +} + +static unsigned long iommu_unmap_page(struct protection_domain *dom, +				      unsigned long bus_addr, +				      unsigned long page_size) +{ +	unsigned long long unmap_size, unmapped; +	u64 *pte; + +	BUG_ON(!is_power_of_2(page_size)); + +	unmapped = 0; + +	while (unmapped < page_size) { + +		pte = fetch_pte(dom, bus_addr); + +		if (!pte) { +			/* +			 * No PTE for this address +			 * move forward in 4kb steps +			 */ +			unmap_size = PAGE_SIZE; +		} else if (PM_PTE_LEVEL(*pte) == 0) { +			/* 4kb PTE found for this address */ +			unmap_size = PAGE_SIZE; +			*pte       = 0ULL; +		} else { +			int count, i; + +			/* Large PTE found which maps this address */ +			unmap_size = PTE_PAGE_SIZE(*pte); +			count      = PAGE_SIZE_PTE_COUNT(unmap_size); +			for (i = 0; i < count; i++) +				pte[i] = 0ULL; +		} + +		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size; +		unmapped += unmap_size; +	} + +	BUG_ON(!is_power_of_2(unmapped)); + +	return unmapped; +} + +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ +static int iommu_for_unity_map(struct amd_iommu *iommu, +			       struct unity_map_entry *entry) +{ +	u16 bdf, i; + +	for (i = entry->devid_start; i <= entry->devid_end; ++i) { +		bdf = amd_iommu_alias_table[i]; +		if (amd_iommu_rlookup_table[bdf] == iommu) +			return 1; +	} + +	return 0; +} + +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ +static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, +			     struct unity_map_entry *e) +{ +	u64 addr; +	int ret; + +	for (addr = e->address_start; addr < e->address_end; +	     addr += PAGE_SIZE) { +		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, +				     PAGE_SIZE); +		if (ret) +			return ret; +		/* +		 * if unity mapping is in aperture range mark the page +		 * as allocated in the aperture +		 */ +		if (addr < dma_dom->aperture_size) +			__set_bit(addr >> PAGE_SHIFT, +				  dma_dom->aperture[0]->bitmap); +	} + +	return 0; +} + +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. 
+ */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ +	struct unity_map_entry *entry; +	int ret; + +	list_for_each_entry(entry, &amd_iommu_unity_map, list) { +		if (!iommu_for_unity_map(iommu, entry)) +			continue; +		ret = dma_ops_unity_map(iommu->default_dom, entry); +		if (ret) +			return ret; +	} + +	return 0; +} + +/* + * Inits the unity mappings required for a specific device + */ +static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, +					  u16 devid) +{ +	struct unity_map_entry *e; +	int ret; + +	list_for_each_entry(e, &amd_iommu_unity_map, list) { +		if (!(devid >= e->devid_start && devid <= e->devid_end)) +			continue; +		ret = dma_ops_unity_map(dma_dom, e); +		if (ret) +			return ret; +	} + +	return 0; +} + +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. Its basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ + +/* + * The address allocator core functions. + * + * called with domain->lock held + */ + +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, +				      unsigned long start_page, +				      unsigned int pages) +{ +	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; + +	if (start_page + pages > last_page) +		pages = last_page - start_page; + +	for (i = start_page; i < start_page + pages; ++i) { +		int index = i / APERTURE_RANGE_PAGES; +		int page  = i % APERTURE_RANGE_PAGES; +		__set_bit(page, dom->aperture[index]->bitmap); +	} +} + +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. 
+ */ +static int alloc_new_range(struct dma_ops_domain *dma_dom, +			   bool populate, gfp_t gfp) +{ +	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; +	struct amd_iommu *iommu; +	unsigned long i, old_size; + +#ifdef CONFIG_IOMMU_STRESS +	populate = false; +#endif + +	if (index >= APERTURE_MAX_RANGES) +		return -ENOMEM; + +	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); +	if (!dma_dom->aperture[index]) +		return -ENOMEM; + +	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); +	if (!dma_dom->aperture[index]->bitmap) +		goto out_free; + +	dma_dom->aperture[index]->offset = dma_dom->aperture_size; + +	if (populate) { +		unsigned long address = dma_dom->aperture_size; +		int i, num_ptes = APERTURE_RANGE_PAGES / 512; +		u64 *pte, *pte_page; + +		for (i = 0; i < num_ptes; ++i) { +			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE, +					&pte_page, gfp); +			if (!pte) +				goto out_free; + +			dma_dom->aperture[index]->pte_pages[i] = pte_page; + +			address += APERTURE_RANGE_SIZE / 64; +		} +	} + +	old_size                = dma_dom->aperture_size; +	dma_dom->aperture_size += APERTURE_RANGE_SIZE; + +	/* Reserve address range used for MSI messages */ +	if (old_size < MSI_ADDR_BASE_LO && +	    dma_dom->aperture_size > MSI_ADDR_BASE_LO) { +		unsigned long spage; +		int pages; + +		pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE); +		spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT; + +		dma_ops_reserve_addresses(dma_dom, spage, pages); +	} + +	/* Initialize the exclusion range if necessary */ +	for_each_iommu(iommu) { +		if (iommu->exclusion_start && +		    iommu->exclusion_start >= dma_dom->aperture[index]->offset +		    && iommu->exclusion_start < dma_dom->aperture_size) { +			unsigned long startpage; +			int pages = iommu_num_pages(iommu->exclusion_start, +						    iommu->exclusion_length, +						    PAGE_SIZE); +			startpage = iommu->exclusion_start >> PAGE_SHIFT; +			dma_ops_reserve_addresses(dma_dom, startpage, pages); +		} +	} + +	/* +	 * Check for areas already mapped as present in the new aperture +	 * range and mark those pages as reserved in the allocator. Such +	 * mappings may already exist as a result of requested unity +	 * mappings for devices. 
+	 */ +	for (i = dma_dom->aperture[index]->offset; +	     i < dma_dom->aperture_size; +	     i += PAGE_SIZE) { +		u64 *pte = fetch_pte(&dma_dom->domain, i); +		if (!pte || !IOMMU_PTE_PRESENT(*pte)) +			continue; + +		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); +	} + +	update_domain(&dma_dom->domain); + +	return 0; + +out_free: +	update_domain(&dma_dom->domain); + +	free_page((unsigned long)dma_dom->aperture[index]->bitmap); + +	kfree(dma_dom->aperture[index]); +	dma_dom->aperture[index] = NULL; + +	return -ENOMEM; +} + +static unsigned long dma_ops_area_alloc(struct device *dev, +					struct dma_ops_domain *dom, +					unsigned int pages, +					unsigned long align_mask, +					u64 dma_mask, +					unsigned long start) +{ +	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; +	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; +	int i = start >> APERTURE_RANGE_SHIFT; +	unsigned long boundary_size; +	unsigned long address = -1; +	unsigned long limit; + +	next_bit >>= PAGE_SHIFT; + +	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, +			PAGE_SIZE) >> PAGE_SHIFT; + +	for (;i < max_index; ++i) { +		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + +		if (dom->aperture[i]->offset >= dma_mask) +			break; + +		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, +					       dma_mask >> PAGE_SHIFT); + +		address = iommu_area_alloc(dom->aperture[i]->bitmap, +					   limit, next_bit, pages, 0, +					    boundary_size, align_mask); +		if (address != -1) { +			address = dom->aperture[i]->offset + +				  (address << PAGE_SHIFT); +			dom->next_address = address + (pages << PAGE_SHIFT); +			break; +		} + +		next_bit = 0; +	} + +	return address; +} + +static unsigned long dma_ops_alloc_addresses(struct device *dev, +					     struct dma_ops_domain *dom, +					     unsigned int pages, +					     unsigned long align_mask, +					     u64 dma_mask) +{ +	unsigned long address; + +#ifdef CONFIG_IOMMU_STRESS +	dom->next_address = 0; +	dom->need_flush = true; +#endif + +	address = dma_ops_area_alloc(dev, dom, pages, align_mask, +				     dma_mask, dom->next_address); + +	if (address == -1) { +		dom->next_address = 0; +		address = dma_ops_area_alloc(dev, dom, pages, align_mask, +					     dma_mask, 0); +		dom->need_flush = true; +	} + +	if (unlikely(address == -1)) +		address = DMA_ERROR_CODE; + +	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + +	return address; +} + +/* + * The address free function. + * + * called with domain->lock held + */ +static void dma_ops_free_addresses(struct dma_ops_domain *dom, +				   unsigned long address, +				   unsigned int pages) +{ +	unsigned i = address >> APERTURE_RANGE_SHIFT; +	struct aperture_range *range = dom->aperture[i]; + +	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); + +#ifdef CONFIG_IOMMU_STRESS +	if (i < 4) +		return; +#endif + +	if (address >= dom->next_address) +		dom->need_flush = true; + +	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + +	bitmap_clear(range->bitmap, address, pages); + +} + +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device get its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. 
+ * + ****************************************************************************/ + +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ +	unsigned long flags; + +	spin_lock_irqsave(&amd_iommu_pd_lock, flags); +	list_add(&domain->list, &amd_iommu_pd_list); +	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ +	unsigned long flags; + +	spin_lock_irqsave(&amd_iommu_pd_lock, flags); +	list_del(&domain->list); +	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +static u16 domain_id_alloc(void) +{ +	unsigned long flags; +	int id; + +	write_lock_irqsave(&amd_iommu_devtable_lock, flags); +	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); +	BUG_ON(id == 0); +	if (id > 0 && id < MAX_DOMAIN_ID) +		__set_bit(id, amd_iommu_pd_alloc_bitmap); +	else +		id = 0; +	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + +	return id; +} + +static void domain_id_free(int id) +{ +	unsigned long flags; + +	write_lock_irqsave(&amd_iommu_devtable_lock, flags); +	if (id > 0 && id < MAX_DOMAIN_ID) +		__clear_bit(id, amd_iommu_pd_alloc_bitmap); +	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static void free_pagetable(struct protection_domain *domain) +{ +	int i, j; +	u64 *p1, *p2, *p3; + +	p1 = domain->pt_root; + +	if (!p1) +		return; + +	for (i = 0; i < 512; ++i) { +		if (!IOMMU_PTE_PRESENT(p1[i])) +			continue; + +		p2 = IOMMU_PTE_PAGE(p1[i]); +		for (j = 0; j < 512; ++j) { +			if (!IOMMU_PTE_PRESENT(p2[j])) +				continue; +			p3 = IOMMU_PTE_PAGE(p2[j]); +			free_page((unsigned long)p3); +		} + +		free_page((unsigned long)p2); +	} + +	free_page((unsigned long)p1); + +	domain->pt_root = NULL; +} + +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ +static void dma_ops_domain_free(struct dma_ops_domain *dom) +{ +	int i; + +	if (!dom) +		return; + +	del_domain_from_list(&dom->domain); + +	free_pagetable(&dom->domain); + +	for (i = 0; i < APERTURE_MAX_RANGES; ++i) { +		if (!dom->aperture[i]) +			continue; +		free_page((unsigned long)dom->aperture[i]->bitmap); +		kfree(dom->aperture[i]); +	} + +	kfree(dom); +} + +/* + * Allocates a new protection domain usable for the dma_ops functions. + * It also initializes the page table and the address allocator data + * structures required for the dma_ops interface + */ +static struct dma_ops_domain *dma_ops_domain_alloc(void) +{ +	struct dma_ops_domain *dma_dom; + +	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); +	if (!dma_dom) +		return NULL; + +	spin_lock_init(&dma_dom->domain.lock); + +	dma_dom->domain.id = domain_id_alloc(); +	if (dma_dom->domain.id == 0) +		goto free_dma_dom; +	INIT_LIST_HEAD(&dma_dom->domain.dev_list); +	dma_dom->domain.mode = PAGE_MODE_2_LEVEL; +	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); +	dma_dom->domain.flags = PD_DMA_OPS_MASK; +	dma_dom->domain.priv = dma_dom; +	if (!dma_dom->domain.pt_root) +		goto free_dma_dom; + +	dma_dom->need_flush = false; +	dma_dom->target_dev = 0xffff; + +	add_domain_to_list(&dma_dom->domain); + +	if (alloc_new_range(dma_dom, true, GFP_KERNEL)) +		goto free_dma_dom; + +	/* +	 * mark the first page as allocated so we never return 0 as +	 * a valid dma-address. 
So we can use 0 as error value +	 */ +	dma_dom->aperture[0]->bitmap[0] = 1; +	dma_dom->next_address = 0; + + +	return dma_dom; + +free_dma_dom: +	dma_ops_domain_free(dma_dom); + +	return NULL; +} + +/* + * little helper function to check whether a given protection domain is a + * dma_ops domain + */ +static bool dma_ops_domain(struct protection_domain *domain) +{ +	return domain->flags & PD_DMA_OPS_MASK; +} + +static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) +{ +	u64 pte_root = virt_to_phys(domain->pt_root); +	u32 flags = 0; + +	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) +		    << DEV_ENTRY_MODE_SHIFT; +	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + +	if (ats) +		flags |= DTE_FLAG_IOTLB; + +	amd_iommu_dev_table[devid].data[3] |= flags; +	amd_iommu_dev_table[devid].data[2]  = domain->id; +	amd_iommu_dev_table[devid].data[1]  = upper_32_bits(pte_root); +	amd_iommu_dev_table[devid].data[0]  = lower_32_bits(pte_root); +} + +static void clear_dte_entry(u16 devid) +{ +	/* remove entry from the device table seen by the hardware */ +	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; +	amd_iommu_dev_table[devid].data[1] = 0; +	amd_iommu_dev_table[devid].data[2] = 0; + +	amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct iommu_dev_data *dev_data, +		      struct protection_domain *domain) +{ +	struct amd_iommu *iommu; +	bool ats; + +	iommu = amd_iommu_rlookup_table[dev_data->devid]; +	ats   = dev_data->ats.enabled; + +	/* Update data structures */ +	dev_data->domain = domain; +	list_add(&dev_data->list, &domain->dev_list); +	set_dte_entry(dev_data->devid, domain, ats); + +	/* Do reference counting */ +	domain->dev_iommu[iommu->index] += 1; +	domain->dev_cnt                 += 1; + +	/* Flush the DTE entry */ +	device_flush_dte(dev_data); +} + +static void do_detach(struct iommu_dev_data *dev_data) +{ +	struct amd_iommu *iommu; + +	iommu = amd_iommu_rlookup_table[dev_data->devid]; + +	/* decrease reference counters */ +	dev_data->domain->dev_iommu[iommu->index] -= 1; +	dev_data->domain->dev_cnt                 -= 1; + +	/* Update data structures */ +	dev_data->domain = NULL; +	list_del(&dev_data->list); +	clear_dte_entry(dev_data->devid); + +	/* Flush the DTE entry */ +	device_flush_dte(dev_data); +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static int __attach_device(struct iommu_dev_data *dev_data, +			   struct protection_domain *domain) +{ +	int ret; + +	/* lock domain */ +	spin_lock(&domain->lock); + +	if (dev_data->alias_data != NULL) { +		struct iommu_dev_data *alias_data = dev_data->alias_data; + +		/* Some sanity checks */ +		ret = -EBUSY; +		if (alias_data->domain != NULL && +				alias_data->domain != domain) +			goto out_unlock; + +		if (dev_data->domain != NULL && +				dev_data->domain != domain) +			goto out_unlock; + +		/* Do real assignment */ +		if (alias_data->domain == NULL) +			do_attach(alias_data, domain); + +		atomic_inc(&alias_data->bind); +	} + +	if (dev_data->domain == NULL) +		do_attach(dev_data, domain); + +	atomic_inc(&dev_data->bind); + +	ret = 0; + +out_unlock: + +	/* ready */ +	spin_unlock(&domain->lock); + +	return ret; +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static int attach_device(struct device *dev, +			 struct protection_domain *domain) +{ +	struct pci_dev *pdev = to_pci_dev(dev); +	struct 
iommu_dev_data *dev_data; +	unsigned long flags; +	int ret; + +	dev_data = get_dev_data(dev); + +	if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { +		dev_data->ats.enabled = true; +		dev_data->ats.qdep    = pci_ats_queue_depth(pdev); +	} + +	write_lock_irqsave(&amd_iommu_devtable_lock, flags); +	ret = __attach_device(dev_data, domain); +	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + +	/* +	 * We might boot into a crash-kernel here. The crashed kernel +	 * left the caches in the IOMMU dirty. So we have to flush +	 * here to evict all dirty stuff. +	 */ +	domain_flush_tlb_pde(domain); + +	return ret; +} + +/* + * Removes a device from a protection domain (unlocked) + */ +static void __detach_device(struct iommu_dev_data *dev_data) +{ +	struct protection_domain *domain; +	unsigned long flags; + +	BUG_ON(!dev_data->domain); + +	domain = dev_data->domain; + +	spin_lock_irqsave(&domain->lock, flags); + +	if (dev_data->alias_data != NULL) { +		struct iommu_dev_data *alias_data = dev_data->alias_data; + +		if (atomic_dec_and_test(&alias_data->bind)) +			do_detach(alias_data); +	} + +	if (atomic_dec_and_test(&dev_data->bind)) +		do_detach(dev_data); + +	spin_unlock_irqrestore(&domain->lock, flags); + +	/* +	 * If we run in passthrough mode the device must be assigned to the +	 * passthrough domain if it is detached from any other domain. +	 * Make sure we can deassign from the pt_domain itself. +	 */ +	if (iommu_pass_through && +	    (dev_data->domain == NULL && domain != pt_domain)) +		__attach_device(dev_data, pt_domain); +} + +/* + * Removes a device from a protection domain (with devtable_lock held) + */ +static void detach_device(struct device *dev) +{ +	struct iommu_dev_data *dev_data; +	unsigned long flags; + +	dev_data = get_dev_data(dev); + +	/* lock device table */ +	write_lock_irqsave(&amd_iommu_devtable_lock, flags); +	__detach_device(dev_data); +	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + +	if (dev_data->ats.enabled) { +		pci_disable_ats(to_pci_dev(dev)); +		dev_data->ats.enabled = false; +	} +} + +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. 
+ */ +static struct protection_domain *domain_for_device(struct device *dev) +{ +	struct iommu_dev_data *dev_data; +	struct protection_domain *dom = NULL; +	unsigned long flags; + +	dev_data   = get_dev_data(dev); + +	if (dev_data->domain) +		return dev_data->domain; + +	if (dev_data->alias_data != NULL) { +		struct iommu_dev_data *alias_data = dev_data->alias_data; + +		read_lock_irqsave(&amd_iommu_devtable_lock, flags); +		if (alias_data->domain != NULL) { +			__attach_device(dev_data, alias_data->domain); +			dom = alias_data->domain; +		} +		read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +	} + +	return dom; +} + +static int device_change_notifier(struct notifier_block *nb, +				  unsigned long action, void *data) +{ +	struct device *dev = data; +	u16 devid; +	struct protection_domain *domain; +	struct dma_ops_domain *dma_domain; +	struct amd_iommu *iommu; +	unsigned long flags; + +	if (!check_device(dev)) +		return 0; + +	devid  = get_device_id(dev); +	iommu  = amd_iommu_rlookup_table[devid]; + +	switch (action) { +	case BUS_NOTIFY_UNBOUND_DRIVER: + +		domain = domain_for_device(dev); + +		if (!domain) +			goto out; +		if (iommu_pass_through) +			break; +		detach_device(dev); +		break; +	case BUS_NOTIFY_ADD_DEVICE: + +		iommu_init_device(dev); + +		domain = domain_for_device(dev); + +		/* allocate a protection domain if a device is added */ +		dma_domain = find_protection_domain(devid); +		if (dma_domain) +			goto out; +		dma_domain = dma_ops_domain_alloc(); +		if (!dma_domain) +			goto out; +		dma_domain->target_dev = devid; + +		spin_lock_irqsave(&iommu_pd_list_lock, flags); +		list_add_tail(&dma_domain->list, &iommu_pd_list); +		spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + +		break; +	case BUS_NOTIFY_DEL_DEVICE: + +		iommu_uninit_device(dev); + +	default: +		goto out; +	} + +	iommu_completion_wait(iommu); + +out: +	return 0; +} + +static struct notifier_block device_nb = { +	.notifier_call = device_change_notifier, +}; + +void amd_iommu_init_notifier(void) +{ +	bus_register_notifier(&pci_bus_type, &device_nb); +} + +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. 
+ */ +static struct protection_domain *get_domain(struct device *dev) +{ +	struct protection_domain *domain; +	struct dma_ops_domain *dma_dom; +	u16 devid = get_device_id(dev); + +	if (!check_device(dev)) +		return ERR_PTR(-EINVAL); + +	domain = domain_for_device(dev); +	if (domain != NULL && !dma_ops_domain(domain)) +		return ERR_PTR(-EBUSY); + +	if (domain != NULL) +		return domain; + +	/* Device not bount yet - bind it */ +	dma_dom = find_protection_domain(devid); +	if (!dma_dom) +		dma_dom = amd_iommu_rlookup_table[devid]->default_dom; +	attach_device(dev, &dma_dom->domain); +	DUMP_printk("Using protection domain %d for device %s\n", +		    dma_dom->domain.id, dev_name(dev)); + +	return &dma_dom->domain; +} + +static void update_device_table(struct protection_domain *domain) +{ +	struct iommu_dev_data *dev_data; + +	list_for_each_entry(dev_data, &domain->dev_list, list) +		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled); +} + +static void update_domain(struct protection_domain *domain) +{ +	if (!domain->updated) +		return; + +	update_device_table(domain); + +	domain_flush_devices(domain); +	domain_flush_tlb_pde(domain); + +	domain->updated = false; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, +			    unsigned long address) +{ +	struct aperture_range *aperture; +	u64 *pte, *pte_page; + +	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; +	if (!aperture) +		return NULL; + +	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; +	if (!pte) { +		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page, +				GFP_ATOMIC); +		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; +	} else +		pte += PM_LEVEL_INDEX(0, address); + +	update_domain(&dom->domain); + +	return pte; +} + +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, +				     unsigned long address, +				     phys_addr_t paddr, +				     int direction) +{ +	u64 *pte, __pte; + +	WARN_ON(address > dom->aperture_size); + +	paddr &= PAGE_MASK; + +	pte  = dma_ops_get_pte(dom, address); +	if (!pte) +		return DMA_ERROR_CODE; + +	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + +	if (direction == DMA_TO_DEVICE) +		__pte |= IOMMU_PTE_IR; +	else if (direction == DMA_FROM_DEVICE) +		__pte |= IOMMU_PTE_IW; +	else if (direction == DMA_BIDIRECTIONAL) +		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; + +	WARN_ON(*pte); + +	*pte = __pte; + +	return (dma_addr_t)address; +} + +/* + * The generic unmapping function for on page in the DMA address space. + */ +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, +				 unsigned long address) +{ +	struct aperture_range *aperture; +	u64 *pte; + +	if (address >= dom->aperture_size) +		return; + +	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; +	if (!aperture) +		return; + +	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; +	if (!pte) +		return; + +	pte += PM_LEVEL_INDEX(0, address); + +	WARN_ON(!*pte); + +	*pte = 0ULL; +} + +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is used by all + * mapping functions provided with this IOMMU driver. + * Must be called with the domain lock held. 
+ */ +static dma_addr_t __map_single(struct device *dev, +			       struct dma_ops_domain *dma_dom, +			       phys_addr_t paddr, +			       size_t size, +			       int dir, +			       bool align, +			       u64 dma_mask) +{ +	dma_addr_t offset = paddr & ~PAGE_MASK; +	dma_addr_t address, start, ret; +	unsigned int pages; +	unsigned long align_mask = 0; +	int i; + +	pages = iommu_num_pages(paddr, size, PAGE_SIZE); +	paddr &= PAGE_MASK; + +	INC_STATS_COUNTER(total_map_requests); + +	if (pages > 1) +		INC_STATS_COUNTER(cross_page); + +	if (align) +		align_mask = (1UL << get_order(size)) - 1; + +retry: +	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, +					  dma_mask); +	if (unlikely(address == DMA_ERROR_CODE)) { +		/* +		 * setting next_address here will let the address +		 * allocator only scan the new allocated range in the +		 * first run. This is a small optimization. +		 */ +		dma_dom->next_address = dma_dom->aperture_size; + +		if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) +			goto out; + +		/* +		 * aperture was successfully enlarged by 128 MB, try +		 * allocation again +		 */ +		goto retry; +	} + +	start = address; +	for (i = 0; i < pages; ++i) { +		ret = dma_ops_domain_map(dma_dom, start, paddr, dir); +		if (ret == DMA_ERROR_CODE) +			goto out_unmap; + +		paddr += PAGE_SIZE; +		start += PAGE_SIZE; +	} +	address += offset; + +	ADD_STATS_COUNTER(alloced_io_mem, size); + +	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { +		domain_flush_tlb(&dma_dom->domain); +		dma_dom->need_flush = false; +	} else if (unlikely(amd_iommu_np_cache)) +		domain_flush_pages(&dma_dom->domain, address, size); + +out: +	return address; + +out_unmap: + +	for (--i; i >= 0; --i) { +		start -= PAGE_SIZE; +		dma_ops_domain_unmap(dma_dom, start); +	} + +	dma_ops_free_addresses(dma_dom, address, pages); + +	return DMA_ERROR_CODE; +} + +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ +static void __unmap_single(struct dma_ops_domain *dma_dom, +			   dma_addr_t dma_addr, +			   size_t size, +			   int dir) +{ +	dma_addr_t flush_addr; +	dma_addr_t i, start; +	unsigned int pages; + +	if ((dma_addr == DMA_ERROR_CODE) || +	    (dma_addr + size > dma_dom->aperture_size)) +		return; + +	flush_addr = dma_addr; +	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); +	dma_addr &= PAGE_MASK; +	start = dma_addr; + +	for (i = 0; i < pages; ++i) { +		dma_ops_domain_unmap(dma_dom, start); +		start += PAGE_SIZE; +	} + +	SUB_STATS_COUNTER(alloced_io_mem, size); + +	dma_ops_free_addresses(dma_dom, dma_addr, pages); + +	if (amd_iommu_unmap_flush || dma_dom->need_flush) { +		domain_flush_pages(&dma_dom->domain, flush_addr, size); +		dma_dom->need_flush = false; +	} +} + +/* + * The exported map_single function for dma_ops. 
+ */ +static dma_addr_t map_page(struct device *dev, struct page *page, +			   unsigned long offset, size_t size, +			   enum dma_data_direction dir, +			   struct dma_attrs *attrs) +{ +	unsigned long flags; +	struct protection_domain *domain; +	dma_addr_t addr; +	u64 dma_mask; +	phys_addr_t paddr = page_to_phys(page) + offset; + +	INC_STATS_COUNTER(cnt_map_single); + +	domain = get_domain(dev); +	if (PTR_ERR(domain) == -EINVAL) +		return (dma_addr_t)paddr; +	else if (IS_ERR(domain)) +		return DMA_ERROR_CODE; + +	dma_mask = *dev->dma_mask; + +	spin_lock_irqsave(&domain->lock, flags); + +	addr = __map_single(dev, domain->priv, paddr, size, dir, false, +			    dma_mask); +	if (addr == DMA_ERROR_CODE) +		goto out; + +	domain_flush_complete(domain); + +out: +	spin_unlock_irqrestore(&domain->lock, flags); + +	return addr; +} + +/* + * The exported unmap_single function for dma_ops. + */ +static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, +		       enum dma_data_direction dir, struct dma_attrs *attrs) +{ +	unsigned long flags; +	struct protection_domain *domain; + +	INC_STATS_COUNTER(cnt_unmap_single); + +	domain = get_domain(dev); +	if (IS_ERR(domain)) +		return; + +	spin_lock_irqsave(&domain->lock, flags); + +	__unmap_single(domain->priv, dma_addr, size, dir); + +	domain_flush_complete(domain); + +	spin_unlock_irqrestore(&domain->lock, flags); +} + +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ +static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, +			   int nelems, int dir) +{ +	struct scatterlist *s; +	int i; + +	for_each_sg(sglist, s, nelems, i) { +		s->dma_address = (dma_addr_t)sg_phys(s); +		s->dma_length  = s->length; +	} + +	return nelems; +} + +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ +static int map_sg(struct device *dev, struct scatterlist *sglist, +		  int nelems, enum dma_data_direction dir, +		  struct dma_attrs *attrs) +{ +	unsigned long flags; +	struct protection_domain *domain; +	int i; +	struct scatterlist *s; +	phys_addr_t paddr; +	int mapped_elems = 0; +	u64 dma_mask; + +	INC_STATS_COUNTER(cnt_map_sg); + +	domain = get_domain(dev); +	if (PTR_ERR(domain) == -EINVAL) +		return map_sg_no_iommu(dev, sglist, nelems, dir); +	else if (IS_ERR(domain)) +		return 0; + +	dma_mask = *dev->dma_mask; + +	spin_lock_irqsave(&domain->lock, flags); + +	for_each_sg(sglist, s, nelems, i) { +		paddr = sg_phys(s); + +		s->dma_address = __map_single(dev, domain->priv, +					      paddr, s->length, dir, false, +					      dma_mask); + +		if (s->dma_address) { +			s->dma_length = s->length; +			mapped_elems++; +		} else +			goto unmap; +	} + +	domain_flush_complete(domain); + +out: +	spin_unlock_irqrestore(&domain->lock, flags); + +	return mapped_elems; +unmap: +	for_each_sg(sglist, s, mapped_elems, i) { +		if (s->dma_address) +			__unmap_single(domain->priv, s->dma_address, +				       s->dma_length, dir); +		s->dma_address = s->dma_length = 0; +	} + +	mapped_elems = 0; + +	goto out; +} + +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). 
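A driver-side sketch that exercises map_sg() above and unmap_sg() below; pdev,
buf0/buf1 and their lengths are hypothetical:

	struct scatterlist sg[2];
	int nents;

	sg_init_table(sg, ARRAY_SIZE(sg));
	sg_set_buf(&sg[0], buf0, len0);
	sg_set_buf(&sg[1], buf1, len1);

	nents = dma_map_sg(&pdev->dev, sg, ARRAY_SIZE(sg), DMA_FROM_DEVICE);
	if (!nents)
		return -EIO;

	/* ... the device fills the buffers ... */

	dma_unmap_sg(&pdev->dev, sg, ARRAY_SIZE(sg), DMA_FROM_DEVICE);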
+ */ +static void unmap_sg(struct device *dev, struct scatterlist *sglist, +		     int nelems, enum dma_data_direction dir, +		     struct dma_attrs *attrs) +{ +	unsigned long flags; +	struct protection_domain *domain; +	struct scatterlist *s; +	int i; + +	INC_STATS_COUNTER(cnt_unmap_sg); + +	domain = get_domain(dev); +	if (IS_ERR(domain)) +		return; + +	spin_lock_irqsave(&domain->lock, flags); + +	for_each_sg(sglist, s, nelems, i) { +		__unmap_single(domain->priv, s->dma_address, +			       s->dma_length, dir); +		s->dma_address = s->dma_length = 0; +	} + +	domain_flush_complete(domain); + +	spin_unlock_irqrestore(&domain->lock, flags); +} + +/* + * The exported alloc_coherent function for dma_ops. + */ +static void *alloc_coherent(struct device *dev, size_t size, +			    dma_addr_t *dma_addr, gfp_t flag) +{ +	unsigned long flags; +	void *virt_addr; +	struct protection_domain *domain; +	phys_addr_t paddr; +	u64 dma_mask = dev->coherent_dma_mask; + +	INC_STATS_COUNTER(cnt_alloc_coherent); + +	domain = get_domain(dev); +	if (PTR_ERR(domain) == -EINVAL) { +		virt_addr = (void *)__get_free_pages(flag, get_order(size)); +		*dma_addr = __pa(virt_addr); +		return virt_addr; +	} else if (IS_ERR(domain)) +		return NULL; + +	dma_mask  = dev->coherent_dma_mask; +	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); +	flag     |= __GFP_ZERO; + +	virt_addr = (void *)__get_free_pages(flag, get_order(size)); +	if (!virt_addr) +		return NULL; + +	paddr = virt_to_phys(virt_addr); + +	if (!dma_mask) +		dma_mask = *dev->dma_mask; + +	spin_lock_irqsave(&domain->lock, flags); + +	*dma_addr = __map_single(dev, domain->priv, paddr, +				 size, DMA_BIDIRECTIONAL, true, dma_mask); + +	if (*dma_addr == DMA_ERROR_CODE) { +		spin_unlock_irqrestore(&domain->lock, flags); +		goto out_free; +	} + +	domain_flush_complete(domain); + +	spin_unlock_irqrestore(&domain->lock, flags); + +	return virt_addr; + +out_free: + +	free_pages((unsigned long)virt_addr, get_order(size)); + +	return NULL; +} + +/* + * The exported free_coherent function for dma_ops. + */ +static void free_coherent(struct device *dev, size_t size, +			  void *virt_addr, dma_addr_t dma_addr) +{ +	unsigned long flags; +	struct protection_domain *domain; + +	INC_STATS_COUNTER(cnt_free_coherent); + +	domain = get_domain(dev); +	if (IS_ERR(domain)) +		goto free_mem; + +	spin_lock_irqsave(&domain->lock, flags); + +	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + +	domain_flush_complete(domain); + +	spin_unlock_irqrestore(&domain->lock, flags); + +free_mem: +	free_pages((unsigned long)virt_addr, get_order(size)); +} + +/* + * This function is called by the DMA layer to find out if we can handle a + * particular device. It is part of the dma_ops. + */ +static int amd_iommu_dma_supported(struct device *dev, u64 mask) +{ +	return check_device(dev); +} + +/* + * The function for pre-allocating protection domains. + * + * If the driver core informs the DMA layer if a driver grabs a device + * we don't need to preallocate the protection domains anymore. + * For now we have to. + */ +static void prealloc_protection_domains(void) +{ +	struct pci_dev *dev = NULL; +	struct dma_ops_domain *dma_dom; +	u16 devid; + +	for_each_pci_dev(dev) { + +		/* Do we handle this device? */ +		if (!check_device(&dev->dev)) +			continue; + +		/* Is there already any domain for it? 
*/ +		if (domain_for_device(&dev->dev)) +			continue; + +		devid = get_device_id(&dev->dev); + +		dma_dom = dma_ops_domain_alloc(); +		if (!dma_dom) +			continue; +		init_unity_mappings_for_device(dma_dom, devid); +		dma_dom->target_dev = devid; + +		attach_device(&dev->dev, &dma_dom->domain); + +		list_add_tail(&dma_dom->list, &iommu_pd_list); +	} +} + +static struct dma_map_ops amd_iommu_dma_ops = { +	.alloc_coherent = alloc_coherent, +	.free_coherent = free_coherent, +	.map_page = map_page, +	.unmap_page = unmap_page, +	.map_sg = map_sg, +	.unmap_sg = unmap_sg, +	.dma_supported = amd_iommu_dma_supported, +}; + +static unsigned device_dma_ops_init(void) +{ +	struct pci_dev *pdev = NULL; +	unsigned unhandled = 0; + +	for_each_pci_dev(pdev) { +		if (!check_device(&pdev->dev)) { +			unhandled += 1; +			continue; +		} + +		pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; +	} + +	return unhandled; +} + +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ + +void __init amd_iommu_init_api(void) +{ +	register_iommu(&amd_iommu_ops); +} + +int __init amd_iommu_init_dma_ops(void) +{ +	struct amd_iommu *iommu; +	int ret, unhandled; + +	/* +	 * first allocate a default protection domain for every IOMMU we +	 * found in the system. Devices not assigned to any other +	 * protection domain will be assigned to the default one. +	 */ +	for_each_iommu(iommu) { +		iommu->default_dom = dma_ops_domain_alloc(); +		if (iommu->default_dom == NULL) +			return -ENOMEM; +		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; +		ret = iommu_init_unity_mappings(iommu); +		if (ret) +			goto free_domains; +	} + +	/* +	 * Pre-allocate the protection domains for each device. +	 */ +	prealloc_protection_domains(); + +	iommu_detected = 1; +	swiotlb = 0; + +	/* Make the driver finally visible to the drivers */ +	unhandled = device_dma_ops_init(); +	if (unhandled && max_pfn > MAX_DMA32_PFN) { +		/* There are unhandled devices - initialize swiotlb for them */ +		swiotlb = 1; +	} + +	amd_iommu_stats_init(); + +	return 0; + +free_domains: + +	for_each_iommu(iommu) { +		if (iommu->default_dom) +			dma_ops_domain_free(iommu->default_dom); +	} + +	return ret; +} + +/***************************************************************************** + * + * The following functions belong to the exported interface of AMD IOMMU + * + * This interface allows access to lower level functions of the IOMMU + * like protection domain handling and assignement of devices to domains + * which is not possible with the dma_ops interface. 
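A minimal sketch of a consumer of this interface through the generic IOMMU API
(backed by the amd_iommu_ops registered in amd_iommu_init_api() above); pdev and
the addresses are placeholders:

	struct iommu_domain *dom;

	dom = iommu_domain_alloc();
	if (!dom)
		return -ENOMEM;

	if (iommu_attach_device(dom, &pdev->dev))
		goto out_free;

	/* map one 4 KiB page (gfp_order 0) at IOVA 1 MiB */
	iommu_map(dom, 0x100000, 0x200000, 0, IOMMU_READ | IOMMU_WRITE);

	/* ... use the mapping ... */

	iommu_unmap(dom, 0x100000, 0);
	iommu_detach_device(dom, &pdev->dev);
out_free:
	iommu_domain_free(dom);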
+ * + *****************************************************************************/ + +static void cleanup_domain(struct protection_domain *domain) +{ +	struct iommu_dev_data *dev_data, *next; +	unsigned long flags; + +	write_lock_irqsave(&amd_iommu_devtable_lock, flags); + +	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { +		__detach_device(dev_data); +		atomic_set(&dev_data->bind, 0); +	} + +	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static void protection_domain_free(struct protection_domain *domain) +{ +	if (!domain) +		return; + +	del_domain_from_list(domain); + +	if (domain->id) +		domain_id_free(domain->id); + +	kfree(domain); +} + +static struct protection_domain *protection_domain_alloc(void) +{ +	struct protection_domain *domain; + +	domain = kzalloc(sizeof(*domain), GFP_KERNEL); +	if (!domain) +		return NULL; + +	spin_lock_init(&domain->lock); +	mutex_init(&domain->api_lock); +	domain->id = domain_id_alloc(); +	if (!domain->id) +		goto out_err; +	INIT_LIST_HEAD(&domain->dev_list); + +	add_domain_to_list(domain); + +	return domain; + +out_err: +	kfree(domain); + +	return NULL; +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ +	struct protection_domain *domain; + +	domain = protection_domain_alloc(); +	if (!domain) +		goto out_free; + +	domain->mode    = PAGE_MODE_3_LEVEL; +	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); +	if (!domain->pt_root) +		goto out_free; + +	dom->priv = domain; + +	return 0; + +out_free: +	protection_domain_free(domain); + +	return -ENOMEM; +} + +static void amd_iommu_domain_destroy(struct iommu_domain *dom) +{ +	struct protection_domain *domain = dom->priv; + +	if (!domain) +		return; + +	if (domain->dev_cnt > 0) +		cleanup_domain(domain); + +	BUG_ON(domain->dev_cnt != 0); + +	free_pagetable(domain); + +	protection_domain_free(domain); + +	dom->priv = NULL; +} + +static void amd_iommu_detach_device(struct iommu_domain *dom, +				    struct device *dev) +{ +	struct iommu_dev_data *dev_data = dev->archdata.iommu; +	struct amd_iommu *iommu; +	u16 devid; + +	if (!check_device(dev)) +		return; + +	devid = get_device_id(dev); + +	if (dev_data->domain != NULL) +		detach_device(dev); + +	iommu = amd_iommu_rlookup_table[devid]; +	if (!iommu) +		return; + +	iommu_completion_wait(iommu); +} + +static int amd_iommu_attach_device(struct iommu_domain *dom, +				   struct device *dev) +{ +	struct protection_domain *domain = dom->priv; +	struct iommu_dev_data *dev_data; +	struct amd_iommu *iommu; +	int ret; + +	if (!check_device(dev)) +		return -EINVAL; + +	dev_data = dev->archdata.iommu; + +	iommu = amd_iommu_rlookup_table[dev_data->devid]; +	if (!iommu) +		return -EINVAL; + +	if (dev_data->domain) +		detach_device(dev); + +	ret = attach_device(dev, domain); + +	iommu_completion_wait(iommu); + +	return ret; +} + +static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, +			 phys_addr_t paddr, int gfp_order, int iommu_prot) +{ +	unsigned long page_size = 0x1000UL << gfp_order; +	struct protection_domain *domain = dom->priv; +	int prot = 0; +	int ret; + +	if (iommu_prot & IOMMU_READ) +		prot |= IOMMU_PROT_IR; +	if (iommu_prot & IOMMU_WRITE) +		prot |= IOMMU_PROT_IW; + +	mutex_lock(&domain->api_lock); +	ret = iommu_map_page(domain, iova, paddr, prot, page_size); +	mutex_unlock(&domain->api_lock); + +	return ret; +} + +static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, +			   int gfp_order) +{ +	struct protection_domain *domain = dom->priv; +	unsigned long page_size, 
unmap_size; + +	page_size  = 0x1000UL << gfp_order; + +	mutex_lock(&domain->api_lock); +	unmap_size = iommu_unmap_page(domain, iova, page_size); +	mutex_unlock(&domain->api_lock); + +	domain_flush_tlb_pde(domain); + +	return get_order(unmap_size); +} + +static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, +					  unsigned long iova) +{ +	struct protection_domain *domain = dom->priv; +	unsigned long offset_mask; +	phys_addr_t paddr; +	u64 *pte, __pte; + +	pte = fetch_pte(domain, iova); + +	if (!pte || !IOMMU_PTE_PRESENT(*pte)) +		return 0; + +	if (PM_PTE_LEVEL(*pte) == 0) +		offset_mask = PAGE_SIZE - 1; +	else +		offset_mask = PTE_PAGE_SIZE(*pte) - 1; + +	__pte = *pte & PM_ADDR_MASK; +	paddr = (__pte & ~offset_mask) | (iova & offset_mask); + +	return paddr; +} + +static int amd_iommu_domain_has_cap(struct iommu_domain *domain, +				    unsigned long cap) +{ +	switch (cap) { +	case IOMMU_CAP_CACHE_COHERENCY: +		return 1; +	} + +	return 0; +} + +static struct iommu_ops amd_iommu_ops = { +	.domain_init = amd_iommu_domain_init, +	.domain_destroy = amd_iommu_domain_destroy, +	.attach_dev = amd_iommu_attach_device, +	.detach_dev = amd_iommu_detach_device, +	.map = amd_iommu_map, +	.unmap = amd_iommu_unmap, +	.iova_to_phys = amd_iommu_iova_to_phys, +	.domain_has_cap = amd_iommu_domain_has_cap, +}; + +/***************************************************************************** + * + * The next functions do a basic initialization of IOMMU for pass through + * mode + * + * In passthrough mode the IOMMU is initialized and enabled but not used for + * DMA-API translation. + * + *****************************************************************************/ + +int __init amd_iommu_init_passthrough(void) +{ +	struct amd_iommu *iommu; +	struct pci_dev *dev = NULL; +	u16 devid; + +	/* allocate passthrough domain */ +	pt_domain = protection_domain_alloc(); +	if (!pt_domain) +		return -ENOMEM; + +	pt_domain->mode |= PAGE_MODE_NONE; + +	for_each_pci_dev(dev) { +		if (!check_device(&dev->dev)) +			continue; + +		devid = get_device_id(&dev->dev); + +		iommu = amd_iommu_rlookup_table[devid]; +		if (!iommu) +			continue; + +		attach_device(&dev->dev, pt_domain); +	} + +	pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); + +	return 0; +} diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c new file mode 100644 index 000000000000..82d2410f4205 --- /dev/null +++ b/drivers/iommu/amd_iommu_init.c @@ -0,0 +1,1574 @@ +/* + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + *         Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/syscore_ops.h> +#include <linux/interrupt.h> +#include <linux/msi.h> +#include <linux/amd-iommu.h> +#include <asm/pci-direct.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/x86_init.h> +#include <asm/iommu_table.h> + +#include "amd_iommu_proto.h" +#include "amd_iommu_types.h" + +/* + * definitions for the ACPI scanning code + */ +#define IVRS_HEADER_LENGTH 48 + +#define ACPI_IVHD_TYPE                  0x10 +#define ACPI_IVMD_TYPE_ALL              0x20 +#define ACPI_IVMD_TYPE                  0x21 +#define ACPI_IVMD_TYPE_RANGE            0x22 + +#define IVHD_DEV_ALL                    0x01 +#define IVHD_DEV_SELECT                 0x02 +#define IVHD_DEV_SELECT_RANGE_START     0x03 +#define IVHD_DEV_RANGE_END              0x04 +#define IVHD_DEV_ALIAS                  0x42 +#define IVHD_DEV_ALIAS_RANGE            0x43 +#define IVHD_DEV_EXT_SELECT             0x46 +#define IVHD_DEV_EXT_SELECT_RANGE       0x47 + +#define IVHD_FLAG_HT_TUN_EN_MASK        0x01 +#define IVHD_FLAG_PASSPW_EN_MASK        0x02 +#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04 +#define IVHD_FLAG_ISOC_EN_MASK          0x08 + +#define IVMD_FLAG_EXCL_RANGE            0x08 +#define IVMD_FLAG_UNITY_MAP             0x01 + +#define ACPI_DEVFLAG_INITPASS           0x01 +#define ACPI_DEVFLAG_EXTINT             0x02 +#define ACPI_DEVFLAG_NMI                0x04 +#define ACPI_DEVFLAG_SYSMGT1            0x10 +#define ACPI_DEVFLAG_SYSMGT2            0x20 +#define ACPI_DEVFLAG_LINT0              0x40 +#define ACPI_DEVFLAG_LINT1              0x80 +#define ACPI_DEVFLAG_ATSDIS             0x10000000 + +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ +struct ivhd_header { +	u8 type; +	u8 flags; +	u16 length; +	u16 devid; +	u16 cap_ptr; +	u64 mmio_phys; +	u16 pci_seg; +	u16 info; +	u32 reserved; +} __attribute__((packed)); + +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ +struct ivhd_entry { +	u8 type; +	u16 devid; +	u8 flags; +	u32 ext; +} __attribute__((packed)); + +/* + * An AMD IOMMU memory definition structure. It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ +struct ivmd_header { +	u8 type; +	u8 flags; +	u16 length; +	u16 devid; +	u16 aux; +	u64 resv; +	u64 range_start; +	u64 range_length; +} __attribute__((packed)); + +bool amd_iommu_dump; + +static int __initdata amd_iommu_detected; +static bool __initdata amd_iommu_disabled; + +u16 amd_iommu_last_bdf;			/* largest PCI device id we have +					   to handle */ +LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings +					   we find in ACPI */ +bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */ + +LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the +					   system */ + +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + +/* IOMMUs have a non-present cache? 
*/ +bool amd_iommu_np_cache __read_mostly; +bool amd_iommu_iotlb_sup __read_mostly = true; + +/* + * The ACPI table parsing functions set this variable on an error + */ +static int __initdata amd_iommu_init_err; + +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + +/* + * Pointer to the device table which is shared by all AMD IOMMUs + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ +struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ +u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ +struct amd_iommu **amd_iommu_rlookup_table; + +/* + * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap + * to know which ones are already in use. + */ +unsigned long *amd_iommu_pd_alloc_bitmap; + +static u32 dev_table_size;	/* size of the device table */ +static u32 alias_table_size;	/* size of the alias table */ +static u32 rlookup_table_size;	/* size if the rlookup table */ + +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +extern void iommu_flush_all_caches(struct amd_iommu *iommu); + +static inline void update_last_devid(u16 devid) +{ +	if (devid > amd_iommu_last_bdf) +		amd_iommu_last_bdf = devid; +} + +static inline unsigned long tbl_size(int entry_size) +{ +	unsigned shift = PAGE_SHIFT + +			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size); + +	return 1UL << shift; +} + +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ +	u32 val; + +	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +	pci_read_config_dword(iommu->dev, 0xfc, &val); +	return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ +	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); +	pci_write_config_dword(iommu->dev, 0xfc, val); +	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ +	u32 val; + +	pci_write_config_dword(iommu->dev, 0xf0, address); +	pci_read_config_dword(iommu->dev, 0xf4, &val); +	return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ +	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); +	pci_write_config_dword(iommu->dev, 0xf4, val); +} + +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required for that driver. + * + ****************************************************************************/ + +/* + * This function set the exclusion range in the IOMMU. 
DMA accesses to the + * exclusion range are passed through untranslated + */ +static void iommu_set_exclusion_range(struct amd_iommu *iommu) +{ +	u64 start = iommu->exclusion_start & PAGE_MASK; +	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; +	u64 entry; + +	if (!iommu->exclusion_start) +		return; + +	entry = start | MMIO_EXCL_ENABLE_MASK; +	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET, +			&entry, sizeof(entry)); + +	entry = limit; +	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET, +			&entry, sizeof(entry)); +} + +/* Programs the physical address of the device table into the IOMMU hardware */ +static void __init iommu_set_device_table(struct amd_iommu *iommu) +{ +	u64 entry; + +	BUG_ON(iommu->mmio_base == NULL); + +	entry = virt_to_phys(amd_iommu_dev_table); +	entry |= (dev_table_size >> 12) - 1; +	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, +			&entry, sizeof(entry)); +} + +/* Generic functions to enable/disable certain features of the IOMMU. */ +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +{ +	u32 ctrl; + +	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); +	ctrl |= (1 << bit); +	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +{ +	u32 ctrl; + +	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); +	ctrl &= ~(1 << bit); +	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + +/* Function to enable the hardware */ +static void iommu_enable(struct amd_iommu *iommu) +{ +	static const char * const feat_str[] = { +		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]", +		"IA", "GA", "HE", "PC", NULL +	}; +	int i; + +	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", +	       dev_name(&iommu->dev->dev), iommu->cap_ptr); + +	if (iommu->cap & (1 << IOMMU_CAP_EFR)) { +		printk(KERN_CONT " extended features: "); +		for (i = 0; feat_str[i]; ++i) +			if (iommu_feature(iommu, (1ULL << i))) +				printk(KERN_CONT " %s", feat_str[i]); +	} +	printk(KERN_CONT "\n"); + +	iommu_feature_enable(iommu, CONTROL_IOMMU_EN); +} + +static void iommu_disable(struct amd_iommu *iommu) +{ +	/* Disable command buffer */ +	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + +	/* Disable event logging and event interrupts */ +	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); +	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + +	/* Disable IOMMU hardware itself */ +	iommu_feature_disable(iommu, CONTROL_IOMMU_EN); +} + +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ +static u8 * __init iommu_map_mmio_space(u64 address) +{ +	u8 *ret; + +	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) { +		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n", +			address); +		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n"); +		return NULL; +	} + +	ret = ioremap_nocache(address, MMIO_REGION_LENGTH); +	if (ret != NULL) +		return ret; + +	release_mem_region(address, MMIO_REGION_LENGTH); + +	return NULL; +} + +static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) +{ +	if (iommu->mmio_base) +		iounmap(iommu->mmio_base); +	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); +} + +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. 
Upon this information the size of the shared data + * structures is determined later. + * + ****************************************************************************/ + +/* + * This function calculates the length of a given IVHD entry + */ +static inline int ivhd_entry_length(u8 *ivhd) +{ +	return 0x04 << (*ivhd >> 6); +} + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ +static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) +{ +	u32 cap; + +	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); +	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + +	return 0; +} + +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function looks if there is a higher device id defined in the ACPI table + */ +static int __init find_last_devid_from_ivhd(struct ivhd_header *h) +{ +	u8 *p = (void *)h, *end = (void *)h; +	struct ivhd_entry *dev; + +	p += sizeof(*h); +	end += h->length; + +	find_last_devid_on_pci(PCI_BUS(h->devid), +			PCI_SLOT(h->devid), +			PCI_FUNC(h->devid), +			h->cap_ptr); + +	while (p < end) { +		dev = (struct ivhd_entry *)p; +		switch (dev->type) { +		case IVHD_DEV_SELECT: +		case IVHD_DEV_RANGE_END: +		case IVHD_DEV_ALIAS: +		case IVHD_DEV_EXT_SELECT: +			/* all the above subfield types refer to device ids */ +			update_last_devid(dev->devid); +			break; +		default: +			break; +		} +		p += ivhd_entry_length(p); +	} + +	WARN_ON(p != end); + +	return 0; +} + +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ +static int __init find_last_devid_acpi(struct acpi_table_header *table) +{ +	int i; +	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table; +	struct ivhd_header *h; + +	/* +	 * Validate checksum here so we don't need to do it when +	 * we actually parse the table +	 */ +	for (i = 0; i < table->length; ++i) +		checksum += p[i]; +	if (checksum != 0) { +		/* ACPI table corrupt */ +		amd_iommu_init_err = -ENODEV; +		return 0; +	} + +	p += IVRS_HEADER_LENGTH; + +	end += table->length; +	while (p < end) { +		h = (struct ivhd_header *)p; +		switch (h->type) { +		case ACPI_IVHD_TYPE: +			find_last_devid_from_ivhd(h); +			break; +		default: +			break; +		} +		p += h->length; +	} +	WARN_ON(p != end); + +	return 0; +} + +/**************************************************************************** + * + * The following functions belong the the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU. We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ +static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) +{ +	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, +			get_order(CMD_BUFFER_SIZE)); + +	if (cmd_buf == NULL) +		return NULL; + +	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; + +	return cmd_buf; +} + +/* + * This function resets the command buffer if the IOMMU stopped fetching + * commands from it. 
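The command buffer is a ring: the driver queues commands at the tail register,
the IOMMU fetches them from the head register, and the ring is empty when the
two are equal. The reset below stops fetching and zeroes both pointers. A small
illustrative check of that invariant:

	u32 head, tail;
	bool empty;

	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);

	/* true once the IOMMU has fetched every queued command */
	empty = (head == tail);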
+ */ +void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) +{ +	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + +	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); +	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + +	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ +	u64 entry; + +	BUG_ON(iommu->cmd_buf == NULL); + +	entry = (u64)virt_to_phys(iommu->cmd_buf); +	entry |= MMIO_CMD_SIZE_512; + +	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, +		    &entry, sizeof(entry)); + +	amd_iommu_reset_cmd_buffer(iommu); +	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); +} + +static void __init free_command_buffer(struct amd_iommu *iommu) +{ +	free_pages((unsigned long)iommu->cmd_buf, +		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); +} + +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) +{ +	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, +						get_order(EVT_BUFFER_SIZE)); + +	if (iommu->evt_buf == NULL) +		return NULL; + +	iommu->evt_buf_size = EVT_BUFFER_SIZE; + +	return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ +	u64 entry; + +	BUG_ON(iommu->evt_buf == NULL); + +	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + +	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, +		    &entry, sizeof(entry)); + +	/* set head and tail to zero manually */ +	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); +	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + +	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); +} + +static void __init free_event_buffer(struct amd_iommu *iommu) +{ +	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); +} + +/* sets a specific bit in the device table entry. 
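The entry is treated here as an array of 32-bit words, so the flat bit number is
split into a word index and a bit offset. A worked example with an arbitrary bit
number:

	/* bit = 0x3e (decimal 62):
	 *   word index      = (0x3e >> 5) & 0x07 = 1   ->  data[1]
	 *   bit within word =  0x3e & 0x1f       = 30  ->  mask 1 << 30
	 */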
*/ +static void set_dev_entry_bit(u16 devid, u8 bit) +{ +	int i = (bit >> 5) & 0x07; +	int _bit = bit & 0x1f; + +	amd_iommu_dev_table[devid].data[i] |= (1 << _bit); +} + +static int get_dev_entry_bit(u16 devid, u8 bit) +{ +	int i = (bit >> 5) & 0x07; +	int _bit = bit & 0x1f; + +	return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ +	int sysmgt; + +	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | +		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + +	if (sysmgt == 0x01) +		set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ +	amd_iommu_rlookup_table[devid] = iommu; +} + +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, +					   u16 devid, u32 flags, u32 ext_flags) +{ +	if (flags & ACPI_DEVFLAG_INITPASS) +		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); +	if (flags & ACPI_DEVFLAG_EXTINT) +		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS); +	if (flags & ACPI_DEVFLAG_NMI) +		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS); +	if (flags & ACPI_DEVFLAG_SYSMGT1) +		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1); +	if (flags & ACPI_DEVFLAG_SYSMGT2) +		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2); +	if (flags & ACPI_DEVFLAG_LINT0) +		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); +	if (flags & ACPI_DEVFLAG_LINT1) +		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + +	amd_iommu_apply_erratum_63(devid); + +	set_iommu_for_device(iommu, devid); +} + +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ +static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) +{ +	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + +	if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) +		return; + +	if (iommu) { +		/* +		 * We only can configure exclusion ranges per IOMMU, not +		 * per device. But we can enable the exclusion range per +		 * device. This is done here +		 */ +		set_dev_entry_bit(m->devid, DEV_ENTRY_EX); +		iommu->exclusion_start = m->range_start; +		iommu->exclusion_length = m->range_length; +	} +} + +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. 
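Among the values read below is the range capability register, which encodes the
bus and the first/last device the IOMMU serves. A worked decode with a made-up
register value (calc_devid() packs the bus into the high byte of the 16-bit
device id):

	/* range = 0xf8080000:
	 *   MMIO_GET_BUS(range) = 0x00
	 *   MMIO_GET_FD(range)  = 0x08 -> first_device = calc_devid(0x00, 0x08) = 0x0008
	 *   MMIO_GET_LD(range)  = 0xf8 -> last_device  = calc_devid(0x00, 0xf8) = 0x00f8
	 */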
It reads the hardware + * capabilities and the first/last device entries + */ +static void __init init_iommu_from_pci(struct amd_iommu *iommu) +{ +	int cap_ptr = iommu->cap_ptr; +	u32 range, misc, low, high; +	int i, j; + +	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, +			      &iommu->cap); +	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, +			      &range); +	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, +			      &misc); + +	iommu->first_device = calc_devid(MMIO_GET_BUS(range), +					 MMIO_GET_FD(range)); +	iommu->last_device = calc_devid(MMIO_GET_BUS(range), +					MMIO_GET_LD(range)); +	iommu->evt_msi_num = MMIO_MSI_NUM(misc); + +	if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) +		amd_iommu_iotlb_sup = false; + +	/* read extended feature bits */ +	low  = readl(iommu->mmio_base + MMIO_EXT_FEATURES); +	high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); + +	iommu->features = ((u64)high << 32) | low; + +	if (!is_rd890_iommu(iommu->dev)) +		return; + +	/* +	 * Some rd890 systems may not be fully reconfigured by the BIOS, so +	 * it's necessary for us to store this information so it can be +	 * reprogrammed on resume +	 */ + +	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, +			      &iommu->stored_addr_lo); +	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, +			      &iommu->stored_addr_hi); + +	/* Low bit locks writes to configuration space */ +	iommu->stored_addr_lo &= ~1; + +	for (i = 0; i < 6; i++) +		for (j = 0; j < 0x12; j++) +			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + +	for (i = 0; i < 0x83; i++) +		iommu->stored_l2[i] = iommu_read_l2(iommu, i); +} + +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ +static void __init init_iommu_from_acpi(struct amd_iommu *iommu, +					struct ivhd_header *h) +{ +	u8 *p = (u8 *)h; +	u8 *end = p, flags = 0; +	u16 devid = 0, devid_start = 0, devid_to = 0; +	u32 dev_i, ext_flags = 0; +	bool alias = false; +	struct ivhd_entry *e; + +	/* +	 * First save the recommended feature enable bits from ACPI +	 */ +	iommu->acpi_flags = h->flags; + +	/* +	 * Done. 
Now parse the device entries +	 */ +	p += sizeof(struct ivhd_header); +	end += h->length; + + +	while (p < end) { +		e = (struct ivhd_entry *)p; +		switch (e->type) { +		case IVHD_DEV_ALL: + +			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x" +				    " last device %02x:%02x.%x flags: %02x\n", +				    PCI_BUS(iommu->first_device), +				    PCI_SLOT(iommu->first_device), +				    PCI_FUNC(iommu->first_device), +				    PCI_BUS(iommu->last_device), +				    PCI_SLOT(iommu->last_device), +				    PCI_FUNC(iommu->last_device), +				    e->flags); + +			for (dev_i = iommu->first_device; +					dev_i <= iommu->last_device; ++dev_i) +				set_dev_entry_from_acpi(iommu, dev_i, +							e->flags, 0); +			break; +		case IVHD_DEV_SELECT: + +			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x " +				    "flags: %02x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags); + +			devid = e->devid; +			set_dev_entry_from_acpi(iommu, devid, e->flags, 0); +			break; +		case IVHD_DEV_SELECT_RANGE_START: + +			DUMP_printk("  DEV_SELECT_RANGE_START\t " +				    "devid: %02x:%02x.%x flags: %02x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags); + +			devid_start = e->devid; +			flags = e->flags; +			ext_flags = 0; +			alias = false; +			break; +		case IVHD_DEV_ALIAS: + +			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x " +				    "flags: %02x devid_to: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, +				    PCI_BUS(e->ext >> 8), +				    PCI_SLOT(e->ext >> 8), +				    PCI_FUNC(e->ext >> 8)); + +			devid = e->devid; +			devid_to = e->ext >> 8; +			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0); +			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); +			amd_iommu_alias_table[devid] = devid_to; +			break; +		case IVHD_DEV_ALIAS_RANGE: + +			DUMP_printk("  DEV_ALIAS_RANGE\t\t " +				    "devid: %02x:%02x.%x flags: %02x " +				    "devid_to: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, +				    PCI_BUS(e->ext >> 8), +				    PCI_SLOT(e->ext >> 8), +				    PCI_FUNC(e->ext >> 8)); + +			devid_start = e->devid; +			flags = e->flags; +			devid_to = e->ext >> 8; +			ext_flags = 0; +			alias = true; +			break; +		case IVHD_DEV_EXT_SELECT: + +			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " +				    "flags: %02x ext: %08x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, e->ext); + +			devid = e->devid; +			set_dev_entry_from_acpi(iommu, devid, e->flags, +						e->ext); +			break; +		case IVHD_DEV_EXT_SELECT_RANGE: + +			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: " +				    "%02x:%02x.%x flags: %02x ext: %08x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, e->ext); + +			devid_start = e->devid; +			flags = e->flags; +			ext_flags = e->ext; +			alias = false; +			break; +		case IVHD_DEV_RANGE_END: + +			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid)); + +			devid = e->devid; +			for (dev_i = devid_start; dev_i <= devid; ++dev_i) { +				if (alias) { +					amd_iommu_alias_table[dev_i] = devid_to; +					set_dev_entry_from_acpi(iommu, +						devid_to, flags, ext_flags); +				} +				set_dev_entry_from_acpi(iommu, dev_i, 
+							flags, ext_flags); +			} +			break; +		default: +			break; +		} + +		p += ivhd_entry_length(p); +	} +} + +/* Initializes the device->iommu mapping for the driver */ +static int __init init_iommu_devices(struct amd_iommu *iommu) +{ +	u32 i; + +	for (i = iommu->first_device; i <= iommu->last_device; ++i) +		set_iommu_for_device(iommu, i); + +	return 0; +} + +static void __init free_iommu_one(struct amd_iommu *iommu) +{ +	free_command_buffer(iommu); +	free_event_buffer(iommu); +	iommu_unmap_mmio_space(iommu); +} + +static void __init free_iommu_all(void) +{ +	struct amd_iommu *iommu, *next; + +	for_each_iommu_safe(iommu, next) { +		list_del(&iommu->list); +		free_iommu_one(iommu); +		kfree(iommu); +	} +} + +/* + * This function clues the initialization function for one IOMMU + * together and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ +static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) +{ +	spin_lock_init(&iommu->lock); + +	/* Add IOMMU to internal data structures */ +	list_add_tail(&iommu->list, &amd_iommu_list); +	iommu->index             = amd_iommus_present++; + +	if (unlikely(iommu->index >= MAX_IOMMUS)) { +		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); +		return -ENOSYS; +	} + +	/* Index is fine - add IOMMU to the array */ +	amd_iommus[iommu->index] = iommu; + +	/* +	 * Copy data from ACPI table entry to the iommu struct +	 */ +	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); +	if (!iommu->dev) +		return 1; + +	iommu->cap_ptr = h->cap_ptr; +	iommu->pci_seg = h->pci_seg; +	iommu->mmio_phys = h->mmio_phys; +	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); +	if (!iommu->mmio_base) +		return -ENOMEM; + +	iommu->cmd_buf = alloc_command_buffer(iommu); +	if (!iommu->cmd_buf) +		return -ENOMEM; + +	iommu->evt_buf = alloc_event_buffer(iommu); +	if (!iommu->evt_buf) +		return -ENOMEM; + +	iommu->int_enabled = false; + +	init_iommu_from_pci(iommu); +	init_iommu_from_acpi(iommu, h); +	init_iommu_devices(iommu); + +	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) +		amd_iommu_np_cache = true; + +	return pci_enable_device(iommu->dev); +} + +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ +static int __init init_iommu_all(struct acpi_table_header *table) +{ +	u8 *p = (u8 *)table, *end = (u8 *)table; +	struct ivhd_header *h; +	struct amd_iommu *iommu; +	int ret; + +	end += table->length; +	p += IVRS_HEADER_LENGTH; + +	while (p < end) { +		h = (struct ivhd_header *)p; +		switch (*p) { +		case ACPI_IVHD_TYPE: + +			DUMP_printk("device: %02x:%02x.%01x cap: %04x " +				    "seg: %d flags: %01x info %04x\n", +				    PCI_BUS(h->devid), PCI_SLOT(h->devid), +				    PCI_FUNC(h->devid), h->cap_ptr, +				    h->pci_seg, h->flags, h->info); +			DUMP_printk("       mmio-addr: %016llx\n", +				    h->mmio_phys); + +			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); +			if (iommu == NULL) { +				amd_iommu_init_err = -ENOMEM; +				return 0; +			} + +			ret = init_iommu_one(iommu, h); +			if (ret) { +				amd_iommu_init_err = ret; +				return 0; +			} +			break; +		default: +			break; +		} +		p += h->length; + +	} +	WARN_ON(p != end); + +	return 0; +} + +/**************************************************************************** + * + * The following functions initialize the MSI interrupts for all IOMMUs + * in the system. 
Its a bit challenging because there could be multiple + * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per + * pci_dev. + * + ****************************************************************************/ + +static int iommu_setup_msi(struct amd_iommu *iommu) +{ +	int r; + +	if (pci_enable_msi(iommu->dev)) +		return 1; + +	r = request_threaded_irq(iommu->dev->irq, +				 amd_iommu_int_handler, +				 amd_iommu_int_thread, +				 0, "AMD-Vi", +				 iommu->dev); + +	if (r) { +		pci_disable_msi(iommu->dev); +		return 1; +	} + +	iommu->int_enabled = true; +	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + +	return 0; +} + +static int iommu_init_msi(struct amd_iommu *iommu) +{ +	if (iommu->int_enabled) +		return 0; + +	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) +		return iommu_setup_msi(iommu); + +	return 1; +} + +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). + * + ****************************************************************************/ + +static void __init free_unity_maps(void) +{ +	struct unity_map_entry *entry, *next; + +	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) { +		list_del(&entry->list); +		kfree(entry); +	} +} + +/* called when we find an exclusion range definition in ACPI */ +static int __init init_exclusion_range(struct ivmd_header *m) +{ +	int i; + +	switch (m->type) { +	case ACPI_IVMD_TYPE: +		set_device_exclusion_range(m->devid, m); +		break; +	case ACPI_IVMD_TYPE_ALL: +		for (i = 0; i <= amd_iommu_last_bdf; ++i) +			set_device_exclusion_range(i, m); +		break; +	case ACPI_IVMD_TYPE_RANGE: +		for (i = m->devid; i <= m->aux; ++i) +			set_device_exclusion_range(i, m); +		break; +	default: +		break; +	} + +	return 0; +} + +/* called for unity map ACPI definition */ +static int __init init_unity_map_range(struct ivmd_header *m) +{ +	struct unity_map_entry *e = 0; +	char *s; + +	e = kzalloc(sizeof(*e), GFP_KERNEL); +	if (e == NULL) +		return -ENOMEM; + +	switch (m->type) { +	default: +		kfree(e); +		return 0; +	case ACPI_IVMD_TYPE: +		s = "IVMD_TYPEi\t\t\t"; +		e->devid_start = e->devid_end = m->devid; +		break; +	case ACPI_IVMD_TYPE_ALL: +		s = "IVMD_TYPE_ALL\t\t"; +		e->devid_start = 0; +		e->devid_end = amd_iommu_last_bdf; +		break; +	case ACPI_IVMD_TYPE_RANGE: +		s = "IVMD_TYPE_RANGE\t\t"; +		e->devid_start = m->devid; +		e->devid_end = m->aux; +		break; +	} +	e->address_start = PAGE_ALIGN(m->range_start); +	e->address_end = e->address_start + PAGE_ALIGN(m->range_length); +	e->prot = m->flags >> 1; + +	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" +		    " range_start: %016llx range_end: %016llx flags: %x\n", s, +		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), +		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), +		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), +		    e->address_start, e->address_end, m->flags); + +	list_add_tail(&e->list, &amd_iommu_unity_map); + +	return 0; +} + +/* iterates over all memory definitions we find in the ACPI table */ +static int __init init_memory_definitions(struct acpi_table_header *table) +{ +	u8 *p = (u8 *)table, *end = (u8 *)table; +	struct ivmd_header *m; + +	end += table->length; +	p += IVRS_HEADER_LENGTH; + +	while (p < end) { +		m = (struct ivmd_header *)p; +		if (m->flags & IVMD_FLAG_EXCL_RANGE) +			init_exclusion_range(m); +		
else if (m->flags & IVMD_FLAG_UNITY_MAP) +			init_unity_map_range(m); + +		p += m->length; +	} + +	return 0; +} + +/* + * Init the device table to not allow DMA access for devices and + * suppress all page faults + */ +static void init_device_table(void) +{ +	u32 devid; + +	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { +		set_dev_entry_bit(devid, DEV_ENTRY_VALID); +		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); +	} +} + +static void iommu_init_flags(struct amd_iommu *iommu) +{ +	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? +		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : +		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + +	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? +		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : +		iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + +	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? +		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : +		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + +	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? +		iommu_feature_enable(iommu, CONTROL_ISOC_EN) : +		iommu_feature_disable(iommu, CONTROL_ISOC_EN); + +	/* +	 * make IOMMU memory accesses cache coherent +	 */ +	iommu_feature_enable(iommu, CONTROL_COHERENT_EN); +} + +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) +{ +	int i, j; +	u32 ioc_feature_control; +	struct pci_dev *pdev = NULL; + +	/* RD890 BIOSes may not have completely reconfigured the iommu */ +	if (!is_rd890_iommu(iommu->dev)) +		return; + +	/* +	 * First, we need to ensure that the iommu is enabled. This is +	 * controlled by a register in the northbridge +	 */ +	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + +	if (!pdev) +		return; + +	/* Select Northbridge indirect register 0x75 and enable writing */ +	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); +	pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + +	/* Enable the iommu */ +	if (!(ioc_feature_control & 0x1)) +		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + +	pci_dev_put(pdev); + +	/* Restore the iommu BAR */ +	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, +			       iommu->stored_addr_lo); +	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, +			       iommu->stored_addr_hi); + +	/* Restore the l1 indirect regs for each of the 6 l1s */ +	for (i = 0; i < 6; i++) +		for (j = 0; j < 0x12; j++) +			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + +	/* Restore the l2 indirect regs */ +	for (i = 0; i < 0x83; i++) +		iommu_write_l2(iommu, i, iommu->stored_l2[i]); + +	/* Lock PCI setup registers */ +	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, +			       iommu->stored_addr_lo | 1); +} + +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ +static void enable_iommus(void) +{ +	struct amd_iommu *iommu; + +	for_each_iommu(iommu) { +		iommu_disable(iommu); +		iommu_init_flags(iommu); +		iommu_set_device_table(iommu); +		iommu_enable_command_buffer(iommu); +		iommu_enable_event_buffer(iommu); +		iommu_set_exclusion_range(iommu); +		iommu_init_msi(iommu); +		iommu_enable(iommu); +		iommu_flush_all_caches(iommu); +	} +} + +static void disable_iommus(void) +{ +	struct amd_iommu *iommu; + +	for_each_iommu(iommu) +		iommu_disable(iommu); +} + +/* + * Suspend/Resume support + * disable suspend until real resume implemented + */ + +static void amd_iommu_resume(void) +{ +	struct amd_iommu *iommu; + +	for_each_iommu(iommu) +		iommu_apply_resume_quirks(iommu); + +	/* re-load the hardware */ 
+	enable_iommus(); + +	/* +	 * we have to flush after the IOMMUs are enabled because a +	 * disabled IOMMU will never execute the commands we send +	 */ +	for_each_iommu(iommu) +		iommu_flush_all_caches(iommu); +} + +static int amd_iommu_suspend(void) +{ +	/* disable IOMMUs to go out of the way for BIOS */ +	disable_iommus(); + +	return 0; +} + +static struct syscore_ops amd_iommu_syscore_ops = { +	.suspend = amd_iommu_suspend, +	.resume = amd_iommu_resume, +}; + +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + *	1 pass) Find the highest PCI device id the driver has to handle. + *		Upon this information the size of the data structures is + *		determined that needs to be allocated. + * + *	2 pass) Initialize the data structures just allocated with the + *		information in the ACPI table about available AMD IOMMUs + *		in the system. It also maps the PCI devices in the + *		system to specific IOMMUs + * + *	3 pass) After the basic data structures are allocated and + *		initialized we update them with information about memory + *		remapping requirements parsed out of the ACPI table in + *		this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ +static int __init amd_iommu_init(void) +{ +	int i, ret = 0; + +	/* +	 * First parse ACPI tables to find the largest Bus/Dev/Func +	 * we need to handle. 
Upon this information the shared data +	 * structures for the IOMMUs in the system will be allocated +	 */ +	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) +		return -ENODEV; + +	ret = amd_iommu_init_err; +	if (ret) +		goto out; + +	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE); +	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE); +	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); + +	ret = -ENOMEM; + +	/* Device table - directly used by all IOMMUs */ +	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, +				      get_order(dev_table_size)); +	if (amd_iommu_dev_table == NULL) +		goto out; + +	/* +	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the +	 * IOMMU see for that device +	 */ +	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL, +			get_order(alias_table_size)); +	if (amd_iommu_alias_table == NULL) +		goto free; + +	/* IOMMU rlookup table - find the IOMMU for a specific device */ +	amd_iommu_rlookup_table = (void *)__get_free_pages( +			GFP_KERNEL | __GFP_ZERO, +			get_order(rlookup_table_size)); +	if (amd_iommu_rlookup_table == NULL) +		goto free; + +	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( +					    GFP_KERNEL | __GFP_ZERO, +					    get_order(MAX_DOMAIN_ID/8)); +	if (amd_iommu_pd_alloc_bitmap == NULL) +		goto free; + +	/* init the device table */ +	init_device_table(); + +	/* +	 * let all alias entries point to itself +	 */ +	for (i = 0; i <= amd_iommu_last_bdf; ++i) +		amd_iommu_alias_table[i] = i; + +	/* +	 * never allocate domain 0 because its used as the non-allocated and +	 * error value placeholder +	 */ +	amd_iommu_pd_alloc_bitmap[0] = 1; + +	spin_lock_init(&amd_iommu_pd_lock); + +	/* +	 * now the data structures are allocated and basically initialized +	 * start the real acpi table scan +	 */ +	ret = -ENODEV; +	if (acpi_table_parse("IVRS", init_iommu_all) != 0) +		goto free; + +	if (amd_iommu_init_err) { +		ret = amd_iommu_init_err; +		goto free; +	} + +	if (acpi_table_parse("IVRS", init_memory_definitions) != 0) +		goto free; + +	if (amd_iommu_init_err) { +		ret = amd_iommu_init_err; +		goto free; +	} + +	ret = amd_iommu_init_devices(); +	if (ret) +		goto free; + +	enable_iommus(); + +	if (iommu_pass_through) +		ret = amd_iommu_init_passthrough(); +	else +		ret = amd_iommu_init_dma_ops(); + +	if (ret) +		goto free_disable; + +	amd_iommu_init_api(); + +	amd_iommu_init_notifier(); + +	register_syscore_ops(&amd_iommu_syscore_ops); + +	if (iommu_pass_through) +		goto out; + +	if (amd_iommu_unmap_flush) +		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); +	else +		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + +	x86_platform.iommu_shutdown = disable_iommus; +out: +	return ret; + +free_disable: +	disable_iommus(); + +free: +	amd_iommu_uninit_devices(); + +	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, +		   get_order(MAX_DOMAIN_ID/8)); + +	free_pages((unsigned long)amd_iommu_rlookup_table, +		   get_order(rlookup_table_size)); + +	free_pages((unsigned long)amd_iommu_alias_table, +		   get_order(alias_table_size)); + +	free_pages((unsigned long)amd_iommu_dev_table, +		   get_order(dev_table_size)); + +	free_iommu_all(); + +	free_unity_maps(); + +#ifdef CONFIG_GART_IOMMU +	/* +	 * We failed to initialize the AMD IOMMU - try fallback to GART +	 * if possible. +	 */ +	gart_iommu_init(); + +#endif + +	goto out; +} + +/**************************************************************************** + * + * Early detect code. 
This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ +static int __init early_amd_iommu_detect(struct acpi_table_header *table) +{ +	return 0; +} + +int __init amd_iommu_detect(void) +{ +	if (no_iommu || (iommu_detected && !gart_iommu_aperture)) +		return -ENODEV; + +	if (amd_iommu_disabled) +		return -ENODEV; + +	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { +		iommu_detected = 1; +		amd_iommu_detected = 1; +		x86_init.iommu.iommu_init = amd_iommu_init; + +		/* Make sure ACS will be enabled */ +		pci_request_acs(); +		return 1; +	} +	return -ENODEV; +} + +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + +static int __init parse_amd_iommu_dump(char *str) +{ +	amd_iommu_dump = true; + +	return 1; +} + +static int __init parse_amd_iommu_options(char *str) +{ +	for (; *str; ++str) { +		if (strncmp(str, "fullflush", 9) == 0) +			amd_iommu_unmap_flush = true; +		if (strncmp(str, "off", 3) == 0) +			amd_iommu_disabled = true; +	} + +	return 1; +} + +__setup("amd_iommu_dump", parse_amd_iommu_dump); +__setup("amd_iommu=", parse_amd_iommu_options); + +IOMMU_INIT_FINISH(amd_iommu_detect, +		  gart_iommu_hole_init, +		  0, +		  0); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h new file mode 100644 index 000000000000..7ffaa64410b0 --- /dev/null +++ b/drivers/iommu/amd_iommu_proto.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_PROTO_H +#define _ASM_X86_AMD_IOMMU_PROTO_H + +#include "amd_iommu_types.h" + +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_thread(int irq, void *data); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); +extern int amd_iommu_init_devices(void); +extern void amd_iommu_uninit_devices(void); +extern void amd_iommu_init_notifier(void); +extern void amd_iommu_init_api(void); +#ifndef CONFIG_AMD_IOMMU_STATS + +static inline void amd_iommu_stats_init(void) { } + +#endif /* !CONFIG_AMD_IOMMU_STATS */ + +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ +	return (pdev->vendor == PCI_VENDOR_ID_ATI) && +	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + +static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) +{ +	if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) +		return false; + +	return !!(iommu->features & f); +} + +#endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */ diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h new file mode 100644 index 000000000000..5b9c5075e81a --- /dev/null +++ b/drivers/iommu/amd_iommu_types.h @@ -0,0 +1,585 @@ +/* + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + *         Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
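iommu_feature() in amd_iommu_proto.h above is a two-step test: the extended feature register only exists when bit IOMMU_CAP_EFR is set in the cached capability header, and only then is the 64-bit feature word masked against the requested bit. A reduced user-space sketch of that check follows; struct fake_iommu is a stand-in, and the two constants are copied from further down in amd_iommu_types.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IOMMU_CAP_EFR 27		/* capability bit: EFR register present */
#define FEATURE_GT    (1ULL << 4)	/* guest translation feature bit        */

struct fake_iommu {			/* reduced stand-in for struct amd_iommu */
	uint32_t cap;			/* cached PCI capability header          */
	uint64_t features;		/* cached MMIO_EXT_FEATURES value        */
};

/* Same two-step test as iommu_feature(): the extended feature word is only
 * meaningful when the capability header advertises it. */
static bool iommu_feature(struct fake_iommu *iommu, uint64_t f)
{
	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
		return false;
	return !!(iommu->features & f);
}

int main(void)
{
	struct fake_iommu iommu = { .cap = 1u << IOMMU_CAP_EFR,
				    .features = FEATURE_GT };

	printf("GT supported: %d\n", iommu_feature(&iommu, FEATURE_GT));
	return 0;
}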
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_TYPES_H +#define _ASM_X86_AMD_IOMMU_TYPES_H + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +/* + * Maximum number of IOMMUs supported + */ +#define MAX_IOMMUS	32 + +/* + * some size calculation constants + */ +#define DEV_TABLE_ENTRY_SIZE		32 +#define ALIAS_TABLE_ENTRY_SIZE		2 +#define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *)) + +/* Length of the MMIO region for the AMD IOMMU */ +#define MMIO_REGION_LENGTH       0x4000 + +/* Capability offsets used by the driver */ +#define MMIO_CAP_HDR_OFFSET	0x00 +#define MMIO_RANGE_OFFSET	0x0c +#define MMIO_MISC_OFFSET	0x10 + +/* Masks, shifts and macros to parse the device range capability */ +#define MMIO_RANGE_LD_MASK	0xff000000 +#define MMIO_RANGE_FD_MASK	0x00ff0000 +#define MMIO_RANGE_BUS_MASK	0x0000ff00 +#define MMIO_RANGE_LD_SHIFT	24 +#define MMIO_RANGE_FD_SHIFT	16 +#define MMIO_RANGE_BUS_SHIFT	8 +#define MMIO_GET_LD(x)  (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) +#define MMIO_GET_FD(x)  (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) +#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) +#define MMIO_MSI_NUM(x)	((x) & 0x1f) + +/* Flag masks for the AMD IOMMU exclusion range */ +#define MMIO_EXCL_ENABLE_MASK 0x01ULL +#define MMIO_EXCL_ALLOW_MASK  0x02ULL + +/* Used offsets into the MMIO space */ +#define MMIO_DEV_TABLE_OFFSET   0x0000 +#define MMIO_CMD_BUF_OFFSET     0x0008 +#define MMIO_EVT_BUF_OFFSET     0x0010 +#define MMIO_CONTROL_OFFSET     0x0018 +#define MMIO_EXCL_BASE_OFFSET   0x0020 +#define MMIO_EXCL_LIMIT_OFFSET  0x0028 +#define MMIO_EXT_FEATURES	0x0030 +#define MMIO_CMD_HEAD_OFFSET	0x2000 +#define MMIO_CMD_TAIL_OFFSET	0x2008 +#define MMIO_EVT_HEAD_OFFSET	0x2010 +#define MMIO_EVT_TAIL_OFFSET	0x2018 +#define MMIO_STATUS_OFFSET	0x2020 + + +/* Extended Feature Bits */ +#define FEATURE_PREFETCH	(1ULL<<0) +#define FEATURE_PPR		(1ULL<<1) +#define FEATURE_X2APIC		(1ULL<<2) +#define FEATURE_NX		(1ULL<<3) +#define FEATURE_GT		(1ULL<<4) +#define FEATURE_IA		(1ULL<<6) +#define FEATURE_GA		(1ULL<<7) +#define FEATURE_HE		(1ULL<<8) +#define FEATURE_PC		(1ULL<<9) + +/* MMIO status bits */ +#define MMIO_STATUS_COM_WAIT_INT_MASK	0x04 + +/* event logging constants */ +#define EVENT_ENTRY_SIZE	0x10 +#define EVENT_TYPE_SHIFT	28 +#define EVENT_TYPE_MASK		0xf +#define EVENT_TYPE_ILL_DEV	0x1 +#define EVENT_TYPE_IO_FAULT	0x2 +#define EVENT_TYPE_DEV_TAB_ERR	0x3 +#define EVENT_TYPE_PAGE_TAB_ERR	0x4 +#define EVENT_TYPE_ILL_CMD	0x5 +#define EVENT_TYPE_CMD_HARD_ERR	0x6 +#define EVENT_TYPE_IOTLB_INV_TO	0x7 +#define EVENT_TYPE_INV_DEV_REQ	0x8 +#define EVENT_DEVID_MASK	0xffff +#define EVENT_DEVID_SHIFT	0 +#define EVENT_DOMID_MASK	0xffff +#define EVENT_DOMID_SHIFT	0 +#define EVENT_FLAGS_MASK	0xfff +#define EVENT_FLAGS_SHIFT	0x10 + +/* feature control bits */ +#define CONTROL_IOMMU_EN        0x00ULL +#define CONTROL_HT_TUN_EN       0x01ULL +#define CONTROL_EVT_LOG_EN      0x02ULL +#define CONTROL_EVT_INT_EN      0x03ULL +#define CONTROL_COMWAIT_EN      0x04ULL +#define CONTROL_PASSPW_EN       0x08ULL +#define CONTROL_RESPASSPW_EN    0x09ULL +#define CONTROL_COHERENT_EN     0x0aULL +#define CONTROL_ISOC_EN         0x0bULL +#define CONTROL_CMDBUF_EN       0x0cULL +#define CONTROL_PPFLOG_EN       0x0dULL +#define 
CONTROL_PPFINT_EN       0x0eULL + +/* command specific defines */ +#define CMD_COMPL_WAIT          0x01 +#define CMD_INV_DEV_ENTRY       0x02 +#define CMD_INV_IOMMU_PAGES	0x03 +#define CMD_INV_IOTLB_PAGES	0x04 +#define CMD_INV_ALL		0x08 + +#define CMD_COMPL_WAIT_STORE_MASK	0x01 +#define CMD_COMPL_WAIT_INT_MASK		0x02 +#define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01 +#define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02 + +#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL + +/* macros and definitions for device table entries */ +#define DEV_ENTRY_VALID         0x00 +#define DEV_ENTRY_TRANSLATION   0x01 +#define DEV_ENTRY_IR            0x3d +#define DEV_ENTRY_IW            0x3e +#define DEV_ENTRY_NO_PAGE_FAULT	0x62 +#define DEV_ENTRY_EX            0x67 +#define DEV_ENTRY_SYSMGT1       0x68 +#define DEV_ENTRY_SYSMGT2       0x69 +#define DEV_ENTRY_INIT_PASS     0xb8 +#define DEV_ENTRY_EINT_PASS     0xb9 +#define DEV_ENTRY_NMI_PASS      0xba +#define DEV_ENTRY_LINT0_PASS    0xbe +#define DEV_ENTRY_LINT1_PASS    0xbf +#define DEV_ENTRY_MODE_MASK	0x07 +#define DEV_ENTRY_MODE_SHIFT	0x09 + +/* constants to configure the command buffer */ +#define CMD_BUFFER_SIZE    8192 +#define CMD_BUFFER_UNINITIALIZED 1 +#define CMD_BUFFER_ENTRIES 512 +#define MMIO_CMD_SIZE_SHIFT 56 +#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) + +/* constants for event buffer handling */ +#define EVT_BUFFER_SIZE		8192 /* 512 entries */ +#define EVT_LEN_MASK		(0x9ULL << 56) + +#define PAGE_MODE_NONE    0x00 +#define PAGE_MODE_1_LEVEL 0x01 +#define PAGE_MODE_2_LEVEL 0x02 +#define PAGE_MODE_3_LEVEL 0x03 +#define PAGE_MODE_4_LEVEL 0x04 +#define PAGE_MODE_5_LEVEL 0x05 +#define PAGE_MODE_6_LEVEL 0x06 + +#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9)) +#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \ +				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ +				   (0xffffffffffffffffULL)) +#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) +#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL) +#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \ +				 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL) + +#define PM_MAP_4k		0 +#define PM_ADDR_MASK		0x000ffffffffff000ULL +#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \ +				(~((1ULL << (12 + ((lvl) * 9))) - 1))) +#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr)) + +/* + * Returns the page table level to use for a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_LEVEL(pagesize) \ +		((__ffs(pagesize) - 12) / 9) +/* + * Returns the number of ptes to use for a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_PTE_COUNT(pagesize) \ +		(1ULL << ((__ffs(pagesize) - 12) % 9)) + +/* + * Aligns a given io-virtual address to a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_ALIGN(address, pagesize) \ +		((address) & ~((pagesize) - 1)) +/* + * Creates an IOMMU PTE for an address an a given pagesize + * The PTE has no permission bits set + * Pagesize is expected to be a power-of-two larger than 4096 + */ +#define PAGE_SIZE_PTE(address, pagesize)		\ +		(((address) | ((pagesize) - 1)) &	\ +		 (~(pagesize >> 1)) & PM_ADDR_MASK) + +/* + * Takes a PTE value with mode=0x07 and returns the page size it maps + */ +#define PTE_PAGE_SIZE(pte) \ +	(1ULL << (1 + ffz(((pte) | 0xfffULL)))) + +#define IOMMU_PTE_P  (1ULL << 0) +#define IOMMU_PTE_TV (1ULL << 1) +#define IOMMU_PTE_U  (1ULL << 59) +#define IOMMU_PTE_FC (1ULL << 60) +#define 
IOMMU_PTE_IR (1ULL << 61) +#define IOMMU_PTE_IW (1ULL << 62) + +#define DTE_FLAG_IOTLB	0x01 + +#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) +#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) +#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) +#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) + +#define IOMMU_PROT_MASK 0x03 +#define IOMMU_PROT_IR 0x01 +#define IOMMU_PROT_IW 0x02 + +/* IOMMU capabilities */ +#define IOMMU_CAP_IOTLB   24 +#define IOMMU_CAP_NPCACHE 26 +#define IOMMU_CAP_EFR     27 + +#define MAX_DOMAIN_ID 65536 + +/* FIXME: move this macro to <linux/pci.h> */ +#define PCI_BUS(x) (((x) >> 8) & 0xff) + +/* Protection domain flags */ +#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */ +#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops +					      domain for an IOMMU */ +#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page +					      translation */ + +extern bool amd_iommu_dump; +#define DUMP_printk(format, arg...)					\ +	do {								\ +		if (amd_iommu_dump)						\ +			printk(KERN_INFO "AMD-Vi: " format, ## arg);	\ +	} while(0); + +/* global flag if IOMMUs cache non-present entries */ +extern bool amd_iommu_np_cache; +/* Only true if all IOMMUs support device IOTLBs */ +extern bool amd_iommu_iotlb_sup; + +/* + * Make iterating over all IOMMUs easier + */ +#define for_each_iommu(iommu) \ +	list_for_each_entry((iommu), &amd_iommu_list, list) +#define for_each_iommu_safe(iommu, next) \ +	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) + +#define APERTURE_RANGE_SHIFT	27	/* 128 MB */ +#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT) +#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT) +#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */ +#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT) +#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL) + +/* + * This structure contains generic data for  IOMMU protection domains + * independent of their use. + */ +struct protection_domain { +	struct list_head list;  /* for list of all protection domains */ +	struct list_head dev_list; /* List of all devices in this domain */ +	spinlock_t lock;	/* mostly used to lock the page table*/ +	struct mutex api_lock;	/* protect page tables in the iommu-api path */ +	u16 id;			/* the domain id written to the device table */ +	int mode;		/* paging mode (0-6 levels) */ +	u64 *pt_root;		/* page table root pointer */ +	unsigned long flags;	/* flags to find out type of domain */ +	bool updated;		/* complete domain flush required */ +	unsigned dev_cnt;	/* devices assigned to this domain */ +	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ +	void *priv;		/* private data */ + +}; + +/* + * This struct contains device specific data for the IOMMU + */ +struct iommu_dev_data { +	struct list_head list;		  /* For domain->dev_list */ +	struct list_head dev_data_list;	  /* For global dev_data_list */ +	struct iommu_dev_data *alias_data;/* The alias dev_data */ +	struct protection_domain *domain; /* Domain the device is bound to */ +	atomic_t bind;			  /* Domain attach reverent count */ +	u16 devid;			  /* PCI Device ID */ +	struct { +		bool enabled; +		int qdep; +	} ats;				  /* ATS state */ +}; + +/* + * For dynamic growth the aperture size is split into ranges of 128MB of + * DMA address space each. This struct represents one such range. 
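The PAGE_SIZE_* and PTE_PAGE_SIZE macros a little further up implement AMD-Vi's variable page-size encoding: a mapping is expressed with one or more leaf PTEs at the chosen level, and the low address bits of the PTE are filled with ones so the mapped size can be recovered from the PTE itself. The stand-alone round trip below uses a hypothetical 32 KiB mapping at IOVA 0x40000000; __ffs()/ffz() are emulated with compiler builtins purely for illustration.

#include <stdio.h>

/* Stand-ins for the kernel bit helpers used by the macros above:
 * __ffs() = index of lowest set bit, ffz() = index of lowest clear bit. */
#define __ffs(x) ((unsigned long long)__builtin_ctzll(x))
#define ffz(x)   ((unsigned long long)__builtin_ctzll(~(unsigned long long)(x)))

#define PM_ADDR_MASK 0x000ffffffffff000ULL

#define PAGE_SIZE_LEVEL(pagesize)     ((__ffs(pagesize) - 12) / 9)
#define PAGE_SIZE_PTE_COUNT(pagesize) (1ULL << ((__ffs(pagesize) - 12) % 9))
#define PAGE_SIZE_PTE(address, pagesize)		\
		(((address) | ((pagesize) - 1)) &	\
		 (~((pagesize) >> 1)) & PM_ADDR_MASK)
#define PTE_PAGE_SIZE(pte) (1ULL << (1 + ffz(((pte) | 0xfffULL))))

int main(void)
{
	unsigned long long pagesize = 32 * 1024;	/* hypothetical 32 KiB mapping  */
	unsigned long long iova     = 0x40000000ULL;	/* hypothetical IO virtual addr */
	unsigned long long pte      = PAGE_SIZE_PTE(iova, pagesize);

	printf("level %llu, %llu leaf PTEs, pte=%#llx, decoded size %llu KiB\n",
	       PAGE_SIZE_LEVEL(pagesize), PAGE_SIZE_PTE_COUNT(pagesize),
	       pte, PTE_PAGE_SIZE(pte) / 1024);
	return 0;
}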
+ */ +struct aperture_range { + +	/* address allocation bitmap */ +	unsigned long *bitmap; + +	/* +	 * Array of PTE pages for the aperture. In this array we save all the +	 * leaf pages of the domain page table used for the aperture. This way +	 * we don't need to walk the page table to find a specific PTE. We can +	 * just calculate its address in constant time. +	 */ +	u64 *pte_pages[64]; + +	unsigned long offset; +}; + +/* + * Data container for a dma_ops specific protection domain + */ +struct dma_ops_domain { +	struct list_head list; + +	/* generic protection domain information */ +	struct protection_domain domain; + +	/* size of the aperture for the mappings */ +	unsigned long aperture_size; + +	/* address we start to search for free addresses */ +	unsigned long next_address; + +	/* address space relevant data */ +	struct aperture_range *aperture[APERTURE_MAX_RANGES]; + +	/* This will be set to true when TLB needs to be flushed */ +	bool need_flush; + +	/* +	 * if this is a preallocated domain, keep the device for which it was +	 * preallocated in this variable +	 */ +	u16 target_dev; +}; + +/* + * Structure where we save information about one hardware AMD IOMMU in the + * system. + */ +struct amd_iommu { +	struct list_head list; + +	/* Index within the IOMMU array */ +	int index; + +	/* locks the accesses to the hardware */ +	spinlock_t lock; + +	/* Pointer to PCI device of this IOMMU */ +	struct pci_dev *dev; + +	/* physical address of MMIO space */ +	u64 mmio_phys; +	/* virtual address of MMIO space */ +	u8 *mmio_base; + +	/* capabilities of that IOMMU read from ACPI */ +	u32 cap; + +	/* flags read from acpi table */ +	u8 acpi_flags; + +	/* Extended features */ +	u64 features; + +	/* +	 * Capability pointer. There could be more than one IOMMU per PCI +	 * device function if there are more than one AMD IOMMU capability +	 * pointers. +	 */ +	u16 cap_ptr; + +	/* pci domain of this IOMMU */ +	u16 pci_seg; + +	/* first device this IOMMU handles. read from PCI */ +	u16 first_device; +	/* last device this IOMMU handles. read from PCI */ +	u16 last_device; + +	/* start of exclusion range of that IOMMU */ +	u64 exclusion_start; +	/* length of exclusion range of that IOMMU */ +	u64 exclusion_length; + +	/* command buffer virtual address */ +	u8 *cmd_buf; +	/* size of command buffer */ +	u32 cmd_buf_size; + +	/* size of event buffer */ +	u32 evt_buf_size; +	/* event buffer virtual address */ +	u8 *evt_buf; +	/* MSI number for event interrupt */ +	u16 evt_msi_num; + +	/* true if interrupts for this IOMMU are already enabled */ +	bool int_enabled; + +	/* if one, we need to send a completion wait command */ +	bool need_sync; + +	/* default dma_ops domain for that IOMMU */ +	struct dma_ops_domain *default_dom; + +	/* +	 * We can't rely on the BIOS to restore all values on reinit, so we +	 * need to stash them +	 */ + +	/* The iommu BAR */ +	u32 stored_addr_lo; +	u32 stored_addr_hi; + +	/* +	 * Each iommu has 6 l1s, each of which is documented as having 0x12 +	 * registers +	 */ +	u32 stored_l1[6][0x12]; + +	/* The l2 indirect registers */ +	u32 stored_l2[0x83]; +}; + +/* + * List with all IOMMUs in the system. 
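The APERTURE_* macros above, together with the pte_pages[64] cache in struct aperture_range, let the dma_ops path locate the leaf page-table page for a DMA address in constant time: bits 27 and up select one of the 128 MB ranges, and bits 26..21 select one of the 64 cached 2 MB leaf pages within it. A quick stand-alone check with a made-up DMA address (the final 9-bit PTE index shown is the usual 4 KiB-page split and is included only for context):

#include <stdio.h>

#define APERTURE_RANGE_SHIFT	27	/* 128 MB per range */
#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)

int main(void)
{
	unsigned long long dma_addr = 0x10A41000ULL;	/* made-up DMA address */

	printf("addr %#llx: aperture[%llu], pte_pages[%llu], pte index %llu\n",
	       dma_addr, APERTURE_RANGE_INDEX(dma_addr),
	       APERTURE_PAGE_INDEX(dma_addr), (dma_addr >> 12) & 0x1ffULL);
	return 0;
}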
This list is not locked because it is + * only written and read at driver initialization or suspend time + */ +extern struct list_head amd_iommu_list; + +/* + * Array with pointers to each IOMMU struct + * The indices are referenced in the protection domains + */ +extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; + +/* Number of IOMMUs present in the system */ +extern int amd_iommus_present; + +/* + * Declarations for the global list of all protection domains + */ +extern spinlock_t amd_iommu_pd_lock; +extern struct list_head amd_iommu_pd_list; + +/* + * Structure defining one entry in the device table + */ +struct dev_table_entry { +	u32 data[8]; +}; + +/* + * One entry for unity mappings parsed out of the ACPI table. + */ +struct unity_map_entry { +	struct list_head list; + +	/* starting device id this entry is used for (including) */ +	u16 devid_start; +	/* end device id this entry is used for (including) */ +	u16 devid_end; + +	/* start address to unity map (including) */ +	u64 address_start; +	/* end address to unity map (including) */ +	u64 address_end; + +	/* required protection */ +	int prot; +}; + +/* + * List of all unity mappings. It is not locked because as runtime it is only + * read. It is created at ACPI table parsing time. + */ +extern struct list_head amd_iommu_unity_map; + +/* + * Data structures for device handling + */ + +/* + * Device table used by hardware. Read and write accesses by software are + * locked with the amd_iommu_pd_table lock. + */ +extern struct dev_table_entry *amd_iommu_dev_table; + +/* + * Alias table to find requestor ids to device ids. Not locked because only + * read on runtime. + */ +extern u16 *amd_iommu_alias_table; + +/* + * Reverse lookup table to find the IOMMU which translates a specific device. + */ +extern struct amd_iommu **amd_iommu_rlookup_table; + +/* size of the dma_ops aperture as power of 2 */ +extern unsigned amd_iommu_aperture_order; + +/* largest PCI device id we expect translation requests for */ +extern u16 amd_iommu_last_bdf; + +/* allocation bitmap for domain ids */ +extern unsigned long *amd_iommu_pd_alloc_bitmap; + +/* + * If true, the addresses will be flushed on unmap time, not when + * they are reused + */ +extern bool amd_iommu_unmap_flush; + +/* takes bus and device/function and returns the device id + * FIXME: should that be in generic PCI code? */ +static inline u16 calc_devid(u8 bus, u8 devfn) +{ +	return (((u16)bus) << 8) | devfn; +} + +#ifdef CONFIG_AMD_IOMMU_STATS + +struct __iommu_counter { +	char *name; +	struct dentry *dent; +	u64 value; +}; + +#define DECLARE_STATS_COUNTER(nm) \ +	static struct __iommu_counter nm = {	\ +		.name = #nm,			\ +	} + +#define INC_STATS_COUNTER(name)		name.value += 1 +#define ADD_STATS_COUNTER(name, x)	name.value += (x) +#define SUB_STATS_COUNTER(name, x)	name.value -= (x) + +#else /* CONFIG_AMD_IOMMU_STATS */ + +#define DECLARE_STATS_COUNTER(name) +#define INC_STATS_COUNTER(name) +#define ADD_STATS_COUNTER(name, x) +#define SUB_STATS_COUNTER(name, x) + +#endif /* CONFIG_AMD_IOMMU_STATS */ + +#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c new file mode 100644 index 000000000000..3dc9befa5aec --- /dev/null +++ b/drivers/iommu/dmar.c @@ -0,0 +1,1461 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
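Before moving on to the Intel side: calc_devid() in amd_iommu_types.h above packs the bus number and devfn into the 16-bit requestor ID that indexes the device, alias and rlookup tables. A small stand-alone illustration; PCI_SLOT/PCI_FUNC are redefined locally with their usual Linux encodings so the sketch compiles outside the kernel.

#include <stdint.h>
#include <stdio.h>

#define PCI_BUS(x)	(((x) >> 8) & 0xff)
#define PCI_SLOT(devfn)	(((devfn) >> 3) & 0x1f)	/* usual Linux encoding */
#define PCI_FUNC(devfn)	((devfn) & 0x07)

/* Same packing as calc_devid(): requestor ID = bus in the high byte, devfn low. */
static inline uint16_t calc_devid(uint8_t bus, uint8_t devfn)
{
	return (((uint16_t)bus) << 8) | devfn;
}

int main(void)
{
	uint8_t bus = 0x02, devfn = (0x1f << 3) | 0x3;	/* device 02:1f.3 */
	uint16_t devid = calc_devid(bus, devfn);

	printf("devid %#06x -> %02x:%02x.%x\n", devid, PCI_BUS(devid),
	       PCI_SLOT(devid & 0xff), PCI_FUNC(devid));
	return 0;
}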
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ashok.raj@intel.com> + * Author: Shaohua Li <shaohua.li@intel.com> + * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + * This file implements early detection/parsing of Remapping Devices + * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI + * tables. + * + * These routines are used by both DMA-remapping and Interrupt-remapping + */ + +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/iova.h> +#include <linux/intel-iommu.h> +#include <linux/timer.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/tboot.h> +#include <linux/dmi.h> +#include <linux/slab.h> +#include <asm/iommu_table.h> + +#define PREFIX "DMAR: " + +/* No locks are needed as DMA remapping hardware unit + * list is constructed at boot time and hotplug of + * these units are not supported by the architecture. + */ +LIST_HEAD(dmar_drhd_units); + +static struct acpi_table_header * __initdata dmar_tbl; +static acpi_size dmar_tbl_size; + +static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) +{ +	/* +	 * add INCLUDE_ALL at the tail, so scan the list will find it at +	 * the very end. +	 */ +	if (drhd->include_all) +		list_add_tail(&drhd->list, &dmar_drhd_units); +	else +		list_add(&drhd->list, &dmar_drhd_units); +} + +static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope, +					   struct pci_dev **dev, u16 segment) +{ +	struct pci_bus *bus; +	struct pci_dev *pdev = NULL; +	struct acpi_dmar_pci_path *path; +	int count; + +	bus = pci_find_bus(segment, scope->bus); +	path = (struct acpi_dmar_pci_path *)(scope + 1); +	count = (scope->length - sizeof(struct acpi_dmar_device_scope)) +		/ sizeof(struct acpi_dmar_pci_path); + +	while (count) { +		if (pdev) +			pci_dev_put(pdev); +		/* +		 * Some BIOSes list non-exist devices in DMAR table, just +		 * ignore it +		 */ +		if (!bus) { +			printk(KERN_WARNING +			PREFIX "Device scope bus [%d] not found\n", +			scope->bus); +			break; +		} +		pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn)); +		if (!pdev) { +			printk(KERN_WARNING PREFIX +			"Device scope device [%04x:%02x:%02x.%02x] not found\n", +				segment, bus->number, path->dev, path->fn); +			break; +		} +		path ++; +		count --; +		bus = pdev->subordinate; +	} +	if (!pdev) { +		printk(KERN_WARNING PREFIX +		"Device scope device [%04x:%02x:%02x.%02x] not found\n", +		segment, scope->bus, path->dev, path->fn); +		*dev = NULL; +		return 0; +	} +	if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && \ +			pdev->subordinate) || (scope->entry_type == \ +			ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) { +		pci_dev_put(pdev); +		printk(KERN_WARNING PREFIX +			"Device scope type does not match for %s\n", +			 pci_name(pdev)); +		return -EINVAL; +	} +	*dev = pdev; +	return 0; +} + +static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt, +				       struct pci_dev ***devices, u16 segment) +{ +	struct acpi_dmar_device_scope *scope; +	void * tmp = start; +	int index; +	
int ret; + +	*cnt = 0; +	while (start < end) { +		scope = start; +		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || +		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) +			(*cnt)++; +		else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC) { +			printk(KERN_WARNING PREFIX +			       "Unsupported device scope\n"); +		} +		start += scope->length; +	} +	if (*cnt == 0) +		return 0; + +	*devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL); +	if (!*devices) +		return -ENOMEM; + +	start = tmp; +	index = 0; +	while (start < end) { +		scope = start; +		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || +		    scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) { +			ret = dmar_parse_one_dev_scope(scope, +				&(*devices)[index], segment); +			if (ret) { +				kfree(*devices); +				return ret; +			} +			index ++; +		} +		start += scope->length; +	} + +	return 0; +} + +/** + * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition + * structure which uniquely represent one DMA remapping hardware unit + * present in the platform + */ +static int __init +dmar_parse_one_drhd(struct acpi_dmar_header *header) +{ +	struct acpi_dmar_hardware_unit *drhd; +	struct dmar_drhd_unit *dmaru; +	int ret = 0; + +	drhd = (struct acpi_dmar_hardware_unit *)header; +	dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL); +	if (!dmaru) +		return -ENOMEM; + +	dmaru->hdr = header; +	dmaru->reg_base_addr = drhd->address; +	dmaru->segment = drhd->segment; +	dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */ + +	ret = alloc_iommu(dmaru); +	if (ret) { +		kfree(dmaru); +		return ret; +	} +	dmar_register_drhd_unit(dmaru); +	return 0; +} + +static int __init dmar_parse_dev(struct dmar_drhd_unit *dmaru) +{ +	struct acpi_dmar_hardware_unit *drhd; +	int ret = 0; + +	drhd = (struct acpi_dmar_hardware_unit *) dmaru->hdr; + +	if (dmaru->include_all) +		return 0; + +	ret = dmar_parse_dev_scope((void *)(drhd + 1), +				((void *)drhd) + drhd->header.length, +				&dmaru->devices_cnt, &dmaru->devices, +				drhd->segment); +	if (ret) { +		list_del(&dmaru->list); +		kfree(dmaru); +	} +	return ret; +} + +#ifdef CONFIG_DMAR +LIST_HEAD(dmar_rmrr_units); + +static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr) +{ +	list_add(&rmrr->list, &dmar_rmrr_units); +} + + +static int __init +dmar_parse_one_rmrr(struct acpi_dmar_header *header) +{ +	struct acpi_dmar_reserved_memory *rmrr; +	struct dmar_rmrr_unit *rmrru; + +	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); +	if (!rmrru) +		return -ENOMEM; + +	rmrru->hdr = header; +	rmrr = (struct acpi_dmar_reserved_memory *)header; +	rmrru->base_address = rmrr->base_address; +	rmrru->end_address = rmrr->end_address; + +	dmar_register_rmrr_unit(rmrru); +	return 0; +} + +static int __init +rmrr_parse_dev(struct dmar_rmrr_unit *rmrru) +{ +	struct acpi_dmar_reserved_memory *rmrr; +	int ret; + +	rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr; +	ret = dmar_parse_dev_scope((void *)(rmrr + 1), +		((void *)rmrr) + rmrr->header.length, +		&rmrru->devices_cnt, &rmrru->devices, rmrr->segment); + +	if (ret || (rmrru->devices_cnt == 0)) { +		list_del(&rmrru->list); +		kfree(rmrru); +	} +	return ret; +} + +static LIST_HEAD(dmar_atsr_units); + +static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr) +{ +	struct acpi_dmar_atsr *atsr; +	struct dmar_atsr_unit *atsru; + +	atsr = container_of(hdr, struct acpi_dmar_atsr, header); +	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL); +	if (!atsru) +		return -ENOMEM; + +	atsru->hdr = hdr; +	
atsru->include_all = atsr->flags & 0x1; + +	list_add(&atsru->list, &dmar_atsr_units); + +	return 0; +} + +static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru) +{ +	int rc; +	struct acpi_dmar_atsr *atsr; + +	if (atsru->include_all) +		return 0; + +	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); +	rc = dmar_parse_dev_scope((void *)(atsr + 1), +				(void *)atsr + atsr->header.length, +				&atsru->devices_cnt, &atsru->devices, +				atsr->segment); +	if (rc || !atsru->devices_cnt) { +		list_del(&atsru->list); +		kfree(atsru); +	} + +	return rc; +} + +int dmar_find_matched_atsr_unit(struct pci_dev *dev) +{ +	int i; +	struct pci_bus *bus; +	struct acpi_dmar_atsr *atsr; +	struct dmar_atsr_unit *atsru; + +	dev = pci_physfn(dev); + +	list_for_each_entry(atsru, &dmar_atsr_units, list) { +		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); +		if (atsr->segment == pci_domain_nr(dev->bus)) +			goto found; +	} + +	return 0; + +found: +	for (bus = dev->bus; bus; bus = bus->parent) { +		struct pci_dev *bridge = bus->self; + +		if (!bridge || !pci_is_pcie(bridge) || +		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) +			return 0; + +		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) { +			for (i = 0; i < atsru->devices_cnt; i++) +				if (atsru->devices[i] == bridge) +					return 1; +			break; +		} +	} + +	if (atsru->include_all) +		return 1; + +	return 0; +} +#endif + +#ifdef CONFIG_ACPI_NUMA +static int __init +dmar_parse_one_rhsa(struct acpi_dmar_header *header) +{ +	struct acpi_dmar_rhsa *rhsa; +	struct dmar_drhd_unit *drhd; + +	rhsa = (struct acpi_dmar_rhsa *)header; +	for_each_drhd_unit(drhd) { +		if (drhd->reg_base_addr == rhsa->base_address) { +			int node = acpi_map_pxm_to_node(rhsa->proximity_domain); + +			if (!node_online(node)) +				node = -1; +			drhd->iommu->node = node; +			return 0; +		} +	} +	WARN_TAINT( +		1, TAINT_FIRMWARE_WORKAROUND, +		"Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n" +		"BIOS vendor: %s; Ver: %s; Product Version: %s\n", +		drhd->reg_base_addr, +		dmi_get_system_info(DMI_BIOS_VENDOR), +		dmi_get_system_info(DMI_BIOS_VERSION), +		dmi_get_system_info(DMI_PRODUCT_VERSION)); + +	return 0; +} +#endif + +static void __init +dmar_table_print_dmar_entry(struct acpi_dmar_header *header) +{ +	struct acpi_dmar_hardware_unit *drhd; +	struct acpi_dmar_reserved_memory *rmrr; +	struct acpi_dmar_atsr *atsr; +	struct acpi_dmar_rhsa *rhsa; + +	switch (header->type) { +	case ACPI_DMAR_TYPE_HARDWARE_UNIT: +		drhd = container_of(header, struct acpi_dmar_hardware_unit, +				    header); +		printk (KERN_INFO PREFIX +			"DRHD base: %#016Lx flags: %#x\n", +			(unsigned long long)drhd->address, drhd->flags); +		break; +	case ACPI_DMAR_TYPE_RESERVED_MEMORY: +		rmrr = container_of(header, struct acpi_dmar_reserved_memory, +				    header); +		printk (KERN_INFO PREFIX +			"RMRR base: %#016Lx end: %#016Lx\n", +			(unsigned long long)rmrr->base_address, +			(unsigned long long)rmrr->end_address); +		break; +	case ACPI_DMAR_TYPE_ATSR: +		atsr = container_of(header, struct acpi_dmar_atsr, header); +		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags); +		break; +	case ACPI_DMAR_HARDWARE_AFFINITY: +		rhsa = container_of(header, struct acpi_dmar_rhsa, header); +		printk(KERN_INFO PREFIX "RHSA base: %#016Lx proximity domain: %#x\n", +		       (unsigned long long)rhsa->base_address, +		       rhsa->proximity_domain); +		break; +	} +} + +/** + * dmar_table_detect - checks to see if the platform supports DMAR devices + */ +static int 
__init dmar_table_detect(void) +{ +	acpi_status status = AE_OK; + +	/* if we could find DMAR table, then there are DMAR devices */ +	status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0, +				(struct acpi_table_header **)&dmar_tbl, +				&dmar_tbl_size); + +	if (ACPI_SUCCESS(status) && !dmar_tbl) { +		printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); +		status = AE_NOT_FOUND; +	} + +	return (ACPI_SUCCESS(status) ? 1 : 0); +} + +/** + * parse_dmar_table - parses the DMA reporting table + */ +static int __init +parse_dmar_table(void) +{ +	struct acpi_table_dmar *dmar; +	struct acpi_dmar_header *entry_header; +	int ret = 0; + +	/* +	 * Do it again, earlier dmar_tbl mapping could be mapped with +	 * fixed map. +	 */ +	dmar_table_detect(); + +	/* +	 * ACPI tables may not be DMA protected by tboot, so use DMAR copy +	 * SINIT saved in SinitMleData in TXT heap (which is DMA protected) +	 */ +	dmar_tbl = tboot_get_dmar_table(dmar_tbl); + +	dmar = (struct acpi_table_dmar *)dmar_tbl; +	if (!dmar) +		return -ENODEV; + +	if (dmar->width < PAGE_SHIFT - 1) { +		printk(KERN_WARNING PREFIX "Invalid DMAR haw\n"); +		return -EINVAL; +	} + +	printk (KERN_INFO PREFIX "Host address width %d\n", +		dmar->width + 1); + +	entry_header = (struct acpi_dmar_header *)(dmar + 1); +	while (((unsigned long)entry_header) < +			(((unsigned long)dmar) + dmar_tbl->length)) { +		/* Avoid looping forever on bad ACPI tables */ +		if (entry_header->length == 0) { +			printk(KERN_WARNING PREFIX +				"Invalid 0-length structure\n"); +			ret = -EINVAL; +			break; +		} + +		dmar_table_print_dmar_entry(entry_header); + +		switch (entry_header->type) { +		case ACPI_DMAR_TYPE_HARDWARE_UNIT: +			ret = dmar_parse_one_drhd(entry_header); +			break; +		case ACPI_DMAR_TYPE_RESERVED_MEMORY: +#ifdef CONFIG_DMAR +			ret = dmar_parse_one_rmrr(entry_header); +#endif +			break; +		case ACPI_DMAR_TYPE_ATSR: +#ifdef CONFIG_DMAR +			ret = dmar_parse_one_atsr(entry_header); +#endif +			break; +		case ACPI_DMAR_HARDWARE_AFFINITY: +#ifdef CONFIG_ACPI_NUMA +			ret = dmar_parse_one_rhsa(entry_header); +#endif +			break; +		default: +			printk(KERN_WARNING PREFIX +				"Unknown DMAR structure type %d\n", +				entry_header->type); +			ret = 0; /* for forward compatibility */ +			break; +		} +		if (ret) +			break; + +		entry_header = ((void *)entry_header + entry_header->length); +	} +	return ret; +} + +static int dmar_pci_device_match(struct pci_dev *devices[], int cnt, +			  struct pci_dev *dev) +{ +	int index; + +	while (dev) { +		for (index = 0; index < cnt; index++) +			if (dev == devices[index]) +				return 1; + +		/* Check our parent */ +		dev = dev->bus->self; +	} + +	return 0; +} + +struct dmar_drhd_unit * +dmar_find_matched_drhd_unit(struct pci_dev *dev) +{ +	struct dmar_drhd_unit *dmaru = NULL; +	struct acpi_dmar_hardware_unit *drhd; + +	dev = pci_physfn(dev); + +	list_for_each_entry(dmaru, &dmar_drhd_units, list) { +		drhd = container_of(dmaru->hdr, +				    struct acpi_dmar_hardware_unit, +				    header); + +		if (dmaru->include_all && +		    drhd->segment == pci_domain_nr(dev->bus)) +			return dmaru; + +		if (dmar_pci_device_match(dmaru->devices, +					  dmaru->devices_cnt, dev)) +			return dmaru; +	} + +	return NULL; +} + +int __init dmar_dev_scope_init(void) +{ +	struct dmar_drhd_unit *drhd, *drhd_n; +	int ret = -ENODEV; + +	list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) { +		ret = dmar_parse_dev(drhd); +		if (ret) +			return ret; +	} + +#ifdef CONFIG_DMAR +	{ +		struct dmar_rmrr_unit *rmrr, *rmrr_n; +		struct dmar_atsr_unit 
*atsr, *atsr_n; + +		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) { +			ret = rmrr_parse_dev(rmrr); +			if (ret) +				return ret; +		} + +		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) { +			ret = atsr_parse_dev(atsr); +			if (ret) +				return ret; +		} +	} +#endif + +	return ret; +} + + +int __init dmar_table_init(void) +{ +	static int dmar_table_initialized; +	int ret; + +	if (dmar_table_initialized) +		return 0; + +	dmar_table_initialized = 1; + +	ret = parse_dmar_table(); +	if (ret) { +		if (ret != -ENODEV) +			printk(KERN_INFO PREFIX "parse DMAR table failure.\n"); +		return ret; +	} + +	if (list_empty(&dmar_drhd_units)) { +		printk(KERN_INFO PREFIX "No DMAR devices found\n"); +		return -ENODEV; +	} + +#ifdef CONFIG_DMAR +	if (list_empty(&dmar_rmrr_units)) +		printk(KERN_INFO PREFIX "No RMRR found\n"); + +	if (list_empty(&dmar_atsr_units)) +		printk(KERN_INFO PREFIX "No ATSR found\n"); +#endif + +	return 0; +} + +static void warn_invalid_dmar(u64 addr, const char *message) +{ +	WARN_TAINT_ONCE( +		1, TAINT_FIRMWARE_WORKAROUND, +		"Your BIOS is broken; DMAR reported at address %llx%s!\n" +		"BIOS vendor: %s; Ver: %s; Product Version: %s\n", +		addr, message, +		dmi_get_system_info(DMI_BIOS_VENDOR), +		dmi_get_system_info(DMI_BIOS_VERSION), +		dmi_get_system_info(DMI_PRODUCT_VERSION)); +} + +int __init check_zero_address(void) +{ +	struct acpi_table_dmar *dmar; +	struct acpi_dmar_header *entry_header; +	struct acpi_dmar_hardware_unit *drhd; + +	dmar = (struct acpi_table_dmar *)dmar_tbl; +	entry_header = (struct acpi_dmar_header *)(dmar + 1); + +	while (((unsigned long)entry_header) < +			(((unsigned long)dmar) + dmar_tbl->length)) { +		/* Avoid looping forever on bad ACPI tables */ +		if (entry_header->length == 0) { +			printk(KERN_WARNING PREFIX +				"Invalid 0-length structure\n"); +			return 0; +		} + +		if (entry_header->type == ACPI_DMAR_TYPE_HARDWARE_UNIT) { +			void __iomem *addr; +			u64 cap, ecap; + +			drhd = (void *)entry_header; +			if (!drhd->address) { +				warn_invalid_dmar(0, ""); +				goto failed; +			} + +			addr = early_ioremap(drhd->address, VTD_PAGE_SIZE); +			if (!addr ) { +				printk("IOMMU: can't validate: %llx\n", drhd->address); +				goto failed; +			} +			cap = dmar_readq(addr + DMAR_CAP_REG); +			ecap = dmar_readq(addr + DMAR_ECAP_REG); +			early_iounmap(addr, VTD_PAGE_SIZE); +			if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) { +				warn_invalid_dmar(drhd->address, +						  " returns all ones"); +				goto failed; +			} +		} + +		entry_header = ((void *)entry_header + entry_header->length); +	} +	return 1; + +failed: +#ifdef CONFIG_DMAR +	dmar_disabled = 1; +#endif +	return 0; +} + +int __init detect_intel_iommu(void) +{ +	int ret; + +	ret = dmar_table_detect(); +	if (ret) +		ret = check_zero_address(); +	{ +#ifdef CONFIG_INTR_REMAP +		struct acpi_table_dmar *dmar; + +		dmar = (struct acpi_table_dmar *) dmar_tbl; +		if (ret && cpu_has_x2apic && dmar->flags & 0x1) +			printk(KERN_INFO +			       "Queued invalidation will be enabled to support " +			       "x2apic and Intr-remapping.\n"); +#endif +#ifdef CONFIG_DMAR +		if (ret && !no_iommu && !iommu_detected && !dmar_disabled) { +			iommu_detected = 1; +			/* Make sure ACS will be enabled */ +			pci_request_acs(); +		} +#endif +#ifdef CONFIG_X86 +		if (ret) +			x86_init.iommu.iommu_init = intel_iommu_init; +#endif +	} +	early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); +	dmar_tbl = NULL; + +	return ret ? 
1 : -ENODEV; +} + + +int alloc_iommu(struct dmar_drhd_unit *drhd) +{ +	struct intel_iommu *iommu; +	int map_size; +	u32 ver; +	static int iommu_allocated = 0; +	int agaw = 0; +	int msagaw = 0; + +	if (!drhd->reg_base_addr) { +		warn_invalid_dmar(0, ""); +		return -EINVAL; +	} + +	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); +	if (!iommu) +		return -ENOMEM; + +	iommu->seq_id = iommu_allocated++; +	sprintf (iommu->name, "dmar%d", iommu->seq_id); + +	iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); +	if (!iommu->reg) { +		printk(KERN_ERR "IOMMU: can't map the region\n"); +		goto error; +	} +	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); +	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + +	if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { +		warn_invalid_dmar(drhd->reg_base_addr, " returns all ones"); +		goto err_unmap; +	} + +#ifdef CONFIG_DMAR +	agaw = iommu_calculate_agaw(iommu); +	if (agaw < 0) { +		printk(KERN_ERR +		       "Cannot get a valid agaw for iommu (seq_id = %d)\n", +		       iommu->seq_id); +		goto err_unmap; +	} +	msagaw = iommu_calculate_max_sagaw(iommu); +	if (msagaw < 0) { +		printk(KERN_ERR +			"Cannot get a valid max agaw for iommu (seq_id = %d)\n", +			iommu->seq_id); +		goto err_unmap; +	} +#endif +	iommu->agaw = agaw; +	iommu->msagaw = msagaw; + +	iommu->node = -1; + +	/* the registers might be more than one page */ +	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), +		cap_max_fault_reg_offset(iommu->cap)); +	map_size = VTD_PAGE_ALIGN(map_size); +	if (map_size > VTD_PAGE_SIZE) { +		iounmap(iommu->reg); +		iommu->reg = ioremap(drhd->reg_base_addr, map_size); +		if (!iommu->reg) { +			printk(KERN_ERR "IOMMU: can't map the region\n"); +			goto error; +		} +	} + +	ver = readl(iommu->reg + DMAR_VER_REG); +	pr_info("IOMMU %d: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", +		iommu->seq_id, +		(unsigned long long)drhd->reg_base_addr, +		DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), +		(unsigned long long)iommu->cap, +		(unsigned long long)iommu->ecap); + +	spin_lock_init(&iommu->register_lock); + +	drhd->iommu = iommu; +	return 0; + + err_unmap: +	iounmap(iommu->reg); + error: +	kfree(iommu); +	return -1; +} + +void free_iommu(struct intel_iommu *iommu) +{ +	if (!iommu) +		return; + +#ifdef CONFIG_DMAR +	free_dmar_iommu(iommu); +#endif + +	if (iommu->reg) +		iounmap(iommu->reg); +	kfree(iommu); +} + +/* + * Reclaim all the submitted descriptors which have completed its work. + */ +static inline void reclaim_free_desc(struct q_inval *qi) +{ +	while (qi->desc_status[qi->free_tail] == QI_DONE || +	       qi->desc_status[qi->free_tail] == QI_ABORT) { +		qi->desc_status[qi->free_tail] = QI_FREE; +		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; +		qi->free_cnt++; +	} +} + +static int qi_check_fault(struct intel_iommu *iommu, int index) +{ +	u32 fault; +	int head, tail; +	struct q_inval *qi = iommu->qi; +	int wait_index = (index + 1) % QI_LENGTH; + +	if (qi->desc_status[wait_index] == QI_ABORT) +		return -EAGAIN; + +	fault = readl(iommu->reg + DMAR_FSTS_REG); + +	/* +	 * If IQE happens, the head points to the descriptor associated +	 * with the error. No new descriptors are fetched until the IQE +	 * is cleared. 
+	 */ +	if (fault & DMA_FSTS_IQE) { +		head = readl(iommu->reg + DMAR_IQH_REG); +		if ((head >> DMAR_IQ_SHIFT) == index) { +			printk(KERN_ERR "VT-d detected invalid descriptor: " +				"low=%llx, high=%llx\n", +				(unsigned long long)qi->desc[index].low, +				(unsigned long long)qi->desc[index].high); +			memcpy(&qi->desc[index], &qi->desc[wait_index], +					sizeof(struct qi_desc)); +			__iommu_flush_cache(iommu, &qi->desc[index], +					sizeof(struct qi_desc)); +			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); +			return -EINVAL; +		} +	} + +	/* +	 * If ITE happens, all pending wait_desc commands are aborted. +	 * No new descriptors are fetched until the ITE is cleared. +	 */ +	if (fault & DMA_FSTS_ITE) { +		head = readl(iommu->reg + DMAR_IQH_REG); +		head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; +		head |= 1; +		tail = readl(iommu->reg + DMAR_IQT_REG); +		tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + +		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + +		do { +			if (qi->desc_status[head] == QI_IN_USE) +				qi->desc_status[head] = QI_ABORT; +			head = (head - 2 + QI_LENGTH) % QI_LENGTH; +		} while (head != tail); + +		if (qi->desc_status[wait_index] == QI_ABORT) +			return -EAGAIN; +	} + +	if (fault & DMA_FSTS_ICE) +		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + +	return 0; +} + +/* + * Submit the queued invalidation descriptor to the remapping + * hardware unit and wait for its completion. + */ +int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +{ +	int rc; +	struct q_inval *qi = iommu->qi; +	struct qi_desc *hw, wait_desc; +	int wait_index, index; +	unsigned long flags; + +	if (!qi) +		return 0; + +	hw = qi->desc; + +restart: +	rc = 0; + +	spin_lock_irqsave(&qi->q_lock, flags); +	while (qi->free_cnt < 3) { +		spin_unlock_irqrestore(&qi->q_lock, flags); +		cpu_relax(); +		spin_lock_irqsave(&qi->q_lock, flags); +	} + +	index = qi->free_head; +	wait_index = (index + 1) % QI_LENGTH; + +	qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + +	hw[index] = *desc; + +	wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) | +			QI_IWD_STATUS_WRITE | QI_IWD_TYPE; +	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]); + +	hw[wait_index] = wait_desc; + +	__iommu_flush_cache(iommu, &hw[index], sizeof(struct qi_desc)); +	__iommu_flush_cache(iommu, &hw[wait_index], sizeof(struct qi_desc)); + +	qi->free_head = (qi->free_head + 2) % QI_LENGTH; +	qi->free_cnt -= 2; + +	/* +	 * update the HW tail register indicating the presence of +	 * new descriptors. +	 */ +	writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG); + +	while (qi->desc_status[wait_index] != QI_DONE) { +		/* +		 * We will leave the interrupts disabled, to prevent interrupt +		 * context to queue another cmd while a cmd is already submitted +		 * and waiting for completion on this cpu. This is to avoid +		 * a deadlock where the interrupt context can wait indefinitely +		 * for free slots in the queue. +		 */ +		rc = qi_check_fault(iommu, index); +		if (rc) +			break; + +		spin_unlock(&qi->q_lock); +		cpu_relax(); +		spin_lock(&qi->q_lock); +	} + +	qi->desc_status[index] = QI_DONE; + +	reclaim_free_desc(qi); +	spin_unlock_irqrestore(&qi->q_lock, flags); + +	if (rc == -EAGAIN) +		goto restart; + +	return rc; +} + +/* + * Flush the global interrupt entry cache. 
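qi_submit_sync() above always queues two descriptors back to back: the invalidation request itself and a wait descriptor whose status word the CPU polls, which is why free_head advances by two and qi_check_fault() treats index+1 as the wait slot. The stand-alone sketch below replays just that index arithmetic plus the reclaim walk from reclaim_free_desc(); QI_LENGTH is assumed to be 256 here, the real constant lives in the intel-iommu header.

#include <stdio.h>

#define QI_LENGTH 256	/* assumed queue depth for this sketch */

enum { QI_FREE, QI_IN_USE, QI_DONE, QI_ABORT };

static int desc_status[QI_LENGTH];
static int free_head, free_tail, free_cnt = QI_LENGTH;

/* Reserve the two slots qi_submit_sync() needs: the request descriptor and
 * the wait descriptor that signals completion. Returns the request index. */
static int reserve_pair(void)
{
	int index = free_head;

	desc_status[index] = desc_status[(index + 1) % QI_LENGTH] = QI_IN_USE;
	free_head = (free_head + 2) % QI_LENGTH;
	free_cnt -= 2;
	return index;
}

/* Same walk as reclaim_free_desc(): hand back completed slots in order. */
static void reclaim(void)
{
	while (desc_status[free_tail] == QI_DONE ||
	       desc_status[free_tail] == QI_ABORT) {
		desc_status[free_tail] = QI_FREE;
		free_tail = (free_tail + 1) % QI_LENGTH;
		free_cnt++;
	}
}

int main(void)
{
	int index = reserve_pair();

	/* pretend the hardware completed both descriptors */
	desc_status[index] = desc_status[(index + 1) % QI_LENGTH] = QI_DONE;
	reclaim();
	printf("free_cnt back to %d of %d\n", free_cnt, QI_LENGTH);
	return 0;
}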
+ */ +void qi_global_iec(struct intel_iommu *iommu) +{ +	struct qi_desc desc; + +	desc.low = QI_IEC_TYPE; +	desc.high = 0; + +	/* should never fail */ +	qi_submit_sync(&desc, iommu); +} + +void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, +		      u64 type) +{ +	struct qi_desc desc; + +	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) +			| QI_CC_GRAN(type) | QI_CC_TYPE; +	desc.high = 0; + +	qi_submit_sync(&desc, iommu); +} + +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, +		    unsigned int size_order, u64 type) +{ +	u8 dw = 0, dr = 0; + +	struct qi_desc desc; +	int ih = 0; + +	if (cap_write_drain(iommu->cap)) +		dw = 1; + +	if (cap_read_drain(iommu->cap)) +		dr = 1; + +	desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) +		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; +	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) +		| QI_IOTLB_AM(size_order); + +	qi_submit_sync(&desc, iommu); +} + +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, +			u64 addr, unsigned mask) +{ +	struct qi_desc desc; + +	if (mask) { +		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1)); +		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1; +		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; +	} else +		desc.high = QI_DEV_IOTLB_ADDR(addr); + +	if (qdep >= QI_DEV_IOTLB_MAX_INVS) +		qdep = 0; + +	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | +		   QI_DIOTLB_TYPE; + +	qi_submit_sync(&desc, iommu); +} + +/* + * Disable Queued Invalidation interface. + */ +void dmar_disable_qi(struct intel_iommu *iommu) +{ +	unsigned long flags; +	u32 sts; +	cycles_t start_time = get_cycles(); + +	if (!ecap_qis(iommu->ecap)) +		return; + +	spin_lock_irqsave(&iommu->register_lock, flags); + +	sts =  dmar_readq(iommu->reg + DMAR_GSTS_REG); +	if (!(sts & DMA_GSTS_QIES)) +		goto end; + +	/* +	 * Give a chance to HW to complete the pending invalidation requests. +	 */ +	while ((readl(iommu->reg + DMAR_IQT_REG) != +		readl(iommu->reg + DMAR_IQH_REG)) && +		(DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) +		cpu_relax(); + +	iommu->gcmd &= ~DMA_GCMD_QIE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, +		      !(sts & DMA_GSTS_QIES), sts); +end: +	spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +/* + * Enable queued invalidation. + */ +static void __dmar_enable_qi(struct intel_iommu *iommu) +{ +	u32 sts; +	unsigned long flags; +	struct q_inval *qi = iommu->qi; + +	qi->free_head = qi->free_tail = 0; +	qi->free_cnt = QI_LENGTH; + +	spin_lock_irqsave(&iommu->register_lock, flags); + +	/* write zero to the tail reg */ +	writel(0, iommu->reg + DMAR_IQT_REG); + +	dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc)); + +	iommu->gcmd |= DMA_GCMD_QIE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts); + +	spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +/* + * Enable Queued Invalidation interface. This is a must to support + * interrupt-remapping. Also used by DMA-remapping, which replaces + * register based IOTLB invalidation. + */ +int dmar_enable_qi(struct intel_iommu *iommu) +{ +	struct q_inval *qi; +	struct page *desc_page; + +	if (!ecap_qis(iommu->ecap)) +		return -ENOENT; + +	/* +	 * queued invalidation is already setup and enabled. 
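In qi_flush_dev_iotlb() above, a multi-page invalidation is encoded entirely in the address field: the address must be aligned to the request size (the BUG_ON), and the low bits are then filled with ones up to half that size so the device can infer how large a range to invalidate. A stand-alone rendering of that transformation with made-up numbers; the VTD_PAGE_SHIFT of 12 matches the 4 KiB VT-d page size used throughout this driver.

#include <stdio.h>

#define VTD_PAGE_SHIFT 12	/* VT-d works in 4 KiB pages */

int main(void)
{
	unsigned long long addr = 0x100000ULL;	/* hypothetical, 16 KiB aligned */
	unsigned int mask = 2;			/* invalidate 1 << 2 = 4 pages  */

	/* Alignment rule enforced by the BUG_ON() above. */
	if (addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1)) {
		fprintf(stderr, "address not aligned to the request size\n");
		return 1;
	}

	/* Same transformation as qi_flush_dev_iotlb(): filling the low bits
	 * with ones tells the device how large the invalidated range is. */
	addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;

	printf("encoded address %#llx covers 4 pages starting at 0x100000\n", addr);
	return 0;
}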
+	 */ +	if (iommu->qi) +		return 0; + +	iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC); +	if (!iommu->qi) +		return -ENOMEM; + +	qi = iommu->qi; + + +	desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0); +	if (!desc_page) { +		kfree(qi); +		iommu->qi = 0; +		return -ENOMEM; +	} + +	qi->desc = page_address(desc_page); + +	qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC); +	if (!qi->desc_status) { +		free_page((unsigned long) qi->desc); +		kfree(qi); +		iommu->qi = 0; +		return -ENOMEM; +	} + +	qi->free_head = qi->free_tail = 0; +	qi->free_cnt = QI_LENGTH; + +	spin_lock_init(&qi->q_lock); + +	__dmar_enable_qi(iommu); + +	return 0; +} + +/* iommu interrupt handling. Most stuff are MSI-like. */ + +enum faulttype { +	DMA_REMAP, +	INTR_REMAP, +	UNKNOWN, +}; + +static const char *dma_remap_fault_reasons[] = +{ +	"Software", +	"Present bit in root entry is clear", +	"Present bit in context entry is clear", +	"Invalid context entry", +	"Access beyond MGAW", +	"PTE Write access is not set", +	"PTE Read access is not set", +	"Next page table ptr is invalid", +	"Root table address invalid", +	"Context table ptr is invalid", +	"non-zero reserved fields in RTP", +	"non-zero reserved fields in CTP", +	"non-zero reserved fields in PTE", +}; + +static const char *intr_remap_fault_reasons[] = +{ +	"Detected reserved fields in the decoded interrupt-remapped request", +	"Interrupt index exceeded the interrupt-remapping table size", +	"Present field in the IRTE entry is clear", +	"Error accessing interrupt-remapping table pointed by IRTA_REG", +	"Detected reserved fields in the IRTE entry", +	"Blocked a compatibility format interrupt request", +	"Blocked an interrupt request due to source-id verification failure", +}; + +#define MAX_FAULT_REASON_IDX 	(ARRAY_SIZE(fault_reason_strings) - 1) + +const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) +{ +	if (fault_reason >= 0x20 && (fault_reason <= 0x20 + +				     ARRAY_SIZE(intr_remap_fault_reasons))) { +		*fault_type = INTR_REMAP; +		return intr_remap_fault_reasons[fault_reason - 0x20]; +	} else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) { +		*fault_type = DMA_REMAP; +		return dma_remap_fault_reasons[fault_reason]; +	} else { +		*fault_type = UNKNOWN; +		return "Unknown"; +	} +} + +void dmar_msi_unmask(struct irq_data *data) +{ +	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); +	unsigned long flag; + +	/* unmask it */ +	spin_lock_irqsave(&iommu->register_lock, flag); +	writel(0, iommu->reg + DMAR_FECTL_REG); +	/* Read a reg to force flush the post write */ +	readl(iommu->reg + DMAR_FECTL_REG); +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_mask(struct irq_data *data) +{ +	unsigned long flag; +	struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); + +	/* mask it */ +	spin_lock_irqsave(&iommu->register_lock, flag); +	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG); +	/* Read a reg to force flush the post write */ +	readl(iommu->reg + DMAR_FECTL_REG); +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_write(int irq, struct msi_msg *msg) +{ +	struct intel_iommu *iommu = irq_get_handler_data(irq); +	unsigned long flag; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	writel(msg->data, iommu->reg + DMAR_FEDATA_REG); +	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG); +	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG); +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_read(int irq, struct 
msi_msg *msg) +{ +	struct intel_iommu *iommu = irq_get_handler_data(irq); +	unsigned long flag; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	msg->data = readl(iommu->reg + DMAR_FEDATA_REG); +	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG); +	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG); +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static int dmar_fault_do_one(struct intel_iommu *iommu, int type, +		u8 fault_reason, u16 source_id, unsigned long long addr) +{ +	const char *reason; +	int fault_type; + +	reason = dmar_get_fault_reason(fault_reason, &fault_type); + +	if (fault_type == INTR_REMAP) +		printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] " +		       "fault index %llx\n" +			"INTR-REMAP:[fault reason %02d] %s\n", +			(source_id >> 8), PCI_SLOT(source_id & 0xFF), +			PCI_FUNC(source_id & 0xFF), addr >> 48, +			fault_reason, reason); +	else +		printk(KERN_ERR +		       "DMAR:[%s] Request device [%02x:%02x.%d] " +		       "fault addr %llx \n" +		       "DMAR:[fault reason %02d] %s\n", +		       (type ? "DMA Read" : "DMA Write"), +		       (source_id >> 8), PCI_SLOT(source_id & 0xFF), +		       PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); +	return 0; +} + +#define PRIMARY_FAULT_REG_LEN (16) +irqreturn_t dmar_fault(int irq, void *dev_id) +{ +	struct intel_iommu *iommu = dev_id; +	int reg, fault_index; +	u32 fault_status; +	unsigned long flag; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	fault_status = readl(iommu->reg + DMAR_FSTS_REG); +	if (fault_status) +		printk(KERN_ERR "DRHD: handling fault status reg %x\n", +		       fault_status); + +	/* TBD: ignore advanced fault log currently */ +	if (!(fault_status & DMA_FSTS_PPF)) +		goto clear_rest; + +	fault_index = dma_fsts_fault_record_index(fault_status); +	reg = cap_fault_reg_offset(iommu->cap); +	while (1) { +		u8 fault_reason; +		u16 source_id; +		u64 guest_addr; +		int type; +		u32 data; + +		/* highest 32 bits */ +		data = readl(iommu->reg + reg + +				fault_index * PRIMARY_FAULT_REG_LEN + 12); +		if (!(data & DMA_FRCD_F)) +			break; + +		fault_reason = dma_frcd_fault_reason(data); +		type = dma_frcd_type(data); + +		data = readl(iommu->reg + reg + +				fault_index * PRIMARY_FAULT_REG_LEN + 8); +		source_id = dma_frcd_source_id(data); + +		guest_addr = dmar_readq(iommu->reg + reg + +				fault_index * PRIMARY_FAULT_REG_LEN); +		guest_addr = dma_frcd_page_addr(guest_addr); +		/* clear the fault */ +		writel(DMA_FRCD_F, iommu->reg + reg + +			fault_index * PRIMARY_FAULT_REG_LEN + 12); + +		spin_unlock_irqrestore(&iommu->register_lock, flag); + +		dmar_fault_do_one(iommu, type, fault_reason, +				source_id, guest_addr); + +		fault_index++; +		if (fault_index >= cap_num_fault_regs(iommu->cap)) +			fault_index = 0; +		spin_lock_irqsave(&iommu->register_lock, flag); +	} +clear_rest: +	/* clear all the other faults */ +	fault_status = readl(iommu->reg + DMAR_FSTS_REG); +	writel(fault_status, iommu->reg + DMAR_FSTS_REG); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); +	return IRQ_HANDLED; +} + +int dmar_set_interrupt(struct intel_iommu *iommu) +{ +	int irq, ret; + +	/* +	 * Check if the fault interrupt is already initialized. 
+	 */ +	if (iommu->irq) +		return 0; + +	irq = create_irq(); +	if (!irq) { +		printk(KERN_ERR "IOMMU: no free vectors\n"); +		return -EINVAL; +	} + +	irq_set_handler_data(irq, iommu); +	iommu->irq = irq; + +	ret = arch_setup_dmar_msi(irq); +	if (ret) { +		irq_set_handler_data(irq, NULL); +		iommu->irq = 0; +		destroy_irq(irq); +		return ret; +	} + +	ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu); +	if (ret) +		printk(KERN_ERR "IOMMU: can't request irq\n"); +	return ret; +} + +int __init enable_drhd_fault_handling(void) +{ +	struct dmar_drhd_unit *drhd; + +	/* +	 * Enable fault control interrupt. +	 */ +	for_each_drhd_unit(drhd) { +		int ret; +		struct intel_iommu *iommu = drhd->iommu; +		ret = dmar_set_interrupt(iommu); + +		if (ret) { +			printk(KERN_ERR "DRHD %Lx: failed to enable fault, " +			       " interrupt, ret %d\n", +			       (unsigned long long)drhd->reg_base_addr, ret); +			return -1; +		} + +		/* +		 * Clear any previous faults. +		 */ +		dmar_fault(iommu->irq, iommu); +	} + +	return 0; +} + +/* + * Re-enable Queued Invalidation interface. + */ +int dmar_reenable_qi(struct intel_iommu *iommu) +{ +	if (!ecap_qis(iommu->ecap)) +		return -ENOENT; + +	if (!iommu->qi) +		return -ENOENT; + +	/* +	 * First disable queued invalidation. +	 */ +	dmar_disable_qi(iommu); +	/* +	 * Then enable queued invalidation again. Since there is no pending +	 * invalidation requests now, it's safe to re-enable queued +	 * invalidation. +	 */ +	__dmar_enable_qi(iommu); + +	return 0; +} + +/* + * Check interrupt remapping support in DMAR table description. + */ +int __init dmar_ir_support(void) +{ +	struct acpi_table_dmar *dmar; +	dmar = (struct acpi_table_dmar *)dmar_tbl; +	if (!dmar) +		return 0; +	return dmar->flags & 0x1; +} +IOMMU_INIT_POST(detect_intel_iommu); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c new file mode 100644 index 000000000000..c621c98c99da --- /dev/null +++ b/drivers/iommu/intel-iommu.c @@ -0,0 +1,4016 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ashok.raj@intel.com> + * Author: Shaohua Li <shaohua.li@intel.com> + * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * Author: Fenghua Yu <fenghua.yu@intel.com> + */ + +#include <linux/init.h> +#include <linux/bitmap.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/dma-mapping.h> +#include <linux/mempool.h> +#include <linux/timer.h> +#include <linux/iova.h> +#include <linux/iommu.h> +#include <linux/intel-iommu.h> +#include <linux/syscore_ops.h> +#include <linux/tboot.h> +#include <linux/dmi.h> +#include <linux/pci-ats.h> +#include <asm/cacheflush.h> +#include <asm/iommu.h> + +#define ROOT_SIZE		VTD_PAGE_SIZE +#define CONTEXT_SIZE		VTD_PAGE_SIZE + +#define IS_BRIDGE_HOST_DEVICE(pdev) \ +			    ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST) +#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) +#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) + +#define IOAPIC_RANGE_START	(0xfee00000) +#define IOAPIC_RANGE_END	(0xfeefffff) +#define IOVA_START_ADDR		(0x1000) + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + +#define MAX_AGAW_WIDTH 64 + +#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1) +#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1) + +/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR +   to match. That way, we can use 'unsigned long' for PFNs with impunity. */ +#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \ +				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) +#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) + +#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT) +#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32)) +#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64)) + +/* page table handling */ +#define LEVEL_STRIDE		(9) +#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1) + +static inline int agaw_to_level(int agaw) +{ +	return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ +	return 30 + agaw * LEVEL_STRIDE; +} + +static inline int width_to_agaw(int width) +{ +	return (width - 30) / LEVEL_STRIDE; +} + +static inline unsigned int level_to_offset_bits(int level) +{ +	return (level - 1) * LEVEL_STRIDE; +} + +static inline int pfn_level_offset(unsigned long pfn, int level) +{ +	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; +} + +static inline unsigned long level_mask(int level) +{ +	return -1UL << level_to_offset_bits(level); +} + +static inline unsigned long level_size(int level) +{ +	return 1UL << level_to_offset_bits(level); +} + +static inline unsigned long align_to_level(unsigned long pfn, int level) +{ +	return (pfn + level_size(level) - 1) & level_mask(level); +} + +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ +	return  1 << ((lvl - 1) * LEVEL_STRIDE); +} + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things +   are never going to work. 
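The small helpers above convert between the three ways this driver describes page-table geometry: an address width in bits, the adjusted guest address width (AGAW) programmed into the context entry, and the number of page-table levels walked, each level consuming 9 bits of the page frame number. With the default 48-bit width that works out to agaw 2 and a 4-level table. A stand-alone check of that arithmetic (the page frame number is made up):

#include <stdio.h>

#define LEVEL_STRIDE	9
#define LEVEL_MASK	(((unsigned long long)1 << LEVEL_STRIDE) - 1)

static int width_to_agaw(int width)	{ return (width - 30) / LEVEL_STRIDE; }
static int agaw_to_level(int agaw)	{ return agaw + 2; }
static unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}
static int pfn_level_offset(unsigned long long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

int main(void)
{
	int width = 48;			/* DEFAULT_DOMAIN_ADDRESS_WIDTH */
	int agaw  = width_to_agaw(width);
	int level = agaw_to_level(agaw);
	unsigned long long pfn = 0x12345ULL;	/* hypothetical VT-d page frame */

	printf("width %d -> agaw %d -> %d-level page table\n", width, agaw, level);
	for (int l = level; l >= 1; l--)
		printf("  level %d index for pfn %#llx: %#x\n",
		       l, pfn, pfn_level_offset(pfn, l));
	return 0;
}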
*/ +static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) +{ +	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); +} + +static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) +{ +	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ +	return mm_to_dma_pfn(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ +	return page_to_dma_pfn(virt_to_page(p)); +} + +/* global iommu list, set NULL for ignored DMAR units */ +static struct intel_iommu **g_iommus; + +static void __init check_tylersburg_isoch(void); +static int rwbf_quirk; + +/* + * set to 1 to panic kernel if can't successfully enable VT-d + * (used when kernel is launched w/ TXT) + */ +static int force_on = 0; + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { +	u64	val; +	u64	rsvd1; +}; +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) +static inline bool root_present(struct root_entry *root) +{ +	return (root->val & 1); +} +static inline void set_root_present(struct root_entry *root) +{ +	root->val |= 1; +} +static inline void set_root_value(struct root_entry *root, unsigned long value) +{ +	root->val |= value & VTD_PAGE_MASK; +} + +static inline struct context_entry * +get_context_addr_from_root(struct root_entry *root) +{ +	return (struct context_entry *) +		(root_present(root)?phys_to_virt( +		root->val & VTD_PAGE_MASK) : +		NULL); +} + +/* + * low 64 bits: + * 0: present + * 1: fault processing disable + * 2-3: translation type + * 12-63: address space root + * high 64 bits: + * 0-2: address width + * 3-6: aval + * 8-23: domain id + */ +struct context_entry { +	u64 lo; +	u64 hi; +}; + +static inline bool context_present(struct context_entry *context) +{ +	return (context->lo & 1); +} +static inline void context_set_present(struct context_entry *context) +{ +	context->lo |= 1; +} + +static inline void context_set_fault_enable(struct context_entry *context) +{ +	context->lo &= (((u64)-1) << 2) | 1; +} + +static inline void context_set_translation_type(struct context_entry *context, +						unsigned long value) +{ +	context->lo &= (((u64)-1) << 4) | 3; +	context->lo |= (value & 3) << 2; +} + +static inline void context_set_address_root(struct context_entry *context, +					    unsigned long value) +{ +	context->lo |= value & VTD_PAGE_MASK; +} + +static inline void context_set_address_width(struct context_entry *context, +					     unsigned long value) +{ +	context->hi |= value & 7; +} + +static inline void context_set_domain_id(struct context_entry *context, +					 unsigned long value) +{ +	context->hi |= (value & ((1 << 16) - 1)) << 8; +} + +static inline void context_clear_entry(struct context_entry *context) +{ +	context->lo = 0; +	context->hi = 0; +} + +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-10: available + * 11: snoop behavior + * 12-63: Host physcial address + */ +struct dma_pte { +	u64 val; +}; + +static inline void dma_clear_pte(struct dma_pte *pte) +{ +	pte->val = 0; +} + +static inline void dma_set_pte_readable(struct dma_pte *pte) +{ +	pte->val |= DMA_PTE_READ; +} + +static inline void dma_set_pte_writable(struct dma_pte *pte) +{ +	pte->val |= DMA_PTE_WRITE; +} + +static inline void dma_set_pte_snp(struct dma_pte *pte) +{ +	pte->val |= DMA_PTE_SNP; +} + +static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) +{ +	pte->val = (pte->val & ~3) | (prot & 3); +} + +static inline 
u64 dma_pte_addr(struct dma_pte *pte) +{ +#ifdef CONFIG_64BIT +	return pte->val & VTD_PAGE_MASK; +#else +	/* Must have a full atomic 64-bit read */ +	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; +#endif +} + +static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn) +{ +	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT; +} + +static inline bool dma_pte_present(struct dma_pte *pte) +{ +	return (pte->val & 3) != 0; +} + +static inline int first_pte_in_page(struct dma_pte *pte) +{ +	return !((unsigned long)pte & ~VTD_PAGE_MASK); +} + +/* + * This domain is a statically identity mapping domain. + *	1. This domain creats a static 1:1 mapping to all usable memory. + * 	2. It maps to each iommu if successful. + *	3. Each iommu mapps to this domain if successful. + */ +static struct dmar_domain *si_domain; +static int hw_pass_through = 1; + +/* devices under the same p2p bridge are owned in one domain */ +#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) + +/* domain represents a virtual machine, more than one devices + * across iommus may be owned in one domain, e.g. kvm guest. + */ +#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1) + +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2) + +struct dmar_domain { +	int	id;			/* domain id */ +	int	nid;			/* node id */ +	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/ + +	struct list_head devices; 	/* all devices' list */ +	struct iova_domain iovad;	/* iova's that belong to this domain */ + +	struct dma_pte	*pgd;		/* virtual address */ +	int		gaw;		/* max guest address width */ + +	/* adjusted guest address width, 0 is level 2 30-bit */ +	int		agaw; + +	int		flags;		/* flags to find out type of domain */ + +	int		iommu_coherency;/* indicate coherency of iommu access */ +	int		iommu_snooping; /* indicate snooping control feature*/ +	int		iommu_count;	/* reference count of iommu */ +	int		iommu_superpage;/* Level of superpages supported: +					   0 == 4KiB (no superpages), 1 == 2MiB, +					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ +	spinlock_t	iommu_lock;	/* protect iommu set in domain */ +	u64		max_addr;	/* maximum mapped address */ +}; + +/* PCI domain-device relationship */ +struct device_domain_info { +	struct list_head link;	/* link to domain siblings */ +	struct list_head global; /* link to global list */ +	int segment;		/* PCI domain */ +	u8 bus;			/* PCI bus number */ +	u8 devfn;		/* PCI devfn number */ +	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */ +	struct intel_iommu *iommu; /* IOMMU used by this device */ +	struct dmar_domain *domain; /* pointer to domain */ +}; + +static void flush_unmaps_timeout(unsigned long data); + +DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0); + +#define HIGH_WATER_MARK 250 +struct deferred_flush_tables { +	int next; +	struct iova *iova[HIGH_WATER_MARK]; +	struct dmar_domain *domain[HIGH_WATER_MARK]; +}; + +static struct deferred_flush_tables *deferred_flush; + +/* bitmap for indexing intel_iommus */ +static int g_num_of_iommus; + +static DEFINE_SPINLOCK(async_umap_flush_lock); +static LIST_HEAD(unmaps_to_do); + +static int timer_on; +static long list_size; + +static void domain_remove_dev_info(struct dmar_domain *domain); + +#ifdef CONFIG_DMAR_DEFAULT_ON +int dmar_disabled = 0; +#else +int dmar_disabled = 1; +#endif /*CONFIG_DMAR_DEFAULT_ON*/ + +static int dmar_map_gfx = 1; +static int dmar_forcedac; +static int intel_iommu_strict; +static int intel_iommu_superpage = 1; + +#define DUMMY_DEVICE_DOMAIN_INFO ((struct 
device_domain_info *)(-1)) +static DEFINE_SPINLOCK(device_domain_lock); +static LIST_HEAD(device_domain_list); + +static struct iommu_ops intel_iommu_ops; + +static int __init intel_iommu_setup(char *str) +{ +	if (!str) +		return -EINVAL; +	while (*str) { +		if (!strncmp(str, "on", 2)) { +			dmar_disabled = 0; +			printk(KERN_INFO "Intel-IOMMU: enabled\n"); +		} else if (!strncmp(str, "off", 3)) { +			dmar_disabled = 1; +			printk(KERN_INFO "Intel-IOMMU: disabled\n"); +		} else if (!strncmp(str, "igfx_off", 8)) { +			dmar_map_gfx = 0; +			printk(KERN_INFO +				"Intel-IOMMU: disable GFX device mapping\n"); +		} else if (!strncmp(str, "forcedac", 8)) { +			printk(KERN_INFO +				"Intel-IOMMU: Forcing DAC for PCI devices\n"); +			dmar_forcedac = 1; +		} else if (!strncmp(str, "strict", 6)) { +			printk(KERN_INFO +				"Intel-IOMMU: disable batched IOTLB flush\n"); +			intel_iommu_strict = 1; +		} else if (!strncmp(str, "sp_off", 6)) { +			printk(KERN_INFO +				"Intel-IOMMU: disable supported super page\n"); +			intel_iommu_superpage = 0; +		} + +		str += strcspn(str, ","); +		while (*str == ',') +			str++; +	} +	return 0; +} +__setup("intel_iommu=", intel_iommu_setup); + +static struct kmem_cache *iommu_domain_cache; +static struct kmem_cache *iommu_devinfo_cache; +static struct kmem_cache *iommu_iova_cache; + +static inline void *alloc_pgtable_page(int node) +{ +	struct page *page; +	void *vaddr = NULL; + +	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); +	if (page) +		vaddr = page_address(page); +	return vaddr; +} + +static inline void free_pgtable_page(void *vaddr) +{ +	free_page((unsigned long)vaddr); +} + +static inline void *alloc_domain_mem(void) +{ +	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); +} + +static void free_domain_mem(void *vaddr) +{ +	kmem_cache_free(iommu_domain_cache, vaddr); +} + +static inline void * alloc_devinfo_mem(void) +{ +	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); +} + +static inline void free_devinfo_mem(void *vaddr) +{ +	kmem_cache_free(iommu_devinfo_cache, vaddr); +} + +struct iova *alloc_iova_mem(void) +{ +	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); +} + +void free_iova_mem(struct iova *iova) +{ +	kmem_cache_free(iommu_iova_cache, iova); +} + + +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) +{ +	unsigned long sagaw; +	int agaw = -1; + +	sagaw = cap_sagaw(iommu->cap); +	for (agaw = width_to_agaw(max_gaw); +	     agaw >= 0; agaw--) { +		if (test_bit(agaw, &sagaw)) +			break; +	} + +	return agaw; +} + +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ +	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ +	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + +/* This functionin only returns single iommu in a domain */ +static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) +{ +	int iommu_id; + +	/* si_domain and vm domain should not get here. 
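+	   Both may span more than one IOMMU unit, so there is no single
+	   "owning" IOMMU that could be returned for them.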
*/ +	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE); +	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY); + +	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); +	if (iommu_id < 0 || iommu_id >= g_num_of_iommus) +		return NULL; + +	return g_iommus[iommu_id]; +} + +static void domain_update_iommu_coherency(struct dmar_domain *domain) +{ +	int i; + +	domain->iommu_coherency = 1; + +	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { +		if (!ecap_coherent(g_iommus[i]->ecap)) { +			domain->iommu_coherency = 0; +			break; +		} +	} +} + +static void domain_update_iommu_snooping(struct dmar_domain *domain) +{ +	int i; + +	domain->iommu_snooping = 1; + +	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { +		if (!ecap_sc_support(g_iommus[i]->ecap)) { +			domain->iommu_snooping = 0; +			break; +		} +	} +} + +static void domain_update_iommu_superpage(struct dmar_domain *domain) +{ +	int i, mask = 0xf; + +	if (!intel_iommu_superpage) { +		domain->iommu_superpage = 0; +		return; +	} + +	domain->iommu_superpage = 4; /* 1TiB */ + +	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { +		mask |= cap_super_page_val(g_iommus[i]->cap); +		if (!mask) { +			break; +		} +	} +	domain->iommu_superpage = fls(mask); +} + +/* Some capabilities may be different across iommus */ +static void domain_update_iommu_cap(struct dmar_domain *domain) +{ +	domain_update_iommu_coherency(domain); +	domain_update_iommu_snooping(domain); +	domain_update_iommu_superpage(domain); +} + +static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn) +{ +	struct dmar_drhd_unit *drhd = NULL; +	int i; + +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; +		if (segment != drhd->segment) +			continue; + +		for (i = 0; i < drhd->devices_cnt; i++) { +			if (drhd->devices[i] && +			    drhd->devices[i]->bus->number == bus && +			    drhd->devices[i]->devfn == devfn) +				return drhd->iommu; +			if (drhd->devices[i] && +			    drhd->devices[i]->subordinate && +			    drhd->devices[i]->subordinate->number <= bus && +			    drhd->devices[i]->subordinate->subordinate >= bus) +				return drhd->iommu; +		} + +		if (drhd->include_all) +			return drhd->iommu; +	} + +	return NULL; +} + +static void domain_flush_cache(struct dmar_domain *domain, +			       void *addr, int size) +{ +	if (!domain->iommu_coherency) +		clflush_cache_range(addr, size); +} + +/* Gets context entry for a given bus and devfn */ +static struct context_entry * device_to_context_entry(struct intel_iommu *iommu, +		u8 bus, u8 devfn) +{ +	struct root_entry *root; +	struct context_entry *context; +	unsigned long phy_addr; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->lock, flags); +	root = &iommu->root_entry[bus]; +	context = get_context_addr_from_root(root); +	if (!context) { +		context = (struct context_entry *) +				alloc_pgtable_page(iommu->node); +		if (!context) { +			spin_unlock_irqrestore(&iommu->lock, flags); +			return NULL; +		} +		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); +		phy_addr = virt_to_phys((void *)context); +		set_root_value(root, phy_addr); +		set_root_present(root); +		__iommu_flush_cache(iommu, root, sizeof(*root)); +	} +	spin_unlock_irqrestore(&iommu->lock, flags); +	return &context[devfn]; +} + +static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ +	struct root_entry *root; +	struct context_entry *context; +	int ret; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->lock, flags); +	root = &iommu->root_entry[bus]; +	context = 
get_context_addr_from_root(root); +	if (!context) { +		ret = 0; +		goto out; +	} +	ret = context_present(&context[devfn]); +out: +	spin_unlock_irqrestore(&iommu->lock, flags); +	return ret; +} + +static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ +	struct root_entry *root; +	struct context_entry *context; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->lock, flags); +	root = &iommu->root_entry[bus]; +	context = get_context_addr_from_root(root); +	if (context) { +		context_clear_entry(&context[devfn]); +		__iommu_flush_cache(iommu, &context[devfn], \ +			sizeof(*context)); +	} +	spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void free_context_table(struct intel_iommu *iommu) +{ +	struct root_entry *root; +	int i; +	unsigned long flags; +	struct context_entry *context; + +	spin_lock_irqsave(&iommu->lock, flags); +	if (!iommu->root_entry) { +		goto out; +	} +	for (i = 0; i < ROOT_ENTRY_NR; i++) { +		root = &iommu->root_entry[i]; +		context = get_context_addr_from_root(root); +		if (context) +			free_pgtable_page(context); +	} +	free_pgtable_page(iommu->root_entry); +	iommu->root_entry = NULL; +out: +	spin_unlock_irqrestore(&iommu->lock, flags); +} + +static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, +				      unsigned long pfn, int large_level) +{ +	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; +	struct dma_pte *parent, *pte = NULL; +	int level = agaw_to_level(domain->agaw); +	int offset, target_level; + +	BUG_ON(!domain->pgd); +	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); +	parent = domain->pgd; + +	/* Search pte */ +	if (!large_level) +		target_level = 1; +	else +		target_level = large_level; + +	while (level > 0) { +		void *tmp_page; + +		offset = pfn_level_offset(pfn, level); +		pte = &parent[offset]; +		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE)) +			break; +		if (level == target_level) +			break; + +		if (!dma_pte_present(pte)) { +			uint64_t pteval; + +			tmp_page = alloc_pgtable_page(domain->nid); + +			if (!tmp_page) +				return NULL; + +			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); +			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; +			if (cmpxchg64(&pte->val, 0ULL, pteval)) { +				/* Someone else set it while we were thinking; use theirs. 
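+				   (cmpxchg64() hands back the old value, so a
+				   non-zero result means another CPU already
+				   installed a page-table page at this slot.)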
*/ +				free_pgtable_page(tmp_page); +			} else { +				dma_pte_addr(pte); +				domain_flush_cache(domain, pte, sizeof(*pte)); +			} +		} +		parent = phys_to_virt(dma_pte_addr(pte)); +		level--; +	} + +	return pte; +} + + +/* return address's pte at specific level */ +static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, +					 unsigned long pfn, +					 int level, int *large_page) +{ +	struct dma_pte *parent, *pte = NULL; +	int total = agaw_to_level(domain->agaw); +	int offset; + +	parent = domain->pgd; +	while (level <= total) { +		offset = pfn_level_offset(pfn, total); +		pte = &parent[offset]; +		if (level == total) +			return pte; + +		if (!dma_pte_present(pte)) { +			*large_page = total; +			break; +		} + +		if (pte->val & DMA_PTE_LARGE_PAGE) { +			*large_page = total; +			return pte; +		} + +		parent = phys_to_virt(dma_pte_addr(pte)); +		total--; +	} +	return NULL; +} + +/* clear last level pte, a tlb flush should be followed */ +static void dma_pte_clear_range(struct dmar_domain *domain, +				unsigned long start_pfn, +				unsigned long last_pfn) +{ +	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; +	unsigned int large_page = 1; +	struct dma_pte *first_pte, *pte; + +	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); +	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); +	BUG_ON(start_pfn > last_pfn); + +	/* we don't need lock here; nobody else touches the iova range */ +	do { +		large_page = 1; +		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); +		if (!pte) { +			start_pfn = align_to_level(start_pfn + 1, large_page + 1); +			continue; +		} +		do { +			dma_clear_pte(pte); +			start_pfn += lvl_to_nr_pages(large_page); +			pte++; +		} while (start_pfn <= last_pfn && !first_pte_in_page(pte)); + +		domain_flush_cache(domain, first_pte, +				   (void *)pte - (void *)first_pte); + +	} while (start_pfn && start_pfn <= last_pfn); +} + +/* free page table pages. 
last level pte should already be cleared */ +static void dma_pte_free_pagetable(struct dmar_domain *domain, +				   unsigned long start_pfn, +				   unsigned long last_pfn) +{ +	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; +	struct dma_pte *first_pte, *pte; +	int total = agaw_to_level(domain->agaw); +	int level; +	unsigned long tmp; +	int large_page = 2; + +	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); +	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); +	BUG_ON(start_pfn > last_pfn); + +	/* We don't need lock here; nobody else touches the iova range */ +	level = 2; +	while (level <= total) { +		tmp = align_to_level(start_pfn, level); + +		/* If we can't even clear one PTE at this level, we're done */ +		if (tmp + level_size(level) - 1 > last_pfn) +			return; + +		do { +			large_page = level; +			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page); +			if (large_page > level) +				level = large_page + 1; +			if (!pte) { +				tmp = align_to_level(tmp + 1, level + 1); +				continue; +			} +			do { +				if (dma_pte_present(pte)) { +					free_pgtable_page(phys_to_virt(dma_pte_addr(pte))); +					dma_clear_pte(pte); +				} +				pte++; +				tmp += level_size(level); +			} while (!first_pte_in_page(pte) && +				 tmp + level_size(level) - 1 <= last_pfn); + +			domain_flush_cache(domain, first_pte, +					   (void *)pte - (void *)first_pte); +			 +		} while (tmp && tmp + level_size(level) - 1 <= last_pfn); +		level++; +	} +	/* free pgd */ +	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { +		free_pgtable_page(domain->pgd); +		domain->pgd = NULL; +	} +} + +/* iommu handling */ +static int iommu_alloc_root_entry(struct intel_iommu *iommu) +{ +	struct root_entry *root; +	unsigned long flags; + +	root = (struct root_entry *)alloc_pgtable_page(iommu->node); +	if (!root) +		return -ENOMEM; + +	__iommu_flush_cache(iommu, root, ROOT_SIZE); + +	spin_lock_irqsave(&iommu->lock, flags); +	iommu->root_entry = root; +	spin_unlock_irqrestore(&iommu->lock, flags); + +	return 0; +} + +static void iommu_set_root_entry(struct intel_iommu *iommu) +{ +	void *addr; +	u32 sts; +	unsigned long flag; + +	addr = iommu->root_entry; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); + +	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (sts & DMA_GSTS_RTPS), sts); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static void iommu_flush_write_buffer(struct intel_iommu *iommu) +{ +	u32 val; +	unsigned long flag; + +	if (!rwbf_quirk && !cap_rwbf(iommu->cap)) +		return; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (!(val & DMA_GSTS_WBFS)), val); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static void __iommu_flush_context(struct intel_iommu *iommu, +				  u16 did, u16 source_id, u8 function_mask, +				  u64 type) +{ +	u64 val = 0; +	unsigned long flag; + +	switch (type) { +	case DMA_CCMD_GLOBAL_INVL: +		val = DMA_CCMD_GLOBAL_INVL; +		break; +	case DMA_CCMD_DOMAIN_INVL: +		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); +		break; +	case DMA_CCMD_DEVICE_INVL: +		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) +			| 
DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); +		break; +	default: +		BUG(); +	} +	val |= DMA_CCMD_ICC; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, +		dmar_readq, (!(val & DMA_CCMD_ICC)), val); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, +				u64 addr, unsigned int size_order, u64 type) +{ +	int tlb_offset = ecap_iotlb_offset(iommu->ecap); +	u64 val = 0, val_iva = 0; +	unsigned long flag; + +	switch (type) { +	case DMA_TLB_GLOBAL_FLUSH: +		/* global flush doesn't need set IVA_REG */ +		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; +		break; +	case DMA_TLB_DSI_FLUSH: +		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); +		break; +	case DMA_TLB_PSI_FLUSH: +		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); +		/* Note: always flush non-leaf currently */ +		val_iva = size_order | addr; +		break; +	default: +		BUG(); +	} +	/* Note: set drain read/write */ +#if 0 +	/* +	 * This is probably to be super secure.. Looks like we can +	 * ignore it without any impact. +	 */ +	if (cap_read_drain(iommu->cap)) +		val |= DMA_TLB_READ_DRAIN; +#endif +	if (cap_write_drain(iommu->cap)) +		val |= DMA_TLB_WRITE_DRAIN; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	/* Note: Only uses first TLB reg currently */ +	if (val_iva) +		dmar_writeq(iommu->reg + tlb_offset, val_iva); +	dmar_writeq(iommu->reg + tlb_offset + 8, val); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, tlb_offset + 8, +		dmar_readq, (!(val & DMA_TLB_IVT)), val); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); + +	/* check IOTLB invalidation granularity */ +	if (DMA_TLB_IAIG(val) == 0) +		printk(KERN_ERR"IOMMU: flush IOTLB failed\n"); +	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) +		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", +			(unsigned long long)DMA_TLB_IIRG(type), +			(unsigned long long)DMA_TLB_IAIG(val)); +} + +static struct device_domain_info *iommu_support_dev_iotlb( +	struct dmar_domain *domain, int segment, u8 bus, u8 devfn) +{ +	int found = 0; +	unsigned long flags; +	struct device_domain_info *info; +	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn); + +	if (!ecap_dev_iotlb_support(iommu->ecap)) +		return NULL; + +	if (!iommu->qi) +		return NULL; + +	spin_lock_irqsave(&device_domain_lock, flags); +	list_for_each_entry(info, &domain->devices, link) +		if (info->bus == bus && info->devfn == devfn) { +			found = 1; +			break; +		} +	spin_unlock_irqrestore(&device_domain_lock, flags); + +	if (!found || !info->dev) +		return NULL; + +	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS)) +		return NULL; + +	if (!dmar_find_matched_atsr_unit(info->dev)) +		return NULL; + +	info->iommu = iommu; + +	return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ +	if (!info) +		return; + +	pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ +	if (!info->dev || !pci_ats_enabled(info->dev)) +		return; + +	pci_disable_ats(info->dev); +} + +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, +				  u64 addr, unsigned mask) +{ +	u16 sid, qdep; +	unsigned long flags; +	struct device_domain_info *info; + +	spin_lock_irqsave(&device_domain_lock, flags); +	list_for_each_entry(info, 
&domain->devices, link) { +		if (!info->dev || !pci_ats_enabled(info->dev)) +			continue; + +		sid = info->bus << 8 | info->devfn; +		qdep = pci_ats_queue_depth(info->dev); +		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); +	} +	spin_unlock_irqrestore(&device_domain_lock, flags); +} + +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, +				  unsigned long pfn, unsigned int pages, int map) +{ +	unsigned int mask = ilog2(__roundup_pow_of_two(pages)); +	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; + +	BUG_ON(pages == 0); + +	/* +	 * Fallback to domain selective flush if no PSI support or the size is +	 * too big. +	 * PSI requires page size to be 2 ^ x, and the base address is naturally +	 * aligned to the size +	 */ +	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) +		iommu->flush.flush_iotlb(iommu, did, 0, 0, +						DMA_TLB_DSI_FLUSH); +	else +		iommu->flush.flush_iotlb(iommu, did, addr, mask, +						DMA_TLB_PSI_FLUSH); + +	/* +	 * In caching mode, changes of pages from non-present to present require +	 * flush. However, device IOTLB doesn't need to be flushed in this case. +	 */ +	if (!cap_caching_mode(iommu->cap) || !map) +		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); +} + +static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) +{ +	u32 pmen; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->register_lock, flags); +	pmen = readl(iommu->reg + DMAR_PMEN_REG); +	pmen &= ~DMA_PMEN_EPM; +	writel(pmen, iommu->reg + DMAR_PMEN_REG); + +	/* wait for the protected region status bit to clear */ +	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, +		readl, !(pmen & DMA_PMEN_PRS), pmen); + +	spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static int iommu_enable_translation(struct intel_iommu *iommu) +{ +	u32 sts; +	unsigned long flags; + +	spin_lock_irqsave(&iommu->register_lock, flags); +	iommu->gcmd |= DMA_GCMD_TE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (sts & DMA_GSTS_TES), sts); + +	spin_unlock_irqrestore(&iommu->register_lock, flags); +	return 0; +} + +static int iommu_disable_translation(struct intel_iommu *iommu) +{ +	u32 sts; +	unsigned long flag; + +	spin_lock_irqsave(&iommu->register_lock, flag); +	iommu->gcmd &= ~DMA_GCMD_TE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	/* Make sure hardware complete it */ +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (!(sts & DMA_GSTS_TES)), sts); + +	spin_unlock_irqrestore(&iommu->register_lock, flag); +	return 0; +} + + +static int iommu_init_domains(struct intel_iommu *iommu) +{ +	unsigned long ndomains; +	unsigned long nlongs; + +	ndomains = cap_ndoms(iommu->cap); +	pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id, +			ndomains); +	nlongs = BITS_TO_LONGS(ndomains); + +	spin_lock_init(&iommu->lock); + +	/* TBD: there might be 64K domains, +	 * consider other allocation for future chip +	 */ +	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); +	if (!iommu->domain_ids) { +		printk(KERN_ERR "Allocating domain id array failed\n"); +		return -ENOMEM; +	} +	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *), +			GFP_KERNEL); +	if (!iommu->domains) { +		printk(KERN_ERR "Allocating domain array failed\n"); +		return -ENOMEM; +	} + +	/* +	 * if Caching mode is set, then invalid translations are tagged +	 * with domainid 0. Hence we need to pre-allocate it. 
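+	 * (Caching Mode is typically reported by virtual/emulated VT-d
+	 * units; keeping domain id 0 reserved here means it can never be
+	 * handed out by find_first_zero_bit() in iommu_attach_domain().)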
+	 */ +	if (cap_caching_mode(iommu->cap)) +		set_bit(0, iommu->domain_ids); +	return 0; +} + + +static void domain_exit(struct dmar_domain *domain); +static void vm_domain_exit(struct dmar_domain *domain); + +void free_dmar_iommu(struct intel_iommu *iommu) +{ +	struct dmar_domain *domain; +	int i; +	unsigned long flags; + +	if ((iommu->domains) && (iommu->domain_ids)) { +		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) { +			domain = iommu->domains[i]; +			clear_bit(i, iommu->domain_ids); + +			spin_lock_irqsave(&domain->iommu_lock, flags); +			if (--domain->iommu_count == 0) { +				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) +					vm_domain_exit(domain); +				else +					domain_exit(domain); +			} +			spin_unlock_irqrestore(&domain->iommu_lock, flags); +		} +	} + +	if (iommu->gcmd & DMA_GCMD_TE) +		iommu_disable_translation(iommu); + +	if (iommu->irq) { +		irq_set_handler_data(iommu->irq, NULL); +		/* This will mask the irq */ +		free_irq(iommu->irq, iommu); +		destroy_irq(iommu->irq); +	} + +	kfree(iommu->domains); +	kfree(iommu->domain_ids); + +	g_iommus[iommu->seq_id] = NULL; + +	/* if all iommus are freed, free g_iommus */ +	for (i = 0; i < g_num_of_iommus; i++) { +		if (g_iommus[i]) +			break; +	} + +	if (i == g_num_of_iommus) +		kfree(g_iommus); + +	/* free context mapping */ +	free_context_table(iommu); +} + +static struct dmar_domain *alloc_domain(void) +{ +	struct dmar_domain *domain; + +	domain = alloc_domain_mem(); +	if (!domain) +		return NULL; + +	domain->nid = -1; +	memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); +	domain->flags = 0; + +	return domain; +} + +static int iommu_attach_domain(struct dmar_domain *domain, +			       struct intel_iommu *iommu) +{ +	int num; +	unsigned long ndomains; +	unsigned long flags; + +	ndomains = cap_ndoms(iommu->cap); + +	spin_lock_irqsave(&iommu->lock, flags); + +	num = find_first_zero_bit(iommu->domain_ids, ndomains); +	if (num >= ndomains) { +		spin_unlock_irqrestore(&iommu->lock, flags); +		printk(KERN_ERR "IOMMU: no free domain ids\n"); +		return -ENOMEM; +	} + +	domain->id = num; +	set_bit(num, iommu->domain_ids); +	set_bit(iommu->seq_id, &domain->iommu_bmp); +	iommu->domains[num] = domain; +	spin_unlock_irqrestore(&iommu->lock, flags); + +	return 0; +} + +static void iommu_detach_domain(struct dmar_domain *domain, +				struct intel_iommu *iommu) +{ +	unsigned long flags; +	int num, ndomains; +	int found = 0; + +	spin_lock_irqsave(&iommu->lock, flags); +	ndomains = cap_ndoms(iommu->cap); +	for_each_set_bit(num, iommu->domain_ids, ndomains) { +		if (iommu->domains[num] == domain) { +			found = 1; +			break; +		} +	} + +	if (found) { +		clear_bit(num, iommu->domain_ids); +		clear_bit(iommu->seq_id, &domain->iommu_bmp); +		iommu->domains[num] = NULL; +	} +	spin_unlock_irqrestore(&iommu->lock, flags); +} + +static struct iova_domain reserved_iova_list; +static struct lock_class_key reserved_rbtree_key; + +static int dmar_init_reserved_ranges(void) +{ +	struct pci_dev *pdev = NULL; +	struct iova *iova; +	int i; + +	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); + +	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, +		&reserved_rbtree_key); + +	/* IOAPIC ranges shouldn't be accessed by DMA */ +	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), +		IOVA_PFN(IOAPIC_RANGE_END)); +	if (!iova) { +		printk(KERN_ERR "Reserve IOAPIC range failed\n"); +		return -ENODEV; +	} + +	/* Reserve all PCI MMIO to avoid peer-to-peer access */ +	for_each_pci_dev(pdev) { +		struct resource *r; + +		for 
(i = 0; i < PCI_NUM_RESOURCES; i++) { +			r = &pdev->resource[i]; +			if (!r->flags || !(r->flags & IORESOURCE_MEM)) +				continue; +			iova = reserve_iova(&reserved_iova_list, +					    IOVA_PFN(r->start), +					    IOVA_PFN(r->end)); +			if (!iova) { +				printk(KERN_ERR "Reserve iova failed\n"); +				return -ENODEV; +			} +		} +	} +	return 0; +} + +static void domain_reserve_special_ranges(struct dmar_domain *domain) +{ +	copy_reserved_iova(&reserved_iova_list, &domain->iovad); +} + +static inline int guestwidth_to_adjustwidth(int gaw) +{ +	int agaw; +	int r = (gaw - 12) % 9; + +	if (r == 0) +		agaw = gaw; +	else +		agaw = gaw + 9 - r; +	if (agaw > 64) +		agaw = 64; +	return agaw; +} + +static int domain_init(struct dmar_domain *domain, int guest_width) +{ +	struct intel_iommu *iommu; +	int adjust_width, agaw; +	unsigned long sagaw; + +	init_iova_domain(&domain->iovad, DMA_32BIT_PFN); +	spin_lock_init(&domain->iommu_lock); + +	domain_reserve_special_ranges(domain); + +	/* calculate AGAW */ +	iommu = domain_get_iommu(domain); +	if (guest_width > cap_mgaw(iommu->cap)) +		guest_width = cap_mgaw(iommu->cap); +	domain->gaw = guest_width; +	adjust_width = guestwidth_to_adjustwidth(guest_width); +	agaw = width_to_agaw(adjust_width); +	sagaw = cap_sagaw(iommu->cap); +	if (!test_bit(agaw, &sagaw)) { +		/* hardware doesn't support it, choose a bigger one */ +		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw); +		agaw = find_next_bit(&sagaw, 5, agaw); +		if (agaw >= 5) +			return -ENODEV; +	} +	domain->agaw = agaw; +	INIT_LIST_HEAD(&domain->devices); + +	if (ecap_coherent(iommu->ecap)) +		domain->iommu_coherency = 1; +	else +		domain->iommu_coherency = 0; + +	if (ecap_sc_support(iommu->ecap)) +		domain->iommu_snooping = 1; +	else +		domain->iommu_snooping = 0; + +	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); +	domain->iommu_count = 1; +	domain->nid = iommu->node; + +	/* always allocate the top pgd */ +	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); +	if (!domain->pgd) +		return -ENOMEM; +	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE); +	return 0; +} + +static void domain_exit(struct dmar_domain *domain) +{ +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu; + +	/* Domain 0 is reserved, so dont process it */ +	if (!domain) +		return; + +	/* Flush any lazy unmaps that may reference this domain */ +	if (!intel_iommu_strict) +		flush_unmaps_timeout(0); + +	domain_remove_dev_info(domain); +	/* destroy iovas */ +	put_iova_domain(&domain->iovad); + +	/* clear ptes */ +	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + +	/* free page tables */ +	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + +	for_each_active_iommu(iommu, drhd) +		if (test_bit(iommu->seq_id, &domain->iommu_bmp)) +			iommu_detach_domain(domain, iommu); + +	free_domain_mem(domain); +} + +static int domain_context_mapping_one(struct dmar_domain *domain, int segment, +				 u8 bus, u8 devfn, int translation) +{ +	struct context_entry *context; +	unsigned long flags; +	struct intel_iommu *iommu; +	struct dma_pte *pgd; +	unsigned long num; +	unsigned long ndomains; +	int id; +	int agaw; +	struct device_domain_info *info = NULL; + +	pr_debug("Set context mapping for %02x:%02x.%d\n", +		bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + +	BUG_ON(!domain->pgd); +	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && +	       translation != CONTEXT_TT_MULTI_LEVEL); + +	iommu = device_to_iommu(segment, bus, devfn); +	if (!iommu) +		return -ENODEV; + +	context = 
device_to_context_entry(iommu, bus, devfn); +	if (!context) +		return -ENOMEM; +	spin_lock_irqsave(&iommu->lock, flags); +	if (context_present(context)) { +		spin_unlock_irqrestore(&iommu->lock, flags); +		return 0; +	} + +	id = domain->id; +	pgd = domain->pgd; + +	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || +	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) { +		int found = 0; + +		/* find an available domain id for this device in iommu */ +		ndomains = cap_ndoms(iommu->cap); +		for_each_set_bit(num, iommu->domain_ids, ndomains) { +			if (iommu->domains[num] == domain) { +				id = num; +				found = 1; +				break; +			} +		} + +		if (found == 0) { +			num = find_first_zero_bit(iommu->domain_ids, ndomains); +			if (num >= ndomains) { +				spin_unlock_irqrestore(&iommu->lock, flags); +				printk(KERN_ERR "IOMMU: no free domain ids\n"); +				return -EFAULT; +			} + +			set_bit(num, iommu->domain_ids); +			iommu->domains[num] = domain; +			id = num; +		} + +		/* Skip top levels of page tables for +		 * iommu which has less agaw than default. +		 * Unnecessary for PT mode. +		 */ +		if (translation != CONTEXT_TT_PASS_THROUGH) { +			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) { +				pgd = phys_to_virt(dma_pte_addr(pgd)); +				if (!dma_pte_present(pgd)) { +					spin_unlock_irqrestore(&iommu->lock, flags); +					return -ENOMEM; +				} +			} +		} +	} + +	context_set_domain_id(context, id); + +	if (translation != CONTEXT_TT_PASS_THROUGH) { +		info = iommu_support_dev_iotlb(domain, segment, bus, devfn); +		translation = info ? CONTEXT_TT_DEV_IOTLB : +				     CONTEXT_TT_MULTI_LEVEL; +	} +	/* +	 * In pass through mode, AW must be programmed to indicate the largest +	 * AGAW value supported by hardware. And ASR is ignored by hardware. +	 */ +	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) +		context_set_address_width(context, iommu->msagaw); +	else { +		context_set_address_root(context, virt_to_phys(pgd)); +		context_set_address_width(context, iommu->agaw); +	} + +	context_set_translation_type(context, translation); +	context_set_fault_enable(context); +	context_set_present(context); +	domain_flush_cache(domain, context, sizeof(*context)); + +	/* +	 * It's a non-present to present mapping. If hardware doesn't cache +	 * non-present entry we only need to flush the write-buffer. 
If the +	 * _does_ cache non-present entries, then it does so in the special +	 * domain #0, which we have to flush: +	 */ +	if (cap_caching_mode(iommu->cap)) { +		iommu->flush.flush_context(iommu, 0, +					   (((u16)bus) << 8) | devfn, +					   DMA_CCMD_MASK_NOBIT, +					   DMA_CCMD_DEVICE_INVL); +		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH); +	} else { +		iommu_flush_write_buffer(iommu); +	} +	iommu_enable_dev_iotlb(info); +	spin_unlock_irqrestore(&iommu->lock, flags); + +	spin_lock_irqsave(&domain->iommu_lock, flags); +	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) { +		domain->iommu_count++; +		if (domain->iommu_count == 1) +			domain->nid = iommu->node; +		domain_update_iommu_cap(domain); +	} +	spin_unlock_irqrestore(&domain->iommu_lock, flags); +	return 0; +} + +static int +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev, +			int translation) +{ +	int ret; +	struct pci_dev *tmp, *parent; + +	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), +					 pdev->bus->number, pdev->devfn, +					 translation); +	if (ret) +		return ret; + +	/* dependent device mapping */ +	tmp = pci_find_upstream_pcie_bridge(pdev); +	if (!tmp) +		return 0; +	/* Secondary interface's bus number and devfn 0 */ +	parent = pdev->bus->self; +	while (parent != tmp) { +		ret = domain_context_mapping_one(domain, +						 pci_domain_nr(parent->bus), +						 parent->bus->number, +						 parent->devfn, translation); +		if (ret) +			return ret; +		parent = parent->bus->self; +	} +	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */ +		return domain_context_mapping_one(domain, +					pci_domain_nr(tmp->subordinate), +					tmp->subordinate->number, 0, +					translation); +	else /* this is a legacy PCI bridge */ +		return domain_context_mapping_one(domain, +						  pci_domain_nr(tmp->bus), +						  tmp->bus->number, +						  tmp->devfn, +						  translation); +} + +static int domain_context_mapped(struct pci_dev *pdev) +{ +	int ret; +	struct pci_dev *tmp, *parent; +	struct intel_iommu *iommu; + +	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, +				pdev->devfn); +	if (!iommu) +		return -ENODEV; + +	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn); +	if (!ret) +		return ret; +	/* dependent device mapping */ +	tmp = pci_find_upstream_pcie_bridge(pdev); +	if (!tmp) +		return ret; +	/* Secondary interface's bus number and devfn 0 */ +	parent = pdev->bus->self; +	while (parent != tmp) { +		ret = device_context_mapped(iommu, parent->bus->number, +					    parent->devfn); +		if (!ret) +			return ret; +		parent = parent->bus->self; +	} +	if (pci_is_pcie(tmp)) +		return device_context_mapped(iommu, tmp->subordinate->number, +					     0); +	else +		return device_context_mapped(iommu, tmp->bus->number, +					     tmp->devfn); +} + +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, +					    size_t size) +{ +	host_addr &= ~PAGE_MASK; +	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} + +/* Return largest possible superpage level for a given mapping */ +static inline int hardware_largepage_caps(struct dmar_domain *domain, +					  unsigned long iov_pfn, +					  unsigned long phy_pfn, +					  unsigned long pages) +{ +	int support, level = 1; +	unsigned long pfnmerge; + +	support = domain->iommu_superpage; + +	/* To use a large page, the virtual *and* physical addresses +	   must be aligned to 2MiB/1GiB/etc. 
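+	   (With the 9-bit stride used here, a 2MiB page needs the low 9
+	   bits of both pfns clear, a 1GiB page the low 18 bits.)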
Lower bits set in either +	   of them will mean we have to use smaller pages. So just +	   merge them and check both at once. */ +	pfnmerge = iov_pfn | phy_pfn; + +	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { +		pages >>= VTD_STRIDE_SHIFT; +		if (!pages) +			break; +		pfnmerge >>= VTD_STRIDE_SHIFT; +		level++; +		support--; +	} +	return level; +} + +static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, +			    struct scatterlist *sg, unsigned long phys_pfn, +			    unsigned long nr_pages, int prot) +{ +	struct dma_pte *first_pte = NULL, *pte = NULL; +	phys_addr_t uninitialized_var(pteval); +	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; +	unsigned long sg_res; +	unsigned int largepage_lvl = 0; +	unsigned long lvl_pages = 0; + +	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); + +	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) +		return -EINVAL; + +	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + +	if (sg) +		sg_res = 0; +	else { +		sg_res = nr_pages + 1; +		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; +	} + +	while (nr_pages > 0) { +		uint64_t tmp; + +		if (!sg_res) { +			sg_res = aligned_nrpages(sg->offset, sg->length); +			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; +			sg->dma_length = sg->length; +			pteval = page_to_phys(sg_page(sg)) | prot; +			phys_pfn = pteval >> VTD_PAGE_SHIFT; +		} + +		if (!pte) { +			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); + +			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl); +			if (!pte) +				return -ENOMEM; +			/* It is large page*/ +			if (largepage_lvl > 1) +				pteval |= DMA_PTE_LARGE_PAGE; +			else +				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; + +		} +		/* We don't need lock here, nobody else +		 * touches the iova range +		 */ +		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); +		if (tmp) { +			static int dumps = 5; +			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", +			       iov_pfn, tmp, (unsigned long long)pteval); +			if (dumps) { +				dumps--; +				debug_dma_dump_mappings(NULL); +			} +			WARN_ON(1); +		} + +		lvl_pages = lvl_to_nr_pages(largepage_lvl); + +		BUG_ON(nr_pages < lvl_pages); +		BUG_ON(sg_res < lvl_pages); + +		nr_pages -= lvl_pages; +		iov_pfn += lvl_pages; +		phys_pfn += lvl_pages; +		pteval += lvl_pages * VTD_PAGE_SIZE; +		sg_res -= lvl_pages; + +		/* If the next PTE would be the first in a new page, then we +		   need to flush the cache on the entries we've just written. +		   And then we'll need to recalculate 'pte', so clear it and +		   let it get set again in the if (!pte) block above. + +		   If we're done (!nr_pages) we need to flush the cache too. + +		   Also if we've been setting superpages, we may need to +		   recalculate 'pte' and switch back to smaller pages for the +		   end of the mapping, if the trailing size is not enough to +		   use another superpage (i.e. sg_res < lvl_pages). 
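+		   In each of those cases pte is set back to NULL, so the
+		   next trip round the loop calls pfn_to_dma_pte() again and
+		   re-evaluates the usable superpage level.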
*/ +		pte++; +		if (!nr_pages || first_pte_in_page(pte) || +		    (largepage_lvl > 1 && sg_res < lvl_pages)) { +			domain_flush_cache(domain, first_pte, +					   (void *)pte - (void *)first_pte); +			pte = NULL; +		} + +		if (!sg_res && nr_pages) +			sg = sg_next(sg); +	} +	return 0; +} + +static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, +				    struct scatterlist *sg, unsigned long nr_pages, +				    int prot) +{ +	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); +} + +static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, +				     unsigned long phys_pfn, unsigned long nr_pages, +				     int prot) +{ +	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); +} + +static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ +	if (!iommu) +		return; + +	clear_context_table(iommu, bus, devfn); +	iommu->flush.flush_context(iommu, 0, 0, 0, +					   DMA_CCMD_GLOBAL_INVL); +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); +} + +static void domain_remove_dev_info(struct dmar_domain *domain) +{ +	struct device_domain_info *info; +	unsigned long flags; +	struct intel_iommu *iommu; + +	spin_lock_irqsave(&device_domain_lock, flags); +	while (!list_empty(&domain->devices)) { +		info = list_entry(domain->devices.next, +			struct device_domain_info, link); +		list_del(&info->link); +		list_del(&info->global); +		if (info->dev) +			info->dev->dev.archdata.iommu = NULL; +		spin_unlock_irqrestore(&device_domain_lock, flags); + +		iommu_disable_dev_iotlb(info); +		iommu = device_to_iommu(info->segment, info->bus, info->devfn); +		iommu_detach_dev(iommu, info->bus, info->devfn); +		free_devinfo_mem(info); + +		spin_lock_irqsave(&device_domain_lock, flags); +	} +	spin_unlock_irqrestore(&device_domain_lock, flags); +} + +/* + * find_domain + * Note: we use struct pci_dev->dev.archdata.iommu stores the info + */ +static struct dmar_domain * +find_domain(struct pci_dev *pdev) +{ +	struct device_domain_info *info; + +	/* No lock here, assumes no domain exit in normal case */ +	info = pdev->dev.archdata.iommu; +	if (info) +		return info->domain; +	return NULL; +} + +/* domain is initialized */ +static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) +{ +	struct dmar_domain *domain, *found = NULL; +	struct intel_iommu *iommu; +	struct dmar_drhd_unit *drhd; +	struct device_domain_info *info, *tmp; +	struct pci_dev *dev_tmp; +	unsigned long flags; +	int bus = 0, devfn = 0; +	int segment; +	int ret; + +	domain = find_domain(pdev); +	if (domain) +		return domain; + +	segment = pci_domain_nr(pdev->bus); + +	dev_tmp = pci_find_upstream_pcie_bridge(pdev); +	if (dev_tmp) { +		if (pci_is_pcie(dev_tmp)) { +			bus = dev_tmp->subordinate->number; +			devfn = 0; +		} else { +			bus = dev_tmp->bus->number; +			devfn = dev_tmp->devfn; +		} +		spin_lock_irqsave(&device_domain_lock, flags); +		list_for_each_entry(info, &device_domain_list, global) { +			if (info->segment == segment && +			    info->bus == bus && info->devfn == devfn) { +				found = info->domain; +				break; +			} +		} +		spin_unlock_irqrestore(&device_domain_lock, flags); +		/* pcie-pci bridge already has a domain, uses it */ +		if (found) { +			domain = found; +			goto found_domain; +		} +	} + +	domain = alloc_domain(); +	if (!domain) +		goto error; + +	/* Allocate new domain for the device */ +	drhd = dmar_find_matched_drhd_unit(pdev); +	if (!drhd) { +		printk(KERN_ERR "IOMMU: can't find DMAR for device 
%s\n", +			pci_name(pdev)); +		return NULL; +	} +	iommu = drhd->iommu; + +	ret = iommu_attach_domain(domain, iommu); +	if (ret) { +		free_domain_mem(domain); +		goto error; +	} + +	if (domain_init(domain, gaw)) { +		domain_exit(domain); +		goto error; +	} + +	/* register pcie-to-pci device */ +	if (dev_tmp) { +		info = alloc_devinfo_mem(); +		if (!info) { +			domain_exit(domain); +			goto error; +		} +		info->segment = segment; +		info->bus = bus; +		info->devfn = devfn; +		info->dev = NULL; +		info->domain = domain; +		/* This domain is shared by devices under p2p bridge */ +		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES; + +		/* pcie-to-pci bridge already has a domain, uses it */ +		found = NULL; +		spin_lock_irqsave(&device_domain_lock, flags); +		list_for_each_entry(tmp, &device_domain_list, global) { +			if (tmp->segment == segment && +			    tmp->bus == bus && tmp->devfn == devfn) { +				found = tmp->domain; +				break; +			} +		} +		if (found) { +			spin_unlock_irqrestore(&device_domain_lock, flags); +			free_devinfo_mem(info); +			domain_exit(domain); +			domain = found; +		} else { +			list_add(&info->link, &domain->devices); +			list_add(&info->global, &device_domain_list); +			spin_unlock_irqrestore(&device_domain_lock, flags); +		} +	} + +found_domain: +	info = alloc_devinfo_mem(); +	if (!info) +		goto error; +	info->segment = segment; +	info->bus = pdev->bus->number; +	info->devfn = pdev->devfn; +	info->dev = pdev; +	info->domain = domain; +	spin_lock_irqsave(&device_domain_lock, flags); +	/* somebody is fast */ +	found = find_domain(pdev); +	if (found != NULL) { +		spin_unlock_irqrestore(&device_domain_lock, flags); +		if (found != domain) { +			domain_exit(domain); +			domain = found; +		} +		free_devinfo_mem(info); +		return domain; +	} +	list_add(&info->link, &domain->devices); +	list_add(&info->global, &device_domain_list); +	pdev->dev.archdata.iommu = info; +	spin_unlock_irqrestore(&device_domain_lock, flags); +	return domain; +error: +	/* recheck it here, maybe others set it */ +	return find_domain(pdev); +} + +static int iommu_identity_mapping; +#define IDENTMAP_ALL		1 +#define IDENTMAP_GFX		2 +#define IDENTMAP_AZALIA		4 + +static int iommu_domain_identity_map(struct dmar_domain *domain, +				     unsigned long long start, +				     unsigned long long end) +{ +	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; +	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; + +	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), +			  dma_to_mm_pfn(last_vpfn))) { +		printk(KERN_ERR "IOMMU: reserve iova failed\n"); +		return -ENOMEM; +	} + +	pr_debug("Mapping reserved region %llx-%llx for domain %d\n", +		 start, end, domain->id); +	/* +	 * RMRR range might have overlap with physical memory range, +	 * clear it first +	 */ +	dma_pte_clear_range(domain, first_vpfn, last_vpfn); + +	return domain_pfn_mapping(domain, first_vpfn, first_vpfn, +				  last_vpfn - first_vpfn + 1, +				  DMA_PTE_READ|DMA_PTE_WRITE); +} + +static int iommu_prepare_identity_map(struct pci_dev *pdev, +				      unsigned long long start, +				      unsigned long long end) +{ +	struct dmar_domain *domain; +	int ret; + +	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); +	if (!domain) +		return -ENOMEM; + +	/* For _hardware_ passthrough, don't bother. 
But for software +	   passthrough, we do it anyway -- it may indicate a memory +	   range which is reserved in E820, so which didn't get set +	   up to start with in si_domain */ +	if (domain == si_domain && hw_pass_through) { +		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n", +		       pci_name(pdev), start, end); +		return 0; +	} + +	printk(KERN_INFO +	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", +	       pci_name(pdev), start, end); +	 +	if (end < start) { +		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n" +			"BIOS vendor: %s; Ver: %s; Product Version: %s\n", +			dmi_get_system_info(DMI_BIOS_VENDOR), +			dmi_get_system_info(DMI_BIOS_VERSION), +		     dmi_get_system_info(DMI_PRODUCT_VERSION)); +		ret = -EIO; +		goto error; +	} + +	if (end >> agaw_to_width(domain->agaw)) { +		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n" +		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n", +		     agaw_to_width(domain->agaw), +		     dmi_get_system_info(DMI_BIOS_VENDOR), +		     dmi_get_system_info(DMI_BIOS_VERSION), +		     dmi_get_system_info(DMI_PRODUCT_VERSION)); +		ret = -EIO; +		goto error; +	} + +	ret = iommu_domain_identity_map(domain, start, end); +	if (ret) +		goto error; + +	/* context entry init */ +	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); +	if (ret) +		goto error; + +	return 0; + + error: +	domain_exit(domain); +	return ret; +} + +static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, +	struct pci_dev *pdev) +{ +	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) +		return 0; +	return iommu_prepare_identity_map(pdev, rmrr->base_address, +		rmrr->end_address); +} + +#ifdef CONFIG_DMAR_FLOPPY_WA +static inline void iommu_prepare_isa(void) +{ +	struct pci_dev *pdev; +	int ret; + +	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); +	if (!pdev) +		return; + +	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n"); +	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1); + +	if (ret) +		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; " +		       "floppy might not work\n"); + +} +#else +static inline void iommu_prepare_isa(void) +{ +	return; +} +#endif /* !CONFIG_DMAR_FLPY_WA */ + +static int md_domain_init(struct dmar_domain *domain, int guest_width); + +static int __init si_domain_work_fn(unsigned long start_pfn, +				    unsigned long end_pfn, void *datax) +{ +	int *ret = datax; + +	*ret = iommu_domain_identity_map(si_domain, +					 (uint64_t)start_pfn << PAGE_SHIFT, +					 (uint64_t)end_pfn << PAGE_SHIFT); +	return *ret; + +} + +static int __init si_domain_init(int hw) +{ +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu; +	int nid, ret = 0; + +	si_domain = alloc_domain(); +	if (!si_domain) +		return -EFAULT; + +	pr_debug("Identity mapping domain is domain %d\n", si_domain->id); + +	for_each_active_iommu(iommu, drhd) { +		ret = iommu_attach_domain(si_domain, iommu); +		if (ret) { +			domain_exit(si_domain); +			return -EFAULT; +		} +	} + +	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { +		domain_exit(si_domain); +		return -EFAULT; +	} + +	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + +	if (hw) +		return 0; + +	for_each_online_node(nid) { +		work_with_active_regions(nid, si_domain_work_fn, &ret); +		if (ret) +			return ret; +	} + +	return 0; +} + +static void domain_remove_one_dev_info(struct dmar_domain *domain, +					  struct pci_dev *pdev); +static int 
identity_mapping(struct pci_dev *pdev) +{ +	struct device_domain_info *info; + +	if (likely(!iommu_identity_mapping)) +		return 0; + +	info = pdev->dev.archdata.iommu; +	if (info && info != DUMMY_DEVICE_DOMAIN_INFO) +		return (info->domain == si_domain); + +	return 0; +} + +static int domain_add_dev_info(struct dmar_domain *domain, +			       struct pci_dev *pdev, +			       int translation) +{ +	struct device_domain_info *info; +	unsigned long flags; +	int ret; + +	info = alloc_devinfo_mem(); +	if (!info) +		return -ENOMEM; + +	ret = domain_context_mapping(domain, pdev, translation); +	if (ret) { +		free_devinfo_mem(info); +		return ret; +	} + +	info->segment = pci_domain_nr(pdev->bus); +	info->bus = pdev->bus->number; +	info->devfn = pdev->devfn; +	info->dev = pdev; +	info->domain = domain; + +	spin_lock_irqsave(&device_domain_lock, flags); +	list_add(&info->link, &domain->devices); +	list_add(&info->global, &device_domain_list); +	pdev->dev.archdata.iommu = info; +	spin_unlock_irqrestore(&device_domain_lock, flags); + +	return 0; +} + +static int iommu_should_identity_map(struct pci_dev *pdev, int startup) +{ +	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) +		return 1; + +	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) +		return 1; + +	if (!(iommu_identity_mapping & IDENTMAP_ALL)) +		return 0; + +	/* +	 * We want to start off with all devices in the 1:1 domain, and +	 * take them out later if we find they can't access all of memory. +	 * +	 * However, we can't do this for PCI devices behind bridges, +	 * because all PCI devices behind the same bridge will end up +	 * with the same source-id on their transactions. +	 * +	 * Practically speaking, we can't change things around for these +	 * devices at run-time, because we can't be sure there'll be no +	 * DMA transactions in flight for any of their siblings. +	 *  +	 * So PCI devices (unless they're on the root bus) as well as +	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of +	 * the 1:1 domain, just in _case_ one of their siblings turns out +	 * not to be able to map all of memory. +	 */ +	if (!pci_is_pcie(pdev)) { +		if (!pci_is_root_bus(pdev->bus)) +			return 0; +		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) +			return 0; +	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) +		return 0; + +	/*  +	 * At boot time, we don't yet know if devices will be 64-bit capable. +	 * Assume that they will -- if they turn out not to be, then we can  +	 * take them out of the 1:1 domain later. +	 */ +	if (!startup) { +		/* +		 * If the device's dma_mask is less than the system's memory +		 * size then this is not a candidate for identity mapping. +		 */ +		u64 dma_mask = pdev->dma_mask; + +		if (pdev->dev.coherent_dma_mask && +		    pdev->dev.coherent_dma_mask < dma_mask) +			dma_mask = pdev->dev.coherent_dma_mask; + +		return dma_mask >= dma_get_required_mask(&pdev->dev); +	} + +	return 1; +} + +static int __init iommu_prepare_static_identity_mapping(int hw) +{ +	struct pci_dev *pdev = NULL; +	int ret; + +	ret = si_domain_init(hw); +	if (ret) +		return -EFAULT; + +	for_each_pci_dev(pdev) { +		/* Skip Host/PCI Bridge devices */ +		if (IS_BRIDGE_HOST_DEVICE(pdev)) +			continue; +		if (iommu_should_identity_map(pdev, 1)) { +			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", +			       hw ? "hardware" : "software", pci_name(pdev)); + +			ret = domain_add_dev_info(si_domain, pdev, +						     hw ? 
CONTEXT_TT_PASS_THROUGH : +						     CONTEXT_TT_MULTI_LEVEL); +			if (ret) +				return ret; +		} +	} + +	return 0; +} + +static int __init init_dmars(void) +{ +	struct dmar_drhd_unit *drhd; +	struct dmar_rmrr_unit *rmrr; +	struct pci_dev *pdev; +	struct intel_iommu *iommu; +	int i, ret; + +	/* +	 * for each drhd +	 *    allocate root +	 *    initialize and program root entry to not present +	 * endfor +	 */ +	for_each_drhd_unit(drhd) { +		g_num_of_iommus++; +		/* +		 * lock not needed as this is only incremented in the single +		 * threaded kernel __init code path all other access are read +		 * only +		 */ +	} + +	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), +			GFP_KERNEL); +	if (!g_iommus) { +		printk(KERN_ERR "Allocating global iommu array failed\n"); +		ret = -ENOMEM; +		goto error; +	} + +	deferred_flush = kzalloc(g_num_of_iommus * +		sizeof(struct deferred_flush_tables), GFP_KERNEL); +	if (!deferred_flush) { +		ret = -ENOMEM; +		goto error; +	} + +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; + +		iommu = drhd->iommu; +		g_iommus[iommu->seq_id] = iommu; + +		ret = iommu_init_domains(iommu); +		if (ret) +			goto error; + +		/* +		 * TBD: +		 * we could share the same root & context tables +		 * among all IOMMU's. Need to Split it later. +		 */ +		ret = iommu_alloc_root_entry(iommu); +		if (ret) { +			printk(KERN_ERR "IOMMU: allocate root entry failed\n"); +			goto error; +		} +		if (!ecap_pass_through(iommu->ecap)) +			hw_pass_through = 0; +	} + +	/* +	 * Start from the sane iommu hardware state. +	 */ +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; + +		iommu = drhd->iommu; + +		/* +		 * If the queued invalidation is already initialized by us +		 * (for example, while enabling interrupt-remapping) then +		 * we got the things already rolling from a sane state. +		 */ +		if (iommu->qi) +			continue; + +		/* +		 * Clear any previous faults. +		 */ +		dmar_fault(-1, iommu); +		/* +		 * Disable queued invalidation if supported and already enabled +		 * before OS handover. +		 */ +		dmar_disable_qi(iommu); +	} + +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; + +		iommu = drhd->iommu; + +		if (dmar_enable_qi(iommu)) { +			/* +			 * Queued Invalidate not enabled, use Register Based +			 * Invalidate +			 */ +			iommu->flush.flush_context = __iommu_flush_context; +			iommu->flush.flush_iotlb = __iommu_flush_iotlb; +			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based " +			       "invalidation\n", +				iommu->seq_id, +			       (unsigned long long)drhd->reg_base_addr); +		} else { +			iommu->flush.flush_context = qi_flush_context; +			iommu->flush.flush_iotlb = qi_flush_iotlb; +			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued " +			       "invalidation\n", +				iommu->seq_id, +			       (unsigned long long)drhd->reg_base_addr); +		} +	} + +	if (iommu_pass_through) +		iommu_identity_mapping |= IDENTMAP_ALL; + +#ifdef CONFIG_DMAR_BROKEN_GFX_WA +	iommu_identity_mapping |= IDENTMAP_GFX; +#endif + +	check_tylersburg_isoch(); + +	/* +	 * If pass through is not set or not enabled, setup context entries for +	 * identity mappings for rmrr, gfx, and isa and may fall back to static +	 * identity mapping if iommu_identity_mapping is set. 
+	 */ +	if (iommu_identity_mapping) { +		ret = iommu_prepare_static_identity_mapping(hw_pass_through); +		if (ret) { +			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n"); +			goto error; +		} +	} +	/* +	 * For each rmrr +	 *   for each dev attached to rmrr +	 *   do +	 *     locate drhd for dev, alloc domain for dev +	 *     allocate free domain +	 *     allocate page table entries for rmrr +	 *     if context not allocated for bus +	 *           allocate and init context +	 *           set present in root table for this bus +	 *     init context with domain, translation etc +	 *    endfor +	 * endfor +	 */ +	printk(KERN_INFO "IOMMU: Setting RMRR:\n"); +	for_each_rmrr_units(rmrr) { +		for (i = 0; i < rmrr->devices_cnt; i++) { +			pdev = rmrr->devices[i]; +			/* +			 * some BIOS lists non-exist devices in DMAR +			 * table. +			 */ +			if (!pdev) +				continue; +			ret = iommu_prepare_rmrr_dev(rmrr, pdev); +			if (ret) +				printk(KERN_ERR +				       "IOMMU: mapping reserved region failed\n"); +		} +	} + +	iommu_prepare_isa(); + +	/* +	 * for each drhd +	 *   enable fault log +	 *   global invalidate context cache +	 *   global invalidate iotlb +	 *   enable translation +	 */ +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) { +			/* +			 * we always have to disable PMRs or DMA may fail on +			 * this device +			 */ +			if (force_on) +				iommu_disable_protect_mem_regions(drhd->iommu); +			continue; +		} +		iommu = drhd->iommu; + +		iommu_flush_write_buffer(iommu); + +		ret = dmar_set_interrupt(iommu); +		if (ret) +			goto error; + +		iommu_set_root_entry(iommu); + +		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); +		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); + +		ret = iommu_enable_translation(iommu); +		if (ret) +			goto error; + +		iommu_disable_protect_mem_regions(iommu); +	} + +	return 0; +error: +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; +		iommu = drhd->iommu; +		free_iommu(iommu); +	} +	kfree(g_iommus); +	return ret; +} + +/* This takes a number of _MM_ pages, not VTD pages */ +static struct iova *intel_alloc_iova(struct device *dev, +				     struct dmar_domain *domain, +				     unsigned long nrpages, uint64_t dma_mask) +{ +	struct pci_dev *pdev = to_pci_dev(dev); +	struct iova *iova = NULL; + +	/* Restrict dma_mask to the width that the iommu can handle */ +	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + +	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { +		/* +		 * First try to allocate an io virtual address in +		 * DMA_BIT_MASK(32) and if that fails then try allocating +		 * from higher range +		 */ +		iova = alloc_iova(&domain->iovad, nrpages, +				  IOVA_PFN(DMA_BIT_MASK(32)), 1); +		if (iova) +			return iova; +	} +	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1); +	if (unlikely(!iova)) { +		printk(KERN_ERR "Allocating %ld-page iova for %s failed", +		       nrpages, pci_name(pdev)); +		return NULL; +	} + +	return iova; +} + +static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev) +{ +	struct dmar_domain *domain; +	int ret; + +	domain = get_domain_for_dev(pdev, +			DEFAULT_DOMAIN_ADDRESS_WIDTH); +	if (!domain) { +		printk(KERN_ERR +			"Allocating domain for %s failed", pci_name(pdev)); +		return NULL; +	} + +	/* make sure context mapping is ok */ +	if (unlikely(!domain_context_mapped(pdev))) { +		ret = domain_context_mapping(domain, pdev, +					     CONTEXT_TT_MULTI_LEVEL); +		if (ret) { +			printk(KERN_ERR +				"Domain context map for %s 
failed", +				pci_name(pdev)); +			return NULL; +		} +	} + +	return domain; +} + +static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev) +{ +	struct device_domain_info *info; + +	/* No lock here, assumes no domain exit in normal case */ +	info = dev->dev.archdata.iommu; +	if (likely(info)) +		return info->domain; + +	return __get_valid_domain_for_dev(dev); +} + +static int iommu_dummy(struct pci_dev *pdev) +{ +	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; +} + +/* Check if the pdev needs to go through non-identity map and unmap process.*/ +static int iommu_no_mapping(struct device *dev) +{ +	struct pci_dev *pdev; +	int found; + +	if (unlikely(dev->bus != &pci_bus_type)) +		return 1; + +	pdev = to_pci_dev(dev); +	if (iommu_dummy(pdev)) +		return 1; + +	if (!iommu_identity_mapping) +		return 0; + +	found = identity_mapping(pdev); +	if (found) { +		if (iommu_should_identity_map(pdev, 0)) +			return 1; +		else { +			/* +			 * 32 bit DMA is removed from si_domain and fall back +			 * to non-identity mapping. +			 */ +			domain_remove_one_dev_info(si_domain, pdev); +			printk(KERN_INFO "32bit %s uses non-identity mapping\n", +			       pci_name(pdev)); +			return 0; +		} +	} else { +		/* +		 * In case of a detached 64 bit DMA device from vm, the device +		 * is put into si_domain for identity mapping. +		 */ +		if (iommu_should_identity_map(pdev, 0)) { +			int ret; +			ret = domain_add_dev_info(si_domain, pdev, +						  hw_pass_through ? +						  CONTEXT_TT_PASS_THROUGH : +						  CONTEXT_TT_MULTI_LEVEL); +			if (!ret) { +				printk(KERN_INFO "64bit %s uses identity mapping\n", +				       pci_name(pdev)); +				return 1; +			} +		} +	} + +	return 0; +} + +static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, +				     size_t size, int dir, u64 dma_mask) +{ +	struct pci_dev *pdev = to_pci_dev(hwdev); +	struct dmar_domain *domain; +	phys_addr_t start_paddr; +	struct iova *iova; +	int prot = 0; +	int ret; +	struct intel_iommu *iommu; +	unsigned long paddr_pfn = paddr >> PAGE_SHIFT; + +	BUG_ON(dir == DMA_NONE); + +	if (iommu_no_mapping(hwdev)) +		return paddr; + +	domain = get_valid_domain_for_dev(pdev); +	if (!domain) +		return 0; + +	iommu = domain_get_iommu(domain); +	size = aligned_nrpages(paddr, size); + +	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask); +	if (!iova) +		goto error; + +	/* +	 * Check if DMAR supports zero-length reads on write only +	 * mappings.. +	 */ +	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ +			!cap_zlr(iommu->cap)) +		prot |= DMA_PTE_READ; +	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) +		prot |= DMA_PTE_WRITE; +	/* +	 * paddr - (paddr + size) might be partial page, we should map the whole +	 * page.  Note: if two part of one page are separately mapped, we +	 * might have two guest_addr mapping to the same host paddr, but this +	 * is not a big problem +	 */ +	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo), +				 mm_to_dma_pfn(paddr_pfn), size, prot); +	if (ret) +		goto error; + +	/* it's a non-present to present mapping. 
Only flush if caching mode */ +	if (cap_caching_mode(iommu->cap)) +		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1); +	else +		iommu_flush_write_buffer(iommu); + +	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; +	start_paddr += paddr & ~PAGE_MASK; +	return start_paddr; + +error: +	if (iova) +		__free_iova(&domain->iovad, iova); +	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n", +		pci_name(pdev), size, (unsigned long long)paddr, dir); +	return 0; +} + +static dma_addr_t intel_map_page(struct device *dev, struct page *page, +				 unsigned long offset, size_t size, +				 enum dma_data_direction dir, +				 struct dma_attrs *attrs) +{ +	return __intel_map_single(dev, page_to_phys(page) + offset, size, +				  dir, to_pci_dev(dev)->dma_mask); +} + +static void flush_unmaps(void) +{ +	int i, j; + +	timer_on = 0; + +	/* just flush them all */ +	for (i = 0; i < g_num_of_iommus; i++) { +		struct intel_iommu *iommu = g_iommus[i]; +		if (!iommu) +			continue; + +		if (!deferred_flush[i].next) +			continue; + +		/* In caching mode, global flushes turn emulation expensive */ +		if (!cap_caching_mode(iommu->cap)) +			iommu->flush.flush_iotlb(iommu, 0, 0, 0, +					 DMA_TLB_GLOBAL_FLUSH); +		for (j = 0; j < deferred_flush[i].next; j++) { +			unsigned long mask; +			struct iova *iova = deferred_flush[i].iova[j]; +			struct dmar_domain *domain = deferred_flush[i].domain[j]; + +			/* On real hardware multiple invalidations are expensive */ +			if (cap_caching_mode(iommu->cap)) +				iommu_flush_iotlb_psi(iommu, domain->id, +				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0); +			else { +				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1)); +				iommu_flush_dev_iotlb(deferred_flush[i].domain[j], +						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask); +			} +			__free_iova(&deferred_flush[i].domain[j]->iovad, iova); +		} +		deferred_flush[i].next = 0; +	} + +	list_size = 0; +} + +static void flush_unmaps_timeout(unsigned long data) +{ +	unsigned long flags; + +	spin_lock_irqsave(&async_umap_flush_lock, flags); +	flush_unmaps(); +	spin_unlock_irqrestore(&async_umap_flush_lock, flags); +} + +static void add_unmap(struct dmar_domain *dom, struct iova *iova) +{ +	unsigned long flags; +	int next, iommu_id; +	struct intel_iommu *iommu; + +	spin_lock_irqsave(&async_umap_flush_lock, flags); +	if (list_size == HIGH_WATER_MARK) +		flush_unmaps(); + +	iommu = domain_get_iommu(dom); +	iommu_id = iommu->seq_id; + +	next = deferred_flush[iommu_id].next; +	deferred_flush[iommu_id].domain[next] = dom; +	deferred_flush[iommu_id].iova[next] = iova; +	deferred_flush[iommu_id].next++; + +	if (!timer_on) { +		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10)); +		timer_on = 1; +	} +	list_size++; +	spin_unlock_irqrestore(&async_umap_flush_lock, flags); +} + +static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, +			     size_t size, enum dma_data_direction dir, +			     struct dma_attrs *attrs) +{ +	struct pci_dev *pdev = to_pci_dev(dev); +	struct dmar_domain *domain; +	unsigned long start_pfn, last_pfn; +	struct iova *iova; +	struct intel_iommu *iommu; + +	if (iommu_no_mapping(dev)) +		return; + +	domain = find_domain(pdev); +	BUG_ON(!domain); + +	iommu = domain_get_iommu(domain); + +	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); +	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n", +		      (unsigned long long)dev_addr)) +		return; + +	start_pfn = mm_to_dma_pfn(iova->pfn_lo); +	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 
1; + +	pr_debug("Device %s unmapping: pfn %lx-%lx\n", +		 pci_name(pdev), start_pfn, last_pfn); + +	/*  clear the whole page */ +	dma_pte_clear_range(domain, start_pfn, last_pfn); + +	/* free page tables */ +	dma_pte_free_pagetable(domain, start_pfn, last_pfn); + +	if (intel_iommu_strict) { +		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, +				      last_pfn - start_pfn + 1, 0); +		/* free iova */ +		__free_iova(&domain->iovad, iova); +	} else { +		add_unmap(domain, iova); +		/* +		 * queue up the release of the unmap to save the 1/6th of the +		 * cpu used up by the iotlb flush operation... +		 */ +	} +} + +static void *intel_alloc_coherent(struct device *hwdev, size_t size, +				  dma_addr_t *dma_handle, gfp_t flags) +{ +	void *vaddr; +	int order; + +	size = PAGE_ALIGN(size); +	order = get_order(size); + +	if (!iommu_no_mapping(hwdev)) +		flags &= ~(GFP_DMA | GFP_DMA32); +	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) { +		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32)) +			flags |= GFP_DMA; +		else +			flags |= GFP_DMA32; +	} + +	vaddr = (void *)__get_free_pages(flags, order); +	if (!vaddr) +		return NULL; +	memset(vaddr, 0, size); + +	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size, +					 DMA_BIDIRECTIONAL, +					 hwdev->coherent_dma_mask); +	if (*dma_handle) +		return vaddr; +	free_pages((unsigned long)vaddr, order); +	return NULL; +} + +static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, +				dma_addr_t dma_handle) +{ +	int order; + +	size = PAGE_ALIGN(size); +	order = get_order(size); + +	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); +	free_pages((unsigned long)vaddr, order); +} + +static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, +			   int nelems, enum dma_data_direction dir, +			   struct dma_attrs *attrs) +{ +	struct pci_dev *pdev = to_pci_dev(hwdev); +	struct dmar_domain *domain; +	unsigned long start_pfn, last_pfn; +	struct iova *iova; +	struct intel_iommu *iommu; + +	if (iommu_no_mapping(hwdev)) +		return; + +	domain = find_domain(pdev); +	BUG_ON(!domain); + +	iommu = domain_get_iommu(domain); + +	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address)); +	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n", +		      (unsigned long long)sglist[0].dma_address)) +		return; + +	start_pfn = mm_to_dma_pfn(iova->pfn_lo); +	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; + +	/*  clear the whole page */ +	dma_pte_clear_range(domain, start_pfn, last_pfn); + +	/* free page tables */ +	dma_pte_free_pagetable(domain, start_pfn, last_pfn); + +	if (intel_iommu_strict) { +		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn, +				      last_pfn - start_pfn + 1, 0); +		/* free iova */ +		__free_iova(&domain->iovad, iova); +	} else { +		add_unmap(domain, iova); +		/* +		 * queue up the release of the unmap to save the 1/6th of the +		 * cpu used up by the iotlb flush operation... 
+		 */ +	} +} + +static int intel_nontranslate_map_sg(struct device *hddev, +	struct scatterlist *sglist, int nelems, int dir) +{ +	int i; +	struct scatterlist *sg; + +	for_each_sg(sglist, sg, nelems, i) { +		BUG_ON(!sg_page(sg)); +		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset; +		sg->dma_length = sg->length; +	} +	return nelems; +} + +static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, +			enum dma_data_direction dir, struct dma_attrs *attrs) +{ +	int i; +	struct pci_dev *pdev = to_pci_dev(hwdev); +	struct dmar_domain *domain; +	size_t size = 0; +	int prot = 0; +	struct iova *iova = NULL; +	int ret; +	struct scatterlist *sg; +	unsigned long start_vpfn; +	struct intel_iommu *iommu; + +	BUG_ON(dir == DMA_NONE); +	if (iommu_no_mapping(hwdev)) +		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); + +	domain = get_valid_domain_for_dev(pdev); +	if (!domain) +		return 0; + +	iommu = domain_get_iommu(domain); + +	for_each_sg(sglist, sg, nelems, i) +		size += aligned_nrpages(sg->offset, sg->length); + +	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), +				pdev->dma_mask); +	if (!iova) { +		sglist->dma_length = 0; +		return 0; +	} + +	/* +	 * Check if DMAR supports zero-length reads on write only +	 * mappings.. +	 */ +	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ +			!cap_zlr(iommu->cap)) +		prot |= DMA_PTE_READ; +	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) +		prot |= DMA_PTE_WRITE; + +	start_vpfn = mm_to_dma_pfn(iova->pfn_lo); + +	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); +	if (unlikely(ret)) { +		/*  clear the page */ +		dma_pte_clear_range(domain, start_vpfn, +				    start_vpfn + size - 1); +		/* free page tables */ +		dma_pte_free_pagetable(domain, start_vpfn, +				       start_vpfn + size - 1); +		/* free iova */ +		__free_iova(&domain->iovad, iova); +		return 0; +	} + +	/* it's a non-present to present mapping. 
Only flush if caching mode */ +	if (cap_caching_mode(iommu->cap)) +		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1); +	else +		iommu_flush_write_buffer(iommu); + +	return nelems; +} + +static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ +	return !dma_addr; +} + +struct dma_map_ops intel_dma_ops = { +	.alloc_coherent = intel_alloc_coherent, +	.free_coherent = intel_free_coherent, +	.map_sg = intel_map_sg, +	.unmap_sg = intel_unmap_sg, +	.map_page = intel_map_page, +	.unmap_page = intel_unmap_page, +	.mapping_error = intel_mapping_error, +}; + +static inline int iommu_domain_cache_init(void) +{ +	int ret = 0; + +	iommu_domain_cache = kmem_cache_create("iommu_domain", +					 sizeof(struct dmar_domain), +					 0, +					 SLAB_HWCACHE_ALIGN, + +					 NULL); +	if (!iommu_domain_cache) { +		printk(KERN_ERR "Couldn't create iommu_domain cache\n"); +		ret = -ENOMEM; +	} + +	return ret; +} + +static inline int iommu_devinfo_cache_init(void) +{ +	int ret = 0; + +	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", +					 sizeof(struct device_domain_info), +					 0, +					 SLAB_HWCACHE_ALIGN, +					 NULL); +	if (!iommu_devinfo_cache) { +		printk(KERN_ERR "Couldn't create devinfo cache\n"); +		ret = -ENOMEM; +	} + +	return ret; +} + +static inline int iommu_iova_cache_init(void) +{ +	int ret = 0; + +	iommu_iova_cache = kmem_cache_create("iommu_iova", +					 sizeof(struct iova), +					 0, +					 SLAB_HWCACHE_ALIGN, +					 NULL); +	if (!iommu_iova_cache) { +		printk(KERN_ERR "Couldn't create iova cache\n"); +		ret = -ENOMEM; +	} + +	return ret; +} + +static int __init iommu_init_mempool(void) +{ +	int ret; +	ret = iommu_iova_cache_init(); +	if (ret) +		return ret; + +	ret = iommu_domain_cache_init(); +	if (ret) +		goto domain_error; + +	ret = iommu_devinfo_cache_init(); +	if (!ret) +		return ret; + +	kmem_cache_destroy(iommu_domain_cache); +domain_error: +	kmem_cache_destroy(iommu_iova_cache); + +	return -ENOMEM; +} + +static void __init iommu_exit_mempool(void) +{ +	kmem_cache_destroy(iommu_devinfo_cache); +	kmem_cache_destroy(iommu_domain_cache); +	kmem_cache_destroy(iommu_iova_cache); + +} + +static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +{ +	struct dmar_drhd_unit *drhd; +	u32 vtbar; +	int rc; + +	/* We know that this device on this chipset has its own IOMMU. +	 * If we find it under a different IOMMU, then the BIOS is lying +	 * to us. Hope that the IOMMU for this device is actually +	 * disabled, and it needs no translation... 
+	 */ +	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); +	if (rc) { +		/* "can't" happen */ +		dev_info(&pdev->dev, "failed to run vt-d quirk\n"); +		return; +	} +	vtbar &= 0xffff0000; + +	/* we know that the this iommu should be at offset 0xa000 from vtbar */ +	drhd = dmar_find_matched_drhd_unit(pdev); +	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000, +			    TAINT_FIRMWARE_WORKAROUND, +			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n")) +		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; +} +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu); + +static void __init init_no_remapping_devices(void) +{ +	struct dmar_drhd_unit *drhd; + +	for_each_drhd_unit(drhd) { +		if (!drhd->include_all) { +			int i; +			for (i = 0; i < drhd->devices_cnt; i++) +				if (drhd->devices[i] != NULL) +					break; +			/* ignore DMAR unit if no pci devices exist */ +			if (i == drhd->devices_cnt) +				drhd->ignored = 1; +		} +	} + +	if (dmar_map_gfx) +		return; + +	for_each_drhd_unit(drhd) { +		int i; +		if (drhd->ignored || drhd->include_all) +			continue; + +		for (i = 0; i < drhd->devices_cnt; i++) +			if (drhd->devices[i] && +				!IS_GFX_DEVICE(drhd->devices[i])) +				break; + +		if (i < drhd->devices_cnt) +			continue; + +		/* bypass IOMMU if it is just for gfx devices */ +		drhd->ignored = 1; +		for (i = 0; i < drhd->devices_cnt; i++) { +			if (!drhd->devices[i]) +				continue; +			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; +		} +	} +} + +#ifdef CONFIG_SUSPEND +static int init_iommu_hw(void) +{ +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu = NULL; + +	for_each_active_iommu(iommu, drhd) +		if (iommu->qi) +			dmar_reenable_qi(iommu); + +	for_each_iommu(iommu, drhd) { +		if (drhd->ignored) { +			/* +			 * we always have to disable PMRs or DMA may fail on +			 * this device +			 */ +			if (force_on) +				iommu_disable_protect_mem_regions(iommu); +			continue; +		} +	 +		iommu_flush_write_buffer(iommu); + +		iommu_set_root_entry(iommu); + +		iommu->flush.flush_context(iommu, 0, 0, 0, +					   DMA_CCMD_GLOBAL_INVL); +		iommu->flush.flush_iotlb(iommu, 0, 0, 0, +					 DMA_TLB_GLOBAL_FLUSH); +		if (iommu_enable_translation(iommu)) +			return 1; +		iommu_disable_protect_mem_regions(iommu); +	} + +	return 0; +} + +static void iommu_flush_all(void) +{ +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu; + +	for_each_active_iommu(iommu, drhd) { +		iommu->flush.flush_context(iommu, 0, 0, 0, +					   DMA_CCMD_GLOBAL_INVL); +		iommu->flush.flush_iotlb(iommu, 0, 0, 0, +					 DMA_TLB_GLOBAL_FLUSH); +	} +} + +static int iommu_suspend(void) +{ +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu = NULL; +	unsigned long flag; + +	for_each_active_iommu(iommu, drhd) { +		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS, +						 GFP_ATOMIC); +		if (!iommu->iommu_state) +			goto nomem; +	} + +	iommu_flush_all(); + +	for_each_active_iommu(iommu, drhd) { +		iommu_disable_translation(iommu); + +		spin_lock_irqsave(&iommu->register_lock, flag); + +		iommu->iommu_state[SR_DMAR_FECTL_REG] = +			readl(iommu->reg + DMAR_FECTL_REG); +		iommu->iommu_state[SR_DMAR_FEDATA_REG] = +			readl(iommu->reg + DMAR_FEDATA_REG); +		iommu->iommu_state[SR_DMAR_FEADDR_REG] = +			readl(iommu->reg + DMAR_FEADDR_REG); +		iommu->iommu_state[SR_DMAR_FEUADDR_REG] = +			readl(iommu->reg + DMAR_FEUADDR_REG); + +		spin_unlock_irqrestore(&iommu->register_lock, flag); +	} +	
return 0;
+
+nomem:
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+
+	return -ENOMEM;
+}
+
+static void iommu_resume(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+	unsigned long flag;
+
+	if (init_iommu_hw()) {
+		if (force_on)
+			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+		else
+			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+		return;
+	}
+
+	for_each_active_iommu(iommu, drhd) {
+
+		spin_lock_irqsave(&iommu->register_lock, flag);
+
+		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
+			iommu->reg + DMAR_FECTL_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
+			iommu->reg + DMAR_FEDATA_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
+			iommu->reg + DMAR_FEADDR_REG);
+		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
+			iommu->reg + DMAR_FEUADDR_REG);
+
+		spin_unlock_irqrestore(&iommu->register_lock, flag);
+	}
+
+	for_each_active_iommu(iommu, drhd)
+		kfree(iommu->iommu_state);
+}
+
+static struct syscore_ops iommu_syscore_ops = {
+	.resume		= iommu_resume,
+	.suspend	= iommu_suspend,
+};
+
+static void __init init_iommu_pm_ops(void)
+{
+	register_syscore_ops(&iommu_syscore_ops);
+}
+
+#else
+static inline void init_iommu_pm_ops(void) {}
+#endif	/* CONFIG_SUSPEND */
+
+/*
+ * Here we only respond to the action of a device being unbound from its
+ * driver.
+ *
+ * A newly added device is not attached to its DMAR domain here yet; that
+ * happens when the device is first mapped to an iova.
+ */
+static int device_notifier(struct notifier_block *nb,
+				  unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct dmar_domain *domain;
+
+	if (iommu_no_mapping(dev))
+		return 0;
+
+	domain = find_domain(pdev);
+	if (!domain)
+		return 0;
+
+	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
+		domain_remove_one_dev_info(domain, pdev);
+
+		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
+		    list_empty(&domain->devices))
+			domain_exit(domain);
+	}
+
+	return 0;
+}
+
+static struct notifier_block device_nb = {
+	.notifier_call = device_notifier,
+};
+
+int __init intel_iommu_init(void)
+{
+	int ret = 0;
+
+	/* VT-d is required for a TXT/tboot launch, so enforce that */
+	force_on = tboot_force_iommu();
+
+	if (dmar_table_init()) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR table\n");
+		return 	-ENODEV;
+	}
+
+	if (dmar_dev_scope_init()) {
+		if (force_on)
+			panic("tboot: Failed to initialize DMAR device scope\n");
+		return 	-ENODEV;
+	}
+
+	/*
+	 * Check the need for DMA-remapping initialization now.
+	 * The above initialization will also be used by Interrupt-remapping. 
+	 */ +	if (no_iommu || dmar_disabled) +		return -ENODEV; + +	if (iommu_init_mempool()) { +		if (force_on) +			panic("tboot: Failed to initialize iommu memory\n"); +		return 	-ENODEV; +	} + +	if (dmar_init_reserved_ranges()) { +		if (force_on) +			panic("tboot: Failed to reserve iommu ranges\n"); +		return 	-ENODEV; +	} + +	init_no_remapping_devices(); + +	ret = init_dmars(); +	if (ret) { +		if (force_on) +			panic("tboot: Failed to initialize DMARs\n"); +		printk(KERN_ERR "IOMMU: dmar init failed\n"); +		put_iova_domain(&reserved_iova_list); +		iommu_exit_mempool(); +		return ret; +	} +	printk(KERN_INFO +	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); + +	init_timer(&unmap_timer); +#ifdef CONFIG_SWIOTLB +	swiotlb = 0; +#endif +	dma_ops = &intel_dma_ops; + +	init_iommu_pm_ops(); + +	register_iommu(&intel_iommu_ops); + +	bus_register_notifier(&pci_bus_type, &device_nb); + +	return 0; +} + +static void iommu_detach_dependent_devices(struct intel_iommu *iommu, +					   struct pci_dev *pdev) +{ +	struct pci_dev *tmp, *parent; + +	if (!iommu || !pdev) +		return; + +	/* dependent device detach */ +	tmp = pci_find_upstream_pcie_bridge(pdev); +	/* Secondary interface's bus number and devfn 0 */ +	if (tmp) { +		parent = pdev->bus->self; +		while (parent != tmp) { +			iommu_detach_dev(iommu, parent->bus->number, +					 parent->devfn); +			parent = parent->bus->self; +		} +		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */ +			iommu_detach_dev(iommu, +				tmp->subordinate->number, 0); +		else /* this is a legacy PCI bridge */ +			iommu_detach_dev(iommu, tmp->bus->number, +					 tmp->devfn); +	} +} + +static void domain_remove_one_dev_info(struct dmar_domain *domain, +					  struct pci_dev *pdev) +{ +	struct device_domain_info *info; +	struct intel_iommu *iommu; +	unsigned long flags; +	int found = 0; +	struct list_head *entry, *tmp; + +	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, +				pdev->devfn); +	if (!iommu) +		return; + +	spin_lock_irqsave(&device_domain_lock, flags); +	list_for_each_safe(entry, tmp, &domain->devices) { +		info = list_entry(entry, struct device_domain_info, link); +		if (info->segment == pci_domain_nr(pdev->bus) && +		    info->bus == pdev->bus->number && +		    info->devfn == pdev->devfn) { +			list_del(&info->link); +			list_del(&info->global); +			if (info->dev) +				info->dev->dev.archdata.iommu = NULL; +			spin_unlock_irqrestore(&device_domain_lock, flags); + +			iommu_disable_dev_iotlb(info); +			iommu_detach_dev(iommu, info->bus, info->devfn); +			iommu_detach_dependent_devices(iommu, pdev); +			free_devinfo_mem(info); + +			spin_lock_irqsave(&device_domain_lock, flags); + +			if (found) +				break; +			else +				continue; +		} + +		/* if there is no other devices under the same iommu +		 * owned by this domain, clear this iommu in iommu_bmp +		 * update iommu count and coherency +		 */ +		if (iommu == device_to_iommu(info->segment, info->bus, +					    info->devfn)) +			found = 1; +	} + +	if (found == 0) { +		unsigned long tmp_flags; +		spin_lock_irqsave(&domain->iommu_lock, tmp_flags); +		clear_bit(iommu->seq_id, &domain->iommu_bmp); +		domain->iommu_count--; +		domain_update_iommu_cap(domain); +		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); + +		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) && +		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) { +			spin_lock_irqsave(&iommu->lock, tmp_flags); +			clear_bit(domain->id, iommu->domain_ids); +			iommu->domains[domain->id] = NULL; +			
spin_unlock_irqrestore(&iommu->lock, tmp_flags); +		} +	} + +	spin_unlock_irqrestore(&device_domain_lock, flags); +} + +static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) +{ +	struct device_domain_info *info; +	struct intel_iommu *iommu; +	unsigned long flags1, flags2; + +	spin_lock_irqsave(&device_domain_lock, flags1); +	while (!list_empty(&domain->devices)) { +		info = list_entry(domain->devices.next, +			struct device_domain_info, link); +		list_del(&info->link); +		list_del(&info->global); +		if (info->dev) +			info->dev->dev.archdata.iommu = NULL; + +		spin_unlock_irqrestore(&device_domain_lock, flags1); + +		iommu_disable_dev_iotlb(info); +		iommu = device_to_iommu(info->segment, info->bus, info->devfn); +		iommu_detach_dev(iommu, info->bus, info->devfn); +		iommu_detach_dependent_devices(iommu, info->dev); + +		/* clear this iommu in iommu_bmp, update iommu count +		 * and capabilities +		 */ +		spin_lock_irqsave(&domain->iommu_lock, flags2); +		if (test_and_clear_bit(iommu->seq_id, +				       &domain->iommu_bmp)) { +			domain->iommu_count--; +			domain_update_iommu_cap(domain); +		} +		spin_unlock_irqrestore(&domain->iommu_lock, flags2); + +		free_devinfo_mem(info); +		spin_lock_irqsave(&device_domain_lock, flags1); +	} +	spin_unlock_irqrestore(&device_domain_lock, flags1); +} + +/* domain id for virtual machine, it won't be set in context */ +static unsigned long vm_domid; + +static struct dmar_domain *iommu_alloc_vm_domain(void) +{ +	struct dmar_domain *domain; + +	domain = alloc_domain_mem(); +	if (!domain) +		return NULL; + +	domain->id = vm_domid++; +	domain->nid = -1; +	memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); +	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE; + +	return domain; +} + +static int md_domain_init(struct dmar_domain *domain, int guest_width) +{ +	int adjust_width; + +	init_iova_domain(&domain->iovad, DMA_32BIT_PFN); +	spin_lock_init(&domain->iommu_lock); + +	domain_reserve_special_ranges(domain); + +	/* calculate AGAW */ +	domain->gaw = guest_width; +	adjust_width = guestwidth_to_adjustwidth(guest_width); +	domain->agaw = width_to_agaw(adjust_width); + +	INIT_LIST_HEAD(&domain->devices); + +	domain->iommu_count = 0; +	domain->iommu_coherency = 0; +	domain->iommu_snooping = 0; +	domain->iommu_superpage = 0; +	domain->max_addr = 0; +	domain->nid = -1; + +	/* always allocate the top pgd */ +	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); +	if (!domain->pgd) +		return -ENOMEM; +	domain_flush_cache(domain, domain->pgd, PAGE_SIZE); +	return 0; +} + +static void iommu_free_vm_domain(struct dmar_domain *domain) +{ +	unsigned long flags; +	struct dmar_drhd_unit *drhd; +	struct intel_iommu *iommu; +	unsigned long i; +	unsigned long ndomains; + +	for_each_drhd_unit(drhd) { +		if (drhd->ignored) +			continue; +		iommu = drhd->iommu; + +		ndomains = cap_ndoms(iommu->cap); +		for_each_set_bit(i, iommu->domain_ids, ndomains) { +			if (iommu->domains[i] == domain) { +				spin_lock_irqsave(&iommu->lock, flags); +				clear_bit(i, iommu->domain_ids); +				iommu->domains[i] = NULL; +				spin_unlock_irqrestore(&iommu->lock, flags); +				break; +			} +		} +	} +} + +static void vm_domain_exit(struct dmar_domain *domain) +{ +	/* Domain 0 is reserved, so dont process it */ +	if (!domain) +		return; + +	vm_domain_remove_all_dev_info(domain); +	/* destroy iovas */ +	put_iova_domain(&domain->iovad); + +	/* clear ptes */ +	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + +	/* free page tables */ +	dma_pte_free_pagetable(domain, 0, 
DOMAIN_MAX_PFN(domain->gaw)); + +	iommu_free_vm_domain(domain); +	free_domain_mem(domain); +} + +static int intel_iommu_domain_init(struct iommu_domain *domain) +{ +	struct dmar_domain *dmar_domain; + +	dmar_domain = iommu_alloc_vm_domain(); +	if (!dmar_domain) { +		printk(KERN_ERR +			"intel_iommu_domain_init: dmar_domain == NULL\n"); +		return -ENOMEM; +	} +	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { +		printk(KERN_ERR +			"intel_iommu_domain_init() failed\n"); +		vm_domain_exit(dmar_domain); +		return -ENOMEM; +	} +	domain->priv = dmar_domain; + +	return 0; +} + +static void intel_iommu_domain_destroy(struct iommu_domain *domain) +{ +	struct dmar_domain *dmar_domain = domain->priv; + +	domain->priv = NULL; +	vm_domain_exit(dmar_domain); +} + +static int intel_iommu_attach_device(struct iommu_domain *domain, +				     struct device *dev) +{ +	struct dmar_domain *dmar_domain = domain->priv; +	struct pci_dev *pdev = to_pci_dev(dev); +	struct intel_iommu *iommu; +	int addr_width; + +	/* normally pdev is not mapped */ +	if (unlikely(domain_context_mapped(pdev))) { +		struct dmar_domain *old_domain; + +		old_domain = find_domain(pdev); +		if (old_domain) { +			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || +			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) +				domain_remove_one_dev_info(old_domain, pdev); +			else +				domain_remove_dev_info(old_domain); +		} +	} + +	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number, +				pdev->devfn); +	if (!iommu) +		return -ENODEV; + +	/* check if this iommu agaw is sufficient for max mapped address */ +	addr_width = agaw_to_width(iommu->agaw); +	if (addr_width > cap_mgaw(iommu->cap)) +		addr_width = cap_mgaw(iommu->cap); + +	if (dmar_domain->max_addr > (1LL << addr_width)) { +		printk(KERN_ERR "%s: iommu width (%d) is not " +		       "sufficient for the mapped address (%llx)\n", +		       __func__, addr_width, dmar_domain->max_addr); +		return -EFAULT; +	} +	dmar_domain->gaw = addr_width; + +	/* +	 * Knock out extra levels of page tables if necessary +	 */ +	while (iommu->agaw < dmar_domain->agaw) { +		struct dma_pte *pte; + +		pte = dmar_domain->pgd; +		if (dma_pte_present(pte)) { +			dmar_domain->pgd = (struct dma_pte *) +				phys_to_virt(dma_pte_addr(pte)); +			free_pgtable_page(pte); +		} +		dmar_domain->agaw--; +	} + +	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); +} + +static void intel_iommu_detach_device(struct iommu_domain *domain, +				      struct device *dev) +{ +	struct dmar_domain *dmar_domain = domain->priv; +	struct pci_dev *pdev = to_pci_dev(dev); + +	domain_remove_one_dev_info(dmar_domain, pdev); +} + +static int intel_iommu_map(struct iommu_domain *domain, +			   unsigned long iova, phys_addr_t hpa, +			   int gfp_order, int iommu_prot) +{ +	struct dmar_domain *dmar_domain = domain->priv; +	u64 max_addr; +	int prot = 0; +	size_t size; +	int ret; + +	if (iommu_prot & IOMMU_READ) +		prot |= DMA_PTE_READ; +	if (iommu_prot & IOMMU_WRITE) +		prot |= DMA_PTE_WRITE; +	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) +		prot |= DMA_PTE_SNP; + +	size     = PAGE_SIZE << gfp_order; +	max_addr = iova + size; +	if (dmar_domain->max_addr < max_addr) { +		u64 end; + +		/* check if minimum agaw is sufficient for mapped address */ +		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; +		if (end < max_addr) { +			printk(KERN_ERR "%s: iommu width (%d) is not " +			       "sufficient for the mapped address (%llx)\n", +			       __func__, dmar_domain->gaw, max_addr); 
+			return -EFAULT; +		} +		dmar_domain->max_addr = max_addr; +	} +	/* Round up size to next multiple of PAGE_SIZE, if it and +	   the low bits of hpa would take us onto the next page */ +	size = aligned_nrpages(hpa, size); +	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, +				 hpa >> VTD_PAGE_SHIFT, size, prot); +	return ret; +} + +static int intel_iommu_unmap(struct iommu_domain *domain, +			     unsigned long iova, int gfp_order) +{ +	struct dmar_domain *dmar_domain = domain->priv; +	size_t size = PAGE_SIZE << gfp_order; + +	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT, +			    (iova + size - 1) >> VTD_PAGE_SHIFT); + +	if (dmar_domain->max_addr == iova + size) +		dmar_domain->max_addr = iova; + +	return gfp_order; +} + +static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, +					    unsigned long iova) +{ +	struct dmar_domain *dmar_domain = domain->priv; +	struct dma_pte *pte; +	u64 phys = 0; + +	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0); +	if (pte) +		phys = dma_pte_addr(pte); + +	return phys; +} + +static int intel_iommu_domain_has_cap(struct iommu_domain *domain, +				      unsigned long cap) +{ +	struct dmar_domain *dmar_domain = domain->priv; + +	if (cap == IOMMU_CAP_CACHE_COHERENCY) +		return dmar_domain->iommu_snooping; +	if (cap == IOMMU_CAP_INTR_REMAP) +		return intr_remapping_enabled; + +	return 0; +} + +static struct iommu_ops intel_iommu_ops = { +	.domain_init	= intel_iommu_domain_init, +	.domain_destroy = intel_iommu_domain_destroy, +	.attach_dev	= intel_iommu_attach_device, +	.detach_dev	= intel_iommu_detach_device, +	.map		= intel_iommu_map, +	.unmap		= intel_iommu_unmap, +	.iova_to_phys	= intel_iommu_iova_to_phys, +	.domain_has_cap = intel_iommu_domain_has_cap, +}; + +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) +{ +	/* +	 * Mobile 4 Series Chipset neglects to set RWBF capability, +	 * but needs it: +	 */ +	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n"); +	rwbf_quirk = 1; + +	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */ +	if (dev->revision == 0x07) { +		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n"); +		dmar_map_gfx = 0; +	} +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); + +#define GGC 0x52 +#define GGC_MEMORY_SIZE_MASK	(0xf << 8) +#define GGC_MEMORY_SIZE_NONE	(0x0 << 8) +#define GGC_MEMORY_SIZE_1M	(0x1 << 8) +#define GGC_MEMORY_SIZE_2M	(0x3 << 8) +#define GGC_MEMORY_VT_ENABLED	(0x8 << 8) +#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8) +#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8) +#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8) + +static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev) +{ +	unsigned short ggc; + +	if (pci_read_config_word(dev, GGC, &ggc)) +		return; + +	if (!(ggc & GGC_MEMORY_VT_ENABLED)) { +		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); +		dmar_map_gfx = 0; +	} +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); + +/* On Tylersburg chipsets, some BIOSes have been known to enable the +   ISOCH DMAR unit for the Azalia sound device, but not give it any +   TLB entries, which causes it to deadlock. Check for that.  
We do +   this in a function called from init_dmars(), instead of in a PCI +   quirk, because we don't want to print the obnoxious "BIOS broken" +   message if VT-d is actually disabled. +*/ +static void __init check_tylersburg_isoch(void) +{ +	struct pci_dev *pdev; +	uint32_t vtisochctrl; + +	/* If there's no Azalia in the system anyway, forget it. */ +	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); +	if (!pdev) +		return; +	pci_dev_put(pdev); + +	/* System Management Registers. Might be hidden, in which case +	   we can't do the sanity check. But that's OK, because the +	   known-broken BIOSes _don't_ actually hide it, so far. */ +	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); +	if (!pdev) +		return; + +	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { +		pci_dev_put(pdev); +		return; +	} + +	pci_dev_put(pdev); + +	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ +	if (vtisochctrl & 1) +		return; + +	/* Drop all bits other than the number of TLB entries */ +	vtisochctrl &= 0x1c; + +	/* If we have the recommended number of TLB entries (16), fine. */ +	if (vtisochctrl == 0x10) +		return; + +	/* Zero TLB entries? You get to ride the short bus to school. */ +	if (!vtisochctrl) { +		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" +		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n", +		     dmi_get_system_info(DMI_BIOS_VENDOR), +		     dmi_get_system_info(DMI_BIOS_VERSION), +		     dmi_get_system_info(DMI_PRODUCT_VERSION)); +		iommu_identity_mapping |= IDENTMAP_AZALIA; +		return; +	} +	 +	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", +	       vtisochctrl); +} diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c new file mode 100644 index 000000000000..1a89d4a2cadf --- /dev/null +++ b/drivers/iommu/intr_remapping.c @@ -0,0 +1,797 @@ +#include <linux/interrupt.h> +#include <linux/dmar.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/hpet.h> +#include <linux/pci.h> +#include <linux/irq.h> +#include <asm/io_apic.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <linux/intel-iommu.h> +#include "intr_remapping.h" +#include <acpi/acpi.h> +#include <asm/pci-direct.h> + +static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; +static struct hpet_scope ir_hpet[MAX_HPET_TBS]; +static int ir_ioapic_num, ir_hpet_num; +int intr_remapping_enabled; + +static int disable_intremap; +static int disable_sourceid_checking; + +static __init int setup_nointremap(char *str) +{ +	disable_intremap = 1; +	return 0; +} +early_param("nointremap", setup_nointremap); + +static __init int setup_intremap(char *str) +{ +	if (!str) +		return -EINVAL; + +	if (!strncmp(str, "on", 2)) +		disable_intremap = 0; +	else if (!strncmp(str, "off", 3)) +		disable_intremap = 1; +	else if (!strncmp(str, "nosid", 5)) +		disable_sourceid_checking = 1; + +	return 0; +} +early_param("intremap", setup_intremap); + +static DEFINE_SPINLOCK(irq_2_ir_lock); + +static struct irq_2_iommu *irq_2_iommu(unsigned int irq) +{ +	struct irq_cfg *cfg = irq_get_chip_data(irq); +	return cfg ? 
&cfg->irq_2_iommu : NULL; +} + +int get_irte(int irq, struct irte *entry) +{ +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	unsigned long flags; +	int index; + +	if (!entry || !irq_iommu) +		return -1; + +	spin_lock_irqsave(&irq_2_ir_lock, flags); + +	index = irq_iommu->irte_index + irq_iommu->sub_handle; +	*entry = *(irq_iommu->iommu->ir_table->base + index); + +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); +	return 0; +} + +int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) +{ +	struct ir_table *table = iommu->ir_table; +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	u16 index, start_index; +	unsigned int mask = 0; +	unsigned long flags; +	int i; + +	if (!count || !irq_iommu) +		return -1; + +	/* +	 * start the IRTE search from index 0. +	 */ +	index = start_index = 0; + +	if (count > 1) { +		count = __roundup_pow_of_two(count); +		mask = ilog2(count); +	} + +	if (mask > ecap_max_handle_mask(iommu->ecap)) { +		printk(KERN_ERR +		       "Requested mask %x exceeds the max invalidation handle" +		       " mask value %Lx\n", mask, +		       ecap_max_handle_mask(iommu->ecap)); +		return -1; +	} + +	spin_lock_irqsave(&irq_2_ir_lock, flags); +	do { +		for (i = index; i < index + count; i++) +			if  (table->base[i].present) +				break; +		/* empty index found */ +		if (i == index + count) +			break; + +		index = (index + count) % INTR_REMAP_TABLE_ENTRIES; + +		if (index == start_index) { +			spin_unlock_irqrestore(&irq_2_ir_lock, flags); +			printk(KERN_ERR "can't allocate an IRTE\n"); +			return -1; +		} +	} while (1); + +	for (i = index; i < index + count; i++) +		table->base[i].present = 1; + +	irq_iommu->iommu = iommu; +	irq_iommu->irte_index =  index; +	irq_iommu->sub_handle = 0; +	irq_iommu->irte_mask = mask; + +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); + +	return index; +} + +static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) +{ +	struct qi_desc desc; + +	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) +		   | QI_IEC_SELECTIVE; +	desc.high = 0; + +	return qi_submit_sync(&desc, iommu); +} + +int map_irq_to_irte_handle(int irq, u16 *sub_handle) +{ +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	unsigned long flags; +	int index; + +	if (!irq_iommu) +		return -1; + +	spin_lock_irqsave(&irq_2_ir_lock, flags); +	*sub_handle = irq_iommu->sub_handle; +	index = irq_iommu->irte_index; +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); +	return index; +} + +int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) +{ +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	unsigned long flags; + +	if (!irq_iommu) +		return -1; + +	spin_lock_irqsave(&irq_2_ir_lock, flags); + +	irq_iommu->iommu = iommu; +	irq_iommu->irte_index = index; +	irq_iommu->sub_handle = subhandle; +	irq_iommu->irte_mask = 0; + +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); + +	return 0; +} + +int modify_irte(int irq, struct irte *irte_modified) +{ +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	struct intel_iommu *iommu; +	unsigned long flags; +	struct irte *irte; +	int rc, index; + +	if (!irq_iommu) +		return -1; + +	spin_lock_irqsave(&irq_2_ir_lock, flags); + +	iommu = irq_iommu->iommu; + +	index = irq_iommu->irte_index + irq_iommu->sub_handle; +	irte = &iommu->ir_table->base[index]; + +	set_64bit(&irte->low, irte_modified->low); +	set_64bit(&irte->high, irte_modified->high); +	__iommu_flush_cache(iommu, irte, sizeof(*irte)); + +	rc = qi_flush_iec(iommu, index, 0); +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); + +	return rc; 
+} + +struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +{ +	int i; + +	for (i = 0; i < MAX_HPET_TBS; i++) +		if (ir_hpet[i].id == hpet_id) +			return ir_hpet[i].iommu; +	return NULL; +} + +struct intel_iommu *map_ioapic_to_ir(int apic) +{ +	int i; + +	for (i = 0; i < MAX_IO_APICS; i++) +		if (ir_ioapic[i].id == apic) +			return ir_ioapic[i].iommu; +	return NULL; +} + +struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +{ +	struct dmar_drhd_unit *drhd; + +	drhd = dmar_find_matched_drhd_unit(dev); +	if (!drhd) +		return NULL; + +	return drhd->iommu; +} + +static int clear_entries(struct irq_2_iommu *irq_iommu) +{ +	struct irte *start, *entry, *end; +	struct intel_iommu *iommu; +	int index; + +	if (irq_iommu->sub_handle) +		return 0; + +	iommu = irq_iommu->iommu; +	index = irq_iommu->irte_index + irq_iommu->sub_handle; + +	start = iommu->ir_table->base + index; +	end = start + (1 << irq_iommu->irte_mask); + +	for (entry = start; entry < end; entry++) { +		set_64bit(&entry->low, 0); +		set_64bit(&entry->high, 0); +	} + +	return qi_flush_iec(iommu, index, irq_iommu->irte_mask); +} + +int free_irte(int irq) +{ +	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); +	unsigned long flags; +	int rc; + +	if (!irq_iommu) +		return -1; + +	spin_lock_irqsave(&irq_2_ir_lock, flags); + +	rc = clear_entries(irq_iommu); + +	irq_iommu->iommu = NULL; +	irq_iommu->irte_index = 0; +	irq_iommu->sub_handle = 0; +	irq_iommu->irte_mask = 0; + +	spin_unlock_irqrestore(&irq_2_ir_lock, flags); + +	return rc; +} + +/* + * source validation type + */ +#define SVT_NO_VERIFY		0x0  /* no verification is required */ +#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */ +#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */ + +/* + * source-id qualifier + */ +#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */ +#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore +			      * the third least significant bit +			      */ +#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore +			      * the second and third least significant bits +			      */ +#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore +			      * the least three significant bits +			      */ + +/* + * set SVT, SQ and SID fields of irte to verify + * source ids of interrupt requests + */ +static void set_irte_sid(struct irte *irte, unsigned int svt, +			 unsigned int sq, unsigned int sid) +{ +	if (disable_sourceid_checking) +		svt = SVT_NO_VERIFY; +	irte->svt = svt; +	irte->sq = sq; +	irte->sid = sid; +} + +int set_ioapic_sid(struct irte *irte, int apic) +{ +	int i; +	u16 sid = 0; + +	if (!irte) +		return -1; + +	for (i = 0; i < MAX_IO_APICS; i++) { +		if (ir_ioapic[i].id == apic) { +			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; +			break; +		} +	} + +	if (sid == 0) { +		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic); +		return -1; +	} + +	set_irte_sid(irte, 1, 0, sid); + +	return 0; +} + +int set_hpet_sid(struct irte *irte, u8 id) +{ +	int i; +	u16 sid = 0; + +	if (!irte) +		return -1; + +	for (i = 0; i < MAX_HPET_TBS; i++) { +		if (ir_hpet[i].id == id) { +			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; +			break; +		} +	} + +	if (sid == 0) { +		pr_warning("Failed to set source-id of HPET block (%d)\n", id); +		return -1; +	} + +	/* +	 * Should really use SQ_ALL_16. Some platforms are broken. +	 * While we figure out the right quirks for these broken platforms, use +	 * SQ_13_IGNORE_3 for now. 
+	 */ +	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid); + +	return 0; +} + +int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{ +	struct pci_dev *bridge; + +	if (!irte || !dev) +		return -1; + +	/* PCIe device or Root Complex integrated PCI device */ +	if (pci_is_pcie(dev) || !dev->bus->parent) { +		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, +			     (dev->bus->number << 8) | dev->devfn); +		return 0; +	} + +	bridge = pci_find_upstream_pcie_bridge(dev); +	if (bridge) { +		if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */ +			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16, +				(bridge->bus->number << 8) | dev->bus->number); +		else /* this is a legacy PCI bridge */ +			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, +				(bridge->bus->number << 8) | bridge->devfn); +	} + +	return 0; +} + +static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) +{ +	u64 addr; +	u32 sts; +	unsigned long flags; + +	addr = virt_to_phys((void *)iommu->ir_table->base); + +	spin_lock_irqsave(&iommu->register_lock, flags); + +	dmar_writeq(iommu->reg + DMAR_IRTA_REG, +		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); + +	/* Set interrupt-remapping table pointer */ +	iommu->gcmd |= DMA_GCMD_SIRTP; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (sts & DMA_GSTS_IRTPS), sts); +	spin_unlock_irqrestore(&iommu->register_lock, flags); + +	/* +	 * global invalidation of interrupt entry cache before enabling +	 * interrupt-remapping. +	 */ +	qi_global_iec(iommu); + +	spin_lock_irqsave(&iommu->register_lock, flags); + +	/* Enable interrupt-remapping */ +	iommu->gcmd |= DMA_GCMD_IRE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, (sts & DMA_GSTS_IRES), sts); + +	spin_unlock_irqrestore(&iommu->register_lock, flags); +} + + +static int setup_intr_remapping(struct intel_iommu *iommu, int mode) +{ +	struct ir_table *ir_table; +	struct page *pages; + +	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table), +					     GFP_ATOMIC); + +	if (!iommu->ir_table) +		return -ENOMEM; + +	pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, +				 INTR_REMAP_PAGE_ORDER); + +	if (!pages) { +		printk(KERN_ERR "failed to allocate pages of order %d\n", +		       INTR_REMAP_PAGE_ORDER); +		kfree(iommu->ir_table); +		return -ENOMEM; +	} + +	ir_table->base = page_address(pages); + +	iommu_set_intr_remapping(iommu, mode); +	return 0; +} + +/* + * Disable Interrupt Remapping. + */ +static void iommu_disable_intr_remapping(struct intel_iommu *iommu) +{ +	unsigned long flags; +	u32 sts; + +	if (!ecap_ir_support(iommu->ecap)) +		return; + +	/* +	 * global invalidation of interrupt entry cache before disabling +	 * interrupt-remapping. 
+	 */ +	qi_global_iec(iommu); + +	spin_lock_irqsave(&iommu->register_lock, flags); + +	sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); +	if (!(sts & DMA_GSTS_IRES)) +		goto end; + +	iommu->gcmd &= ~DMA_GCMD_IRE; +	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, +		      readl, !(sts & DMA_GSTS_IRES), sts); + +end: +	spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +int __init intr_remapping_supported(void) +{ +	struct dmar_drhd_unit *drhd; + +	if (disable_intremap) +		return 0; + +	if (!dmar_ir_support()) +		return 0; + +	for_each_drhd_unit(drhd) { +		struct intel_iommu *iommu = drhd->iommu; + +		if (!ecap_ir_support(iommu->ecap)) +			return 0; +	} + +	return 1; +} + +int __init enable_intr_remapping(int eim) +{ +	struct dmar_drhd_unit *drhd; +	int setup = 0; + +	if (parse_ioapics_under_ir() != 1) { +		printk(KERN_INFO "Not enable interrupt remapping\n"); +		return -1; +	} + +	for_each_drhd_unit(drhd) { +		struct intel_iommu *iommu = drhd->iommu; + +		/* +		 * If the queued invalidation is already initialized, +		 * shouldn't disable it. +		 */ +		if (iommu->qi) +			continue; + +		/* +		 * Clear previous faults. +		 */ +		dmar_fault(-1, iommu); + +		/* +		 * Disable intr remapping and queued invalidation, if already +		 * enabled prior to OS handover. +		 */ +		iommu_disable_intr_remapping(iommu); + +		dmar_disable_qi(iommu); +	} + +	/* +	 * check for the Interrupt-remapping support +	 */ +	for_each_drhd_unit(drhd) { +		struct intel_iommu *iommu = drhd->iommu; + +		if (!ecap_ir_support(iommu->ecap)) +			continue; + +		if (eim && !ecap_eim_support(iommu->ecap)) { +			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, " +			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap); +			return -1; +		} +	} + +	/* +	 * Enable queued invalidation for all the DRHD's. +	 */ +	for_each_drhd_unit(drhd) { +		int ret; +		struct intel_iommu *iommu = drhd->iommu; +		ret = dmar_enable_qi(iommu); + +		if (ret) { +			printk(KERN_ERR "DRHD %Lx: failed to enable queued, " +			       " invalidation, ecap %Lx, ret %d\n", +			       drhd->reg_base_addr, iommu->ecap, ret); +			return -1; +		} +	} + +	/* +	 * Setup Interrupt-remapping for all the DRHD's now. +	 */ +	for_each_drhd_unit(drhd) { +		struct intel_iommu *iommu = drhd->iommu; + +		if (!ecap_ir_support(iommu->ecap)) +			continue; + +		if (setup_intr_remapping(iommu, eim)) +			goto error; + +		setup = 1; +	} + +	if (!setup) +		goto error; + +	intr_remapping_enabled = 1; + +	return 0; + +error: +	/* +	 * handle error condition gracefully here! +	 */ +	return -1; +} + +static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope, +				      struct intel_iommu *iommu) +{ +	struct acpi_dmar_pci_path *path; +	u8 bus; +	int count; + +	bus = scope->bus; +	path = (struct acpi_dmar_pci_path *)(scope + 1); +	count = (scope->length - sizeof(struct acpi_dmar_device_scope)) +		/ sizeof(struct acpi_dmar_pci_path); + +	while (--count > 0) { +		/* +		 * Access PCI directly due to the PCI +		 * subsystem isn't initialized yet. 
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+	ir_hpet[ir_hpet_num].bus   = bus;
+	ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_hpet[ir_hpet_num].iommu = iommu;
+	ir_hpet[ir_hpet_num].id    = scope->enumeration_id;
+	ir_hpet_num++;
+}
+
+static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly because the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	ir_ioapic[ir_ioapic_num].bus   = bus;
+	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_ioapic[ir_ioapic_num].iommu = iommu;
+	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
+	ir_ioapic_num++;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	void *start, *end;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+
+	start = (void *)(drhd + 1);
+	end = ((void *)drhd) + header->length;
+
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+			if (ir_ioapic_num == MAX_IO_APICS) {
+				printk(KERN_WARNING "Exceeded Max IO APICS\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "IOAPIC id %d under DRHD base "
+			       "0x%Lx IOMMU %d\n", scope->enumeration_id,
+			       drhd->address, iommu->seq_id);
+
+			ir_parse_one_ioapic_scope(scope, iommu);
+		} else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
+			if (ir_hpet_num == MAX_HPET_TBS) {
+				printk(KERN_WARNING "Exceeded Max HPET blocks\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "HPET id %d under DRHD base"
+			       " 0x%Lx\n", scope->enumeration_id,
+			       drhd->address);
+
+			ir_parse_one_hpet_scope(scope, iommu);
+		}
+		start += scope->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Finds the association between IOAPICs and their Interrupt-remapping
+ * hardware units.
+ */
+int __init parse_ioapics_under_ir(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int ir_supported = 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (ecap_ir_support(iommu->ecap)) {
+			if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
+				return -1;
+
+			ir_supported = 1;
+		}
+	}
+
+	if (ir_supported && ir_ioapic_num != nr_ioapics) {
+		printk(KERN_WARNING
+		       "Not all IO-APICs listed under remapping hardware\n");
+		return -1;
+	}
+
+	return ir_supported;
+}
+
+void disable_intr_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	/*
+	 * Disable Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		iommu_disable_intr_remapping(iommu);
+	}
+}
+
+int reenable_intr_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */ +	for_each_iommu(iommu, drhd) { +		if (!ecap_ir_support(iommu->ecap)) +			continue; + +		/* Set up interrupt remapping for iommu.*/ +		iommu_set_intr_remapping(iommu, eim); +		setup = 1; +	} + +	if (!setup) +		goto error; + +	return 0; + +error: +	/* +	 * handle error condition gracefully here! +	 */ +	return -1; +} + diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h new file mode 100644 index 000000000000..5662fecfee60 --- /dev/null +++ b/drivers/iommu/intr_remapping.h @@ -0,0 +1,17 @@ +#include <linux/intel-iommu.h> + +struct ioapic_scope { +	struct intel_iommu *iommu; +	unsigned int id; +	unsigned int bus;	/* PCI bus number */ +	unsigned int devfn;	/* PCI devfn number */ +}; + +struct hpet_scope { +	struct intel_iommu *iommu; +	u8 id; +	unsigned int bus; +	unsigned int devfn; +}; + +#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c new file mode 100644 index 000000000000..6e6b6a11b3ce --- /dev/null +++ b/drivers/iommu/iommu.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#include <linux/bug.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/iommu.h> + +static struct iommu_ops *iommu_ops; + +void register_iommu(struct iommu_ops *ops) +{ +	if (iommu_ops) +		BUG(); + +	iommu_ops = ops; +} + +bool iommu_found(void) +{ +	return iommu_ops != NULL; +} +EXPORT_SYMBOL_GPL(iommu_found); + +struct iommu_domain *iommu_domain_alloc(void) +{ +	struct iommu_domain *domain; +	int ret; + +	domain = kmalloc(sizeof(*domain), GFP_KERNEL); +	if (!domain) +		return NULL; + +	ret = iommu_ops->domain_init(domain); +	if (ret) +		goto out_free; + +	return domain; + +out_free: +	kfree(domain); + +	return NULL; +} +EXPORT_SYMBOL_GPL(iommu_domain_alloc); + +void iommu_domain_free(struct iommu_domain *domain) +{ +	iommu_ops->domain_destroy(domain); +	kfree(domain); +} +EXPORT_SYMBOL_GPL(iommu_domain_free); + +int iommu_attach_device(struct iommu_domain *domain, struct device *dev) +{ +	return iommu_ops->attach_dev(domain, dev); +} +EXPORT_SYMBOL_GPL(iommu_attach_device); + +void iommu_detach_device(struct iommu_domain *domain, struct device *dev) +{ +	iommu_ops->detach_dev(domain, dev); +} +EXPORT_SYMBOL_GPL(iommu_detach_device); + +phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, +			       unsigned long iova) +{ +	return iommu_ops->iova_to_phys(domain, iova); +} +EXPORT_SYMBOL_GPL(iommu_iova_to_phys); + +int iommu_domain_has_cap(struct iommu_domain *domain, +			 unsigned long cap) +{ +	return iommu_ops->domain_has_cap(domain, cap); +} +EXPORT_SYMBOL_GPL(iommu_domain_has_cap); + +int iommu_map(struct iommu_domain *domain, unsigned long iova, +	      phys_addr_t paddr, int gfp_order, int prot) +{ +	
unsigned long invalid_mask;
+	size_t size;
+
+	size         = 0x1000UL << gfp_order;
+	invalid_mask = size - 1;
+
+	BUG_ON((iova | paddr) & invalid_mask);
+
+	return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
+}
+EXPORT_SYMBOL_GPL(iommu_map);
+
+int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
+{
+	unsigned long invalid_mask;
+	size_t size;
+
+	size         = 0x1000UL << gfp_order;
+	invalid_mask = size - 1;
+
+	BUG_ON(iova & invalid_mask);
+
+	return iommu_ops->unmap(domain, iova, gfp_order);
+}
+EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
new file mode 100644
index 000000000000..c5c274ab5c5a
--- /dev/null
+++ b/drivers/iommu/iova.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright © 2006-2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/iova.h>
+
+void
+init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
+{
+	spin_lock_init(&iovad->iova_rbtree_lock);
+	iovad->rbroot = RB_ROOT;
+	iovad->cached32_node = NULL;
+	iovad->dma_32bit_pfn = pfn_32bit;
+}
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+{
+	if ((*limit_pfn != iovad->dma_32bit_pfn) ||
+		(iovad->cached32_node == NULL))
+		return rb_last(&iovad->rbroot);
+	else {
+		struct rb_node *prev_node = rb_prev(iovad->cached32_node);
+		struct iova *curr_iova =
+			container_of(iovad->cached32_node, struct iova, node);
+		*limit_pfn = curr_iova->pfn_lo - 1;
+		return prev_node;
+	}
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad,
+	unsigned long limit_pfn, struct iova *new)
+{
+	if (limit_pfn != iovad->dma_32bit_pfn)
+		return;
+	iovad->cached32_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+	struct iova *cached_iova;
+	struct rb_node *curr;
+
+	if (!iovad->cached32_node)
+		return;
+	curr = iovad->cached32_node;
+	cached_iova = container_of(curr, struct iova, node);
+
+	if (free->pfn_lo >= cached_iova->pfn_lo) {
+		struct rb_node *node = rb_next(&free->node);
+		struct iova *iova = container_of(node, struct iova, node);
+
+		/* only cache if it's below 32bit pfn */
+		if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
+			iovad->cached32_node = node;
+		else
+			iovad->cached32_node = NULL;
+	}
+}
+
+/* Computes the padding size required to make the
+ * start address naturally aligned on its size
+ */
+static int
+iova_get_pad_size(int size, unsigned int limit_pfn)
+{
+	unsigned int pad_size = 0;
+	unsigned int order = ilog2(size);
+
+	if (order)
+		pad_size = (limit_pfn + 1) % (1 << order);
+
+	return pad_size;
+}
+
+static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
+		unsigned long size, unsigned long limit_pfn,
+			struct iova *new, bool size_aligned)
+{ +	struct rb_node *prev, *curr = NULL; +	unsigned long flags; +	unsigned long saved_pfn; +	unsigned int pad_size = 0; + +	/* Walk the tree backwards */ +	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); +	saved_pfn = limit_pfn; +	curr = __get_cached_rbnode(iovad, &limit_pfn); +	prev = curr; +	while (curr) { +		struct iova *curr_iova = container_of(curr, struct iova, node); + +		if (limit_pfn < curr_iova->pfn_lo) +			goto move_left; +		else if (limit_pfn < curr_iova->pfn_hi) +			goto adjust_limit_pfn; +		else { +			if (size_aligned) +				pad_size = iova_get_pad_size(size, limit_pfn); +			if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn) +				break;	/* found a free slot */ +		} +adjust_limit_pfn: +		limit_pfn = curr_iova->pfn_lo - 1; +move_left: +		prev = curr; +		curr = rb_prev(curr); +	} + +	if (!curr) { +		if (size_aligned) +			pad_size = iova_get_pad_size(size, limit_pfn); +		if ((IOVA_START_PFN + size + pad_size) > limit_pfn) { +			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); +			return -ENOMEM; +		} +	} + +	/* pfn_lo will point to size aligned address if size_aligned is set */ +	new->pfn_lo = limit_pfn - (size + pad_size) + 1; +	new->pfn_hi = new->pfn_lo + size - 1; + +	/* Insert the new_iova into domain rbtree by holding writer lock */ +	/* Add new node and rebalance tree. */ +	{ +		struct rb_node **entry, *parent = NULL; + +		/* If we have 'prev', it's a valid place to start the +		   insertion. Otherwise, start from the root. */ +		if (prev) +			entry = &prev; +		else +			entry = &iovad->rbroot.rb_node; + +		/* Figure out where to put new node */ +		while (*entry) { +			struct iova *this = container_of(*entry, +							struct iova, node); +			parent = *entry; + +			if (new->pfn_lo < this->pfn_lo) +				entry = &((*entry)->rb_left); +			else if (new->pfn_lo > this->pfn_lo) +				entry = &((*entry)->rb_right); +			else +				BUG(); /* this should not happen */ +		} + +		/* Add new node and rebalance tree. */ +		rb_link_node(&new->node, parent, entry); +		rb_insert_color(&new->node, &iovad->rbroot); +	} +	__cached_rbnode_insert_update(iovad, saved_pfn, new); + +	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + + +	return 0; +} + +static void +iova_insert_rbtree(struct rb_root *root, struct iova *iova) +{ +	struct rb_node **new = &(root->rb_node), *parent = NULL; +	/* Figure out where to put new node */ +	while (*new) { +		struct iova *this = container_of(*new, struct iova, node); +		parent = *new; + +		if (iova->pfn_lo < this->pfn_lo) +			new = &((*new)->rb_left); +		else if (iova->pfn_lo > this->pfn_lo) +			new = &((*new)->rb_right); +		else +			BUG(); /* this should not happen */ +	} +	/* Add new node and rebalance tree. */ +	rb_link_node(&iova->node, parent, new); +	rb_insert_color(&iova->node, root); +} + +/** + * alloc_iova - allocates an iova + * @iovad - iova domain in question + * @size - size of page frames to allocate + * @limit_pfn - max limit address + * @size_aligned - set if size_aligned address range is required + * This function allocates an iova in the range limit_pfn to IOVA_START_PFN + * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned + * flag is set then the allocated address iova->pfn_lo will be naturally + * aligned on roundup_power_of_two(size). 
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned)
+{
+	struct iova *new_iova;
+	int ret;
+
+	new_iova = alloc_iova_mem();
+	if (!new_iova)
+		return NULL;
+
+	/* If size aligned is set then round the size to
+	 * the next power of two.
+	 */
+	if (size_aligned)
+		size = __roundup_pow_of_two(size);
+
+	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
+			new_iova, size_aligned);
+
+	if (ret) {
+		free_iova_mem(new_iova);
+		return NULL;
+	}
+
+	return new_iova;
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad - iova domain in question.
+ * @pfn - page frame number
+ * This function finds and returns an iova belonging to the
+ * given domain which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct rb_node *node;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	node = iovad->rbroot.rb_node;
+	while (node) {
+		struct iova *iova = container_of(node, struct iova, node);
+
+		/* If pfn falls within iova's range, return iova */
+		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
+			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+			/* We are not holding the lock while this iova
+			 * is referenced by the caller as the same thread
+			 * which called this function also calls __free_iova()
+			 * and it is by design that only one thread can possibly
+			 * reference a particular iova and hence no conflict.
+			 */
+			return iova;
+		}
+
+		if (pfn < iova->pfn_lo)
+			node = node->rb_left;
+		else if (pfn > iova->pfn_lo)
+			node = node->rb_right;
+	}
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return NULL;
+}
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the given domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	__cached_rbnode_delete_update(iovad, iova);
+	rb_erase(&iova->node, &iovad->rbroot);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	free_iova_mem(iova);
+}
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * This function finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	struct iova *iova = find_iova(iovad, pfn);
+	if (iova)
+		__free_iova(iovad, iova);
+
+}
+
+/**
+ * put_iova_domain - destroys the iova domain
+ * @iovad: - iova domain in question.
+ * All the iovas in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	node = rb_first(&iovad->rbroot);
+	while (node) {
+		struct iova *iova = container_of(node, struct iova, node);
+		rb_erase(node, &iovad->rbroot);
+		free_iova_mem(iova);
+		node = rb_first(&iovad->rbroot);
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static int
+__is_range_overlap(struct rb_node *node,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova = container_of(node, struct iova, node);
+
+	if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+		return 1;
+	return 0;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct iova *iova;
+
+	iova = alloc_iova_mem();
+	if (!iova)
+		return iova;
+
+	iova->pfn_hi = pfn_hi;
+	iova->pfn_lo = pfn_lo;
+	iova_insert_rbtree(&iovad->rbroot, iova);
+	return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+	unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+	if (*pfn_lo < iova->pfn_lo)
+		iova->pfn_lo = *pfn_lo;
+	if (*pfn_hi > iova->pfn_hi)
+		*pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: - iova domain pointer
+ * @pfn_lo: - lower page frame address
+ * @pfn_hi: - higher pfn address
+ * This function reserves the address range from pfn_lo to pfn_hi so
+ * that this range is not dished out as part of alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+	unsigned long pfn_lo, unsigned long pfn_hi)
+{
+	struct rb_node *node;
+	unsigned long flags;
+	struct iova *iova;
+	unsigned int overlap = 0;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+			iova = container_of(node, struct iova, node);
+			__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+			if ((pfn_lo >= iova->pfn_lo) &&
+				(pfn_hi <= iova->pfn_hi))
+				goto finish;
+			overlap = 1;
+
+		} else if (overlap)
+				break;
+	}
+
+	/* We are here either because this is the first reserved node
+	 * or we need to insert the remaining non-overlapping addr range
+	 */
+	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return iova;
+}
+
+/**
+ * copy_reserved_iova - copies the reserved iovas between domains
+ * @from: - source domain from where to copy
+ * @to: - destination domain where to copy
+ * This function copies reserved iovas from one domain to
+ * the other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+	unsigned long flags;
+	struct rb_node *node;
+
+	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
+	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+		struct iova *iova = container_of(node, struct iova, node);
+		struct iova *new_iova;
+		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+		if (!new_iova)
+			printk(KERN_ERR "Reserve iova range %lx-%lx failed\n",
+				iova->pfn_lo, iova->pfn_hi);
+	}
+	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+}
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
new file mode 100644
index 000000000000..1a584e077c61
--- /dev/null
+++ b/drivers/iommu/msm_iommu.c
@@ -0,0 +1,731 @@
+/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/clk.h> + +#include <asm/cacheflush.h> +#include <asm/sizes.h> + +#include <mach/iommu_hw-8xxx.h> +#include <mach/iommu.h> + +#define MRC(reg, processor, op1, crn, crm, op2)				\ +__asm__ __volatile__ (							\ +"   mrc   "   #processor "," #op1 ", %0,"  #crn "," #crm "," #op2 "\n"  \ +: "=r" (reg)) + +#define RCP15_PRRR(reg)		MRC(reg, p15, 0, c10, c2, 0) +#define RCP15_NMRR(reg)		MRC(reg, p15, 0, c10, c2, 1) + +static int msm_iommu_tex_class[4]; + +DEFINE_SPINLOCK(msm_iommu_lock); + +struct msm_priv { +	unsigned long *pgtable; +	struct list_head list_attached; +}; + +static int __enable_clocks(struct msm_iommu_drvdata *drvdata) +{ +	int ret; + +	ret = clk_enable(drvdata->pclk); +	if (ret) +		goto fail; + +	if (drvdata->clk) { +		ret = clk_enable(drvdata->clk); +		if (ret) +			clk_disable(drvdata->pclk); +	} +fail: +	return ret; +} + +static void __disable_clocks(struct msm_iommu_drvdata *drvdata) +{ +	if (drvdata->clk) +		clk_disable(drvdata->clk); +	clk_disable(drvdata->pclk); +} + +static int __flush_iotlb(struct iommu_domain *domain) +{ +	struct msm_priv *priv = domain->priv; +	struct msm_iommu_drvdata *iommu_drvdata; +	struct msm_iommu_ctx_drvdata *ctx_drvdata; +	int ret = 0; +#ifndef CONFIG_IOMMU_PGTABLES_L2 +	unsigned long *fl_table = priv->pgtable; +	int i; + +	if (!list_empty(&priv->list_attached)) { +		dmac_flush_range(fl_table, fl_table + SZ_16K); + +		for (i = 0; i < NUM_FL_PTE; i++) +			if ((fl_table[i] & 0x03) == FL_TYPE_TABLE) { +				void *sl_table = __va(fl_table[i] & +								FL_BASE_MASK); +				dmac_flush_range(sl_table, sl_table + SZ_4K); +			} +	} +#endif + +	list_for_each_entry(ctx_drvdata, &priv->list_attached, attached_elm) { +		if (!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent) +			BUG(); + +		iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent); +		BUG_ON(!iommu_drvdata); + +		ret = __enable_clocks(iommu_drvdata); +		if (ret) +			goto fail; + +		SET_CTX_TLBIALL(iommu_drvdata->base, ctx_drvdata->num, 0); +		__disable_clocks(iommu_drvdata); +	} +fail: +	return ret; +} + +static void __reset_context(void __iomem *base, int ctx) +{ +	SET_BPRCOSH(base, ctx, 0); +	SET_BPRCISH(base, ctx, 0); +	SET_BPRCNSH(base, ctx, 0); +	SET_BPSHCFG(base, ctx, 0); +	SET_BPMTCFG(base, ctx, 0); +	SET_ACTLR(base, ctx, 0); +	SET_SCTLR(base, ctx, 0); +	SET_FSRRESTORE(base, ctx, 0); +	SET_TTBR0(base, ctx, 0); +	SET_TTBR1(base, ctx, 0); +	SET_TTBCR(base, ctx, 0); +	SET_BFBCR(base, ctx, 0); +	SET_PAR(base, ctx, 0); +	SET_FAR(base, ctx, 0); +	SET_CTX_TLBIALL(base, 
ctx, 0); +	SET_TLBFLPTER(base, ctx, 0); +	SET_TLBSLPTER(base, ctx, 0); +	SET_TLBLKCR(base, ctx, 0); +	SET_PRRR(base, ctx, 0); +	SET_NMRR(base, ctx, 0); +} + +static void __program_context(void __iomem *base, int ctx, phys_addr_t pgtable) +{ +	unsigned int prrr, nmrr; +	__reset_context(base, ctx); + +	/* Set up HTW mode */ +	/* TLB miss configuration: perform HTW on miss */ +	SET_TLBMCFG(base, ctx, 0x3); + +	/* V2P configuration: HTW for access */ +	SET_V2PCFG(base, ctx, 0x3); + +	SET_TTBCR(base, ctx, 0); +	SET_TTBR0_PA(base, ctx, (pgtable >> 14)); + +	/* Invalidate the TLB for this context */ +	SET_CTX_TLBIALL(base, ctx, 0); + +	/* Set interrupt number to "secure" interrupt */ +	SET_IRPTNDX(base, ctx, 0); + +	/* Enable context fault interrupt */ +	SET_CFEIE(base, ctx, 1); + +	/* Stall access on a context fault and let the handler deal with it */ +	SET_CFCFG(base, ctx, 1); + +	/* Redirect all cacheable requests to L2 slave port. */ +	SET_RCISH(base, ctx, 1); +	SET_RCOSH(base, ctx, 1); +	SET_RCNSH(base, ctx, 1); + +	/* Turn on TEX Remap */ +	SET_TRE(base, ctx, 1); + +	/* Set TEX remap attributes */ +	RCP15_PRRR(prrr); +	RCP15_NMRR(nmrr); +	SET_PRRR(base, ctx, prrr); +	SET_NMRR(base, ctx, nmrr); + +	/* Turn on BFB prefetch */ +	SET_BFBDFE(base, ctx, 1); + +#ifdef CONFIG_IOMMU_PGTABLES_L2 +	/* Configure page tables as inner-cacheable and shareable to reduce +	 * the TLB miss penalty. +	 */ +	SET_TTBR0_SH(base, ctx, 1); +	SET_TTBR1_SH(base, ctx, 1); + +	SET_TTBR0_NOS(base, ctx, 1); +	SET_TTBR1_NOS(base, ctx, 1); + +	SET_TTBR0_IRGNH(base, ctx, 0); /* WB, WA */ +	SET_TTBR0_IRGNL(base, ctx, 1); + +	SET_TTBR1_IRGNH(base, ctx, 0); /* WB, WA */ +	SET_TTBR1_IRGNL(base, ctx, 1); + +	SET_TTBR0_ORGN(base, ctx, 1); /* WB, WA */ +	SET_TTBR1_ORGN(base, ctx, 1); /* WB, WA */ +#endif + +	/* Enable the MMU */ +	SET_M(base, ctx, 1); +} + +static int msm_iommu_domain_init(struct iommu_domain *domain) +{ +	struct msm_priv *priv = kzalloc(sizeof(*priv), GFP_KERNEL); + +	if (!priv) +		goto fail_nomem; + +	INIT_LIST_HEAD(&priv->list_attached); +	priv->pgtable = (unsigned long *)__get_free_pages(GFP_KERNEL, +							  get_order(SZ_16K)); + +	if (!priv->pgtable) +		goto fail_nomem; + +	memset(priv->pgtable, 0, SZ_16K); +	domain->priv = priv; +	return 0; + +fail_nomem: +	kfree(priv); +	return -ENOMEM; +} + +static void msm_iommu_domain_destroy(struct iommu_domain *domain) +{ +	struct msm_priv *priv; +	unsigned long flags; +	unsigned long *fl_table; +	int i; + +	spin_lock_irqsave(&msm_iommu_lock, flags); +	priv = domain->priv; +	domain->priv = NULL; + +	if (priv) { +		fl_table = priv->pgtable; + +		for (i = 0; i < NUM_FL_PTE; i++) +			if ((fl_table[i] & 0x03) == FL_TYPE_TABLE) +				free_page((unsigned long) __va(((fl_table[i]) & +								FL_BASE_MASK))); + +		free_pages((unsigned long)priv->pgtable, get_order(SZ_16K)); +		priv->pgtable = NULL; +	} + +	kfree(priv); +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +} + +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +{ +	struct msm_priv *priv; +	struct msm_iommu_ctx_dev *ctx_dev; +	struct msm_iommu_drvdata *iommu_drvdata; +	struct msm_iommu_ctx_drvdata *ctx_drvdata; +	struct msm_iommu_ctx_drvdata *tmp_drvdata; +	int ret = 0; +	unsigned long flags; + +	spin_lock_irqsave(&msm_iommu_lock, flags); + +	priv = domain->priv; + +	if (!priv || !dev) { +		ret = -EINVAL; +		goto fail; +	} + +	iommu_drvdata = dev_get_drvdata(dev->parent); +	ctx_drvdata = dev_get_drvdata(dev); +	ctx_dev = dev->platform_data; + +	if (!iommu_drvdata || !ctx_drvdata || 
!ctx_dev) { +		ret = -EINVAL; +		goto fail; +	} + +	if (!list_empty(&ctx_drvdata->attached_elm)) { +		ret = -EBUSY; +		goto fail; +	} + +	list_for_each_entry(tmp_drvdata, &priv->list_attached, attached_elm) +		if (tmp_drvdata == ctx_drvdata) { +			ret = -EBUSY; +			goto fail; +		} + +	ret = __enable_clocks(iommu_drvdata); +	if (ret) +		goto fail; + +	__program_context(iommu_drvdata->base, ctx_dev->num, +			  __pa(priv->pgtable)); + +	__disable_clocks(iommu_drvdata); +	list_add(&(ctx_drvdata->attached_elm), &priv->list_attached); +	ret = __flush_iotlb(domain); + +fail: +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +	return ret; +} + +static void msm_iommu_detach_dev(struct iommu_domain *domain, +				 struct device *dev) +{ +	struct msm_priv *priv; +	struct msm_iommu_ctx_dev *ctx_dev; +	struct msm_iommu_drvdata *iommu_drvdata; +	struct msm_iommu_ctx_drvdata *ctx_drvdata; +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&msm_iommu_lock, flags); +	priv = domain->priv; + +	if (!priv || !dev) +		goto fail; + +	iommu_drvdata = dev_get_drvdata(dev->parent); +	ctx_drvdata = dev_get_drvdata(dev); +	ctx_dev = dev->platform_data; + +	if (!iommu_drvdata || !ctx_drvdata || !ctx_dev) +		goto fail; + +	ret = __flush_iotlb(domain); +	if (ret) +		goto fail; + +	ret = __enable_clocks(iommu_drvdata); +	if (ret) +		goto fail; + +	__reset_context(iommu_drvdata->base, ctx_dev->num); +	__disable_clocks(iommu_drvdata); +	list_del_init(&ctx_drvdata->attached_elm); + +fail: +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +} + +static int msm_iommu_map(struct iommu_domain *domain, unsigned long va, +			 phys_addr_t pa, int order, int prot) +{ +	struct msm_priv *priv; +	unsigned long flags; +	unsigned long *fl_table; +	unsigned long *fl_pte; +	unsigned long fl_offset; +	unsigned long *sl_table; +	unsigned long *sl_pte; +	unsigned long sl_offset; +	unsigned int pgprot; +	size_t len = 0x1000UL << order; +	int ret = 0, tex, sh; + +	spin_lock_irqsave(&msm_iommu_lock, flags); + +	sh = (prot & MSM_IOMMU_ATTR_SH) ? 1 : 0; +	tex = msm_iommu_tex_class[prot & MSM_IOMMU_CP_MASK]; + +	if (tex < 0 || tex > NUM_TEX_CLASS - 1) { +		ret = -EINVAL; +		goto fail; +	} + +	priv = domain->priv; +	if (!priv) { +		ret = -EINVAL; +		goto fail; +	} + +	fl_table = priv->pgtable; + +	if (len != SZ_16M && len != SZ_1M && +	    len != SZ_64K && len != SZ_4K) { +		pr_debug("Bad size: %d\n", len); +		ret = -EINVAL; +		goto fail; +	} + +	if (!fl_table) { +		pr_debug("Null page table\n"); +		ret = -EINVAL; +		goto fail; +	} + +	if (len == SZ_16M || len == SZ_1M) { +		pgprot = sh ? FL_SHARED : 0; +		pgprot |= tex & 0x01 ? FL_BUFFERABLE : 0; +		pgprot |= tex & 0x02 ? FL_CACHEABLE : 0; +		pgprot |= tex & 0x04 ? FL_TEX0 : 0; +	} else	{ +		pgprot = sh ? SL_SHARED : 0; +		pgprot |= tex & 0x01 ? SL_BUFFERABLE : 0; +		pgprot |= tex & 0x02 ? SL_CACHEABLE : 0; +		pgprot |= tex & 0x04 ? 
SL_TEX0 : 0; +	} + +	fl_offset = FL_OFFSET(va);	/* Upper 12 bits */ +	fl_pte = fl_table + fl_offset;	/* int pointers, 4 bytes */ + +	if (len == SZ_16M) { +		int i = 0; +		for (i = 0; i < 16; i++) +			*(fl_pte+i) = (pa & 0xFF000000) | FL_SUPERSECTION | +				  FL_AP_READ | FL_AP_WRITE | FL_TYPE_SECT | +				  FL_SHARED | FL_NG | pgprot; +	} + +	if (len == SZ_1M) +		*fl_pte = (pa & 0xFFF00000) | FL_AP_READ | FL_AP_WRITE | FL_NG | +					    FL_TYPE_SECT | FL_SHARED | pgprot; + +	/* Need a 2nd level table */ +	if ((len == SZ_4K || len == SZ_64K) && (*fl_pte) == 0) { +		unsigned long *sl; +		sl = (unsigned long *) __get_free_pages(GFP_ATOMIC, +							get_order(SZ_4K)); + +		if (!sl) { +			pr_debug("Could not allocate second level table\n"); +			ret = -ENOMEM; +			goto fail; +		} + +		memset(sl, 0, SZ_4K); +		*fl_pte = ((((int)__pa(sl)) & FL_BASE_MASK) | FL_TYPE_TABLE); +	} + +	sl_table = (unsigned long *) __va(((*fl_pte) & FL_BASE_MASK)); +	sl_offset = SL_OFFSET(va); +	sl_pte = sl_table + sl_offset; + + +	if (len == SZ_4K) +		*sl_pte = (pa & SL_BASE_MASK_SMALL) | SL_AP0 | SL_AP1 | SL_NG | +					  SL_SHARED | SL_TYPE_SMALL | pgprot; + +	if (len == SZ_64K) { +		int i; + +		for (i = 0; i < 16; i++) +			*(sl_pte+i) = (pa & SL_BASE_MASK_LARGE) | SL_AP0 | +			    SL_NG | SL_AP1 | SL_SHARED | SL_TYPE_LARGE | pgprot; +	} + +	ret = __flush_iotlb(domain); +fail: +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +	return ret; +} + +static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, +			    int order) +{ +	struct msm_priv *priv; +	unsigned long flags; +	unsigned long *fl_table; +	unsigned long *fl_pte; +	unsigned long fl_offset; +	unsigned long *sl_table; +	unsigned long *sl_pte; +	unsigned long sl_offset; +	size_t len = 0x1000UL << order; +	int i, ret = 0; + +	spin_lock_irqsave(&msm_iommu_lock, flags); + +	priv = domain->priv; + +	if (!priv) { +		ret = -ENODEV; +		goto fail; +	} + +	fl_table = priv->pgtable; + +	if (len != SZ_16M && len != SZ_1M && +	    len != SZ_64K && len != SZ_4K) { +		pr_debug("Bad length: %d\n", len); +		ret = -EINVAL; +		goto fail; +	} + +	if (!fl_table) { +		pr_debug("Null page table\n"); +		ret = -EINVAL; +		goto fail; +	} + +	fl_offset = FL_OFFSET(va);	/* Upper 12 bits */ +	fl_pte = fl_table + fl_offset;	/* int pointers, 4 bytes */ + +	if (*fl_pte == 0) { +		pr_debug("First level PTE is 0\n"); +		ret = -ENODEV; +		goto fail; +	} + +	/* Unmap supersection */ +	if (len == SZ_16M) +		for (i = 0; i < 16; i++) +			*(fl_pte+i) = 0; + +	if (len == SZ_1M) +		*fl_pte = 0; + +	sl_table = (unsigned long *) __va(((*fl_pte) & FL_BASE_MASK)); +	sl_offset = SL_OFFSET(va); +	sl_pte = sl_table + sl_offset; + +	if (len == SZ_64K) { +		for (i = 0; i < 16; i++) +			*(sl_pte+i) = 0; +	} + +	if (len == SZ_4K) +		*sl_pte = 0; + +	if (len == SZ_4K || len == SZ_64K) { +		int used = 0; + +		for (i = 0; i < NUM_SL_PTE; i++) +			if (sl_table[i]) +				used = 1; +		if (!used) { +			free_page((unsigned long)sl_table); +			*fl_pte = 0; +		} +	} + +	ret = __flush_iotlb(domain); +fail: +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +	return ret; +} + +static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain, +					  unsigned long va) +{ +	struct msm_priv *priv; +	struct msm_iommu_drvdata *iommu_drvdata; +	struct msm_iommu_ctx_drvdata *ctx_drvdata; +	unsigned int par; +	unsigned long flags; +	void __iomem *base; +	phys_addr_t ret = 0; +	int ctx; + +	spin_lock_irqsave(&msm_iommu_lock, flags); + +	priv = domain->priv; +	if (list_empty(&priv->list_attached)) +		goto fail; + +	
ctx_drvdata = list_entry(priv->list_attached.next, +				 struct msm_iommu_ctx_drvdata, attached_elm); +	iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent); + +	base = iommu_drvdata->base; +	ctx = ctx_drvdata->num; + +	ret = __enable_clocks(iommu_drvdata); +	if (ret) +		goto fail; + +	/* Invalidate context TLB */ +	SET_CTX_TLBIALL(base, ctx, 0); +	SET_V2PPR(base, ctx, va & V2Pxx_VA); + +	par = GET_PAR(base, ctx); + +	/* We are dealing with a supersection */ +	if (GET_NOFAULT_SS(base, ctx)) +		ret = (par & 0xFF000000) | (va & 0x00FFFFFF); +	else	/* Upper 20 bits from PAR, lower 12 from VA */ +		ret = (par & 0xFFFFF000) | (va & 0x00000FFF); + +	if (GET_FAULT(base, ctx)) +		ret = 0; + +	__disable_clocks(iommu_drvdata); +fail: +	spin_unlock_irqrestore(&msm_iommu_lock, flags); +	return ret; +} + +static int msm_iommu_domain_has_cap(struct iommu_domain *domain, +				    unsigned long cap) +{ +	return 0; +} + +static void print_ctx_regs(void __iomem *base, int ctx) +{ +	unsigned int fsr = GET_FSR(base, ctx); +	pr_err("FAR    = %08x    PAR    = %08x\n", +	       GET_FAR(base, ctx), GET_PAR(base, ctx)); +	pr_err("FSR    = %08x [%s%s%s%s%s%s%s%s%s%s]\n", fsr, +			(fsr & 0x02) ? "TF " : "", +			(fsr & 0x04) ? "AFF " : "", +			(fsr & 0x08) ? "APF " : "", +			(fsr & 0x10) ? "TLBMF " : "", +			(fsr & 0x20) ? "HTWDEEF " : "", +			(fsr & 0x40) ? "HTWSEEF " : "", +			(fsr & 0x80) ? "MHF " : "", +			(fsr & 0x10000) ? "SL " : "", +			(fsr & 0x40000000) ? "SS " : "", +			(fsr & 0x80000000) ? "MULTI " : ""); + +	pr_err("FSYNR0 = %08x    FSYNR1 = %08x\n", +	       GET_FSYNR0(base, ctx), GET_FSYNR1(base, ctx)); +	pr_err("TTBR0  = %08x    TTBR1  = %08x\n", +	       GET_TTBR0(base, ctx), GET_TTBR1(base, ctx)); +	pr_err("SCTLR  = %08x    ACTLR  = %08x\n", +	       GET_SCTLR(base, ctx), GET_ACTLR(base, ctx)); +	pr_err("PRRR   = %08x    NMRR   = %08x\n", +	       GET_PRRR(base, ctx), GET_NMRR(base, ctx)); +} + +irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id) +{ +	struct msm_iommu_drvdata *drvdata = dev_id; +	void __iomem *base; +	unsigned int fsr; +	int i, ret; + +	spin_lock(&msm_iommu_lock); + +	if (!drvdata) { +		pr_err("Invalid device ID in context interrupt handler\n"); +		goto fail; +	} + +	base = drvdata->base; + +	pr_err("Unexpected IOMMU page fault!\n"); +	pr_err("base = %08x\n", (unsigned int) base); + +	ret = __enable_clocks(drvdata); +	if (ret) +		goto fail; + +	for (i = 0; i < drvdata->ncb; i++) { +		fsr = GET_FSR(base, i); +		if (fsr) { +			pr_err("Fault occurred in context %d.\n", i); +			pr_err("Interesting registers:\n"); +			print_ctx_regs(base, i); +			SET_FSR(base, i, 0x4000000F); +		} +	} +	__disable_clocks(drvdata); +fail: +	spin_unlock(&msm_iommu_lock); +	return 0; +} + +static struct iommu_ops msm_iommu_ops = { +	.domain_init = msm_iommu_domain_init, +	.domain_destroy = msm_iommu_domain_destroy, +	.attach_dev = msm_iommu_attach_dev, +	.detach_dev = msm_iommu_detach_dev, +	.map = msm_iommu_map, +	.unmap = msm_iommu_unmap, +	.iova_to_phys = msm_iommu_iova_to_phys, +	.domain_has_cap = msm_iommu_domain_has_cap +}; + +static int __init get_tex_class(int icp, int ocp, int mt, int nos) +{ +	int i = 0; +	unsigned int prrr = 0; +	unsigned int nmrr = 0; +	int c_icp, c_ocp, c_mt, c_nos; + +	RCP15_PRRR(prrr); +	RCP15_NMRR(nmrr); + +	for (i = 0; i < NUM_TEX_CLASS; i++) { +		c_nos = PRRR_NOS(prrr, i); +		c_mt = PRRR_MT(prrr, i); +		c_icp = NMRR_ICP(nmrr, i); +		c_ocp = NMRR_OCP(nmrr, i); + +		if (icp == c_icp && ocp == c_ocp && c_mt == mt && c_nos == nos) +			return i; +	} + +	return 
-ENODEV; +} + +static void __init setup_iommu_tex_classes(void) +{ +	msm_iommu_tex_class[MSM_IOMMU_ATTR_NONCACHED] = +			get_tex_class(CP_NONCACHED, CP_NONCACHED, MT_NORMAL, 1); + +	msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WB_WA] = +			get_tex_class(CP_WB_WA, CP_WB_WA, MT_NORMAL, 1); + +	msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WB_NWA] = +			get_tex_class(CP_WB_NWA, CP_WB_NWA, MT_NORMAL, 1); + +	msm_iommu_tex_class[MSM_IOMMU_ATTR_CACHED_WT] = +			get_tex_class(CP_WT, CP_WT, MT_NORMAL, 1); +} + +static int __init msm_iommu_init(void) +{ +	setup_iommu_tex_classes(); +	register_iommu(&msm_iommu_ops); +	return 0; +} + +subsys_initcall(msm_iommu_init); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Stepan Moskovchenko <stepanm@codeaurora.org>"); diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c new file mode 100644 index 000000000000..8e8fb079852d --- /dev/null +++ b/drivers/iommu/msm_iommu_dev.c @@ -0,0 +1,422 @@ +/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <linux/clk.h> +#include <linux/iommu.h> +#include <linux/interrupt.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <mach/iommu_hw-8xxx.h> +#include <mach/iommu.h> +#include <mach/clk.h> + +struct iommu_ctx_iter_data { +	/* input */ +	const char *name; + +	/* output */ +	struct device *dev; +}; + +static struct platform_device *msm_iommu_root_dev; + +static int each_iommu_ctx(struct device *dev, void *data) +{ +	struct iommu_ctx_iter_data *res = data; +	struct msm_iommu_ctx_dev *c = dev->platform_data; + +	if (!res || !c || !c->name || !res->name) +		return -EINVAL; + +	if (!strcmp(res->name, c->name)) { +		res->dev = dev; +		return 1; +	} +	return 0; +} + +static int each_iommu(struct device *dev, void *data) +{ +	return device_for_each_child(dev, data, each_iommu_ctx); +} + +struct device *msm_iommu_get_ctx(const char *ctx_name) +{ +	struct iommu_ctx_iter_data r; +	int found; + +	if (!msm_iommu_root_dev) { +		pr_err("No root IOMMU device.\n"); +		goto fail; +	} + +	r.name = ctx_name; +	found = device_for_each_child(&msm_iommu_root_dev->dev, &r, each_iommu); + +	if (!found) { +		pr_err("Could not find context <%s>\n", ctx_name); +		goto fail; +	} + +	return r.dev; +fail: +	return NULL; +} +EXPORT_SYMBOL(msm_iommu_get_ctx); + +static void msm_iommu_reset(void __iomem *base, int ncb) +{ +	int ctx; + +	SET_RPUE(base, 0); +	SET_RPUEIE(base, 0); +	SET_ESRRESTORE(base, 0); +	SET_TBE(base, 0); +	SET_CR(base, 0); +	SET_SPDMBE(base, 0); +	SET_TESTBUSCR(base, 0); +	SET_TLBRSW(base, 0); +	SET_GLOBAL_TLBIALL(base, 0); +	SET_RPU_ACR(base, 0); +	SET_TLBLKCRWE(base, 1); + +	for (ctx = 0; ctx < ncb; ctx++) { +		SET_BPRCOSH(base, ctx, 0); +		
SET_BPRCISH(base, ctx, 0); +		SET_BPRCNSH(base, ctx, 0); +		SET_BPSHCFG(base, ctx, 0); +		SET_BPMTCFG(base, ctx, 0); +		SET_ACTLR(base, ctx, 0); +		SET_SCTLR(base, ctx, 0); +		SET_FSRRESTORE(base, ctx, 0); +		SET_TTBR0(base, ctx, 0); +		SET_TTBR1(base, ctx, 0); +		SET_TTBCR(base, ctx, 0); +		SET_BFBCR(base, ctx, 0); +		SET_PAR(base, ctx, 0); +		SET_FAR(base, ctx, 0); +		SET_CTX_TLBIALL(base, ctx, 0); +		SET_TLBFLPTER(base, ctx, 0); +		SET_TLBSLPTER(base, ctx, 0); +		SET_TLBLKCR(base, ctx, 0); +		SET_PRRR(base, ctx, 0); +		SET_NMRR(base, ctx, 0); +		SET_CONTEXTIDR(base, ctx, 0); +	} +} + +static int msm_iommu_probe(struct platform_device *pdev) +{ +	struct resource *r, *r2; +	struct clk *iommu_clk; +	struct clk *iommu_pclk; +	struct msm_iommu_drvdata *drvdata; +	struct msm_iommu_dev *iommu_dev = pdev->dev.platform_data; +	void __iomem *regs_base; +	resource_size_t	len; +	int ret, irq, par; + +	if (pdev->id == -1) { +		msm_iommu_root_dev = pdev; +		return 0; +	} + +	drvdata = kzalloc(sizeof(*drvdata), GFP_KERNEL); + +	if (!drvdata) { +		ret = -ENOMEM; +		goto fail; +	} + +	if (!iommu_dev) { +		ret = -ENODEV; +		goto fail; +	} + +	iommu_pclk = clk_get(NULL, "smmu_pclk"); +	if (IS_ERR(iommu_pclk)) { +		ret = -ENODEV; +		goto fail; +	} + +	ret = clk_enable(iommu_pclk); +	if (ret) +		goto fail_enable; + +	iommu_clk = clk_get(&pdev->dev, "iommu_clk"); + +	if (!IS_ERR(iommu_clk))	{ +		if (clk_get_rate(iommu_clk) == 0) +			clk_set_min_rate(iommu_clk, 1); + +		ret = clk_enable(iommu_clk); +		if (ret) { +			clk_put(iommu_clk); +			goto fail_pclk; +		} +	} else +		iommu_clk = NULL; + +	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "physbase"); + +	if (!r) { +		ret = -ENODEV; +		goto fail_clk; +	} + +	len = resource_size(r); + +	r2 = request_mem_region(r->start, len, r->name); +	if (!r2) { +		pr_err("Could not request memory region: start=%p, len=%d\n", +							(void *) r->start, len); +		ret = -EBUSY; +		goto fail_clk; +	} + +	regs_base = ioremap(r2->start, len); + +	if (!regs_base) { +		pr_err("Could not ioremap: start=%p, len=%d\n", +			 (void *) r2->start, len); +		ret = -EBUSY; +		goto fail_mem; +	} + +	irq = platform_get_irq_byname(pdev, "secure_irq"); +	if (irq < 0) { +		ret = -ENODEV; +		goto fail_io; +	} + +	msm_iommu_reset(regs_base, iommu_dev->ncb); + +	SET_M(regs_base, 0, 1); +	SET_PAR(regs_base, 0, 0); +	SET_V2PCFG(regs_base, 0, 1); +	SET_V2PPR(regs_base, 0, 0); +	par = GET_PAR(regs_base, 0); +	SET_V2PCFG(regs_base, 0, 0); +	SET_M(regs_base, 0, 0); + +	if (!par) { +		pr_err("%s: Invalid PAR value detected\n", iommu_dev->name); +		ret = -ENODEV; +		goto fail_io; +	} + +	ret = request_irq(irq, msm_iommu_fault_handler, 0, +			"msm_iommu_secure_irpt_handler", drvdata); +	if (ret) { +		pr_err("Request IRQ %d failed with ret=%d\n", irq, ret); +		goto fail_io; +	} + + +	drvdata->pclk = iommu_pclk; +	drvdata->clk = iommu_clk; +	drvdata->base = regs_base; +	drvdata->irq = irq; +	drvdata->ncb = iommu_dev->ncb; + +	pr_info("device %s mapped at %p, irq %d with %d ctx banks\n", +		iommu_dev->name, regs_base, irq, iommu_dev->ncb); + +	platform_set_drvdata(pdev, drvdata); + +	if (iommu_clk) +		clk_disable(iommu_clk); + +	clk_disable(iommu_pclk); + +	return 0; +fail_io: +	iounmap(regs_base); +fail_mem: +	release_mem_region(r->start, len); +fail_clk: +	if (iommu_clk) { +		clk_disable(iommu_clk); +		clk_put(iommu_clk); +	} +fail_pclk: +	clk_disable(iommu_pclk); +fail_enable: +	clk_put(iommu_pclk); +fail: +	kfree(drvdata); +	return ret; +} + +static int msm_iommu_remove(struct platform_device *pdev) 
+{ +	struct msm_iommu_drvdata *drv = NULL; + +	drv = platform_get_drvdata(pdev); +	if (drv) { +		if (drv->clk) +			clk_put(drv->clk); +		clk_put(drv->pclk); +		memset(drv, 0, sizeof(*drv)); +		kfree(drv); +		platform_set_drvdata(pdev, NULL); +	} +	return 0; +} + +static int msm_iommu_ctx_probe(struct platform_device *pdev) +{ +	struct msm_iommu_ctx_dev *c = pdev->dev.platform_data; +	struct msm_iommu_drvdata *drvdata; +	struct msm_iommu_ctx_drvdata *ctx_drvdata = NULL; +	int i, ret; +	if (!c || !pdev->dev.parent) { +		ret = -EINVAL; +		goto fail; +	} + +	drvdata = dev_get_drvdata(pdev->dev.parent); + +	if (!drvdata) { +		ret = -ENODEV; +		goto fail; +	} + +	ctx_drvdata = kzalloc(sizeof(*ctx_drvdata), GFP_KERNEL); +	if (!ctx_drvdata) { +		ret = -ENOMEM; +		goto fail; +	} +	ctx_drvdata->num = c->num; +	ctx_drvdata->pdev = pdev; + +	INIT_LIST_HEAD(&ctx_drvdata->attached_elm); +	platform_set_drvdata(pdev, ctx_drvdata); + +	ret = clk_enable(drvdata->pclk); +	if (ret) +		goto fail; + +	if (drvdata->clk) { +		ret = clk_enable(drvdata->clk); +		if (ret) { +			clk_disable(drvdata->pclk); +			goto fail; +		} +	} + +	/* Program the M2V tables for this context */ +	for (i = 0; i < MAX_NUM_MIDS; i++) { +		int mid = c->mids[i]; +		if (mid == -1) +			break; + +		SET_M2VCBR_N(drvdata->base, mid, 0); +		SET_CBACR_N(drvdata->base, c->num, 0); + +		/* Set VMID = 0 */ +		SET_VMID(drvdata->base, mid, 0); + +		/* Set the context number for that MID to this context */ +		SET_CBNDX(drvdata->base, mid, c->num); + +		/* Set MID associated with this context bank to 0*/ +		SET_CBVMID(drvdata->base, c->num, 0); + +		/* Set the ASID for TLB tagging for this context */ +		SET_CONTEXTIDR_ASID(drvdata->base, c->num, c->num); + +		/* Set security bit override to be Non-secure */ +		SET_NSCFG(drvdata->base, mid, 3); +	} + +	if (drvdata->clk) +		clk_disable(drvdata->clk); +	clk_disable(drvdata->pclk); + +	dev_info(&pdev->dev, "context %s using bank %d\n", c->name, c->num); +	return 0; +fail: +	kfree(ctx_drvdata); +	return ret; +} + +static int msm_iommu_ctx_remove(struct platform_device *pdev) +{ +	struct msm_iommu_ctx_drvdata *drv = NULL; +	drv = platform_get_drvdata(pdev); +	if (drv) { +		memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata)); +		kfree(drv); +		platform_set_drvdata(pdev, NULL); +	} +	return 0; +} + +static struct platform_driver msm_iommu_driver = { +	.driver = { +		.name	= "msm_iommu", +	}, +	.probe		= msm_iommu_probe, +	.remove		= msm_iommu_remove, +}; + +static struct platform_driver msm_iommu_ctx_driver = { +	.driver = { +		.name	= "msm_iommu_ctx", +	}, +	.probe		= msm_iommu_ctx_probe, +	.remove		= msm_iommu_ctx_remove, +}; + +static int __init msm_iommu_driver_init(void) +{ +	int ret; +	ret = platform_driver_register(&msm_iommu_driver); +	if (ret != 0) { +		pr_err("Failed to register IOMMU driver\n"); +		goto error; +	} + +	ret = platform_driver_register(&msm_iommu_ctx_driver); +	if (ret != 0) { +		pr_err("Failed to register IOMMU context driver\n"); +		goto error; +	} + +error: +	return ret; +} + +static void __exit msm_iommu_driver_exit(void) +{ +	platform_driver_unregister(&msm_iommu_ctx_driver); +	platform_driver_unregister(&msm_iommu_driver); +} + +subsys_initcall(msm_iommu_driver_init); +module_exit(msm_iommu_driver_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Stepan Moskovchenko <stepanm@codeaurora.org>"); | 
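Usage note (a minimal sketch, not part of the patch above): the generic layer added in drivers/iommu/iommu.c simply forwards every call to whatever iommu_ops was handed to register_iommu(), so a consumer only needs the handful of exported helpers shown below. The function name example_map_one_page and the chosen prot flags are illustrative assumptions; "dev" must belong to hardware that sits behind the registered IOMMU driver.

/*
 * Hedged sketch: map one 4 KiB page for a device through the generic API.
 * example_map_one_page is a made-up name; error handling is minimal.
 */
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/iommu.h>

static int example_map_one_page(struct device *dev, unsigned long iova,
				phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	if (!iommu_found())		/* no iommu_ops registered yet */
		return -ENODEV;

	domain = iommu_domain_alloc();
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* gfp_order 0 is one 0x1000-byte page, per the size computed above */
	ret = iommu_map(domain, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	return 0;

out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}

Because iommu_map() and iommu_unmap() BUG_ON() any iova or paddr that is not aligned to the 0x1000UL << gfp_order size they compute, callers are expected to pass addresses aligned to the mapping size.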
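A second sketch, this time against the address-space allocator in iova.c: it assumes alloc_iova_mem()/free_iova_mem(), which live outside this file, are already backed by an initialized kmem cache, and the 4 KiB page size and pfn limit below are illustrative values, not taken from the patch.

/*
 * Hedged sketch: carve a naturally aligned 16-page range out of an
 * iova domain that tracks page frames below 4 GiB (4 KiB pages assumed).
 */
#include <linux/iova.h>

static struct iova_domain example_iovad;

static void example_iova_usage(void)
{
	/* pfn of the last 4 KiB page below 4 GiB: 4 GiB / 4 KiB - 1 */
	unsigned long limit_pfn = 0xFFFFFUL;
	struct iova *iova;

	init_iova_domain(&example_iovad, limit_pfn);

	/* Keep pfn 0 out of circulation so it never gets handed out */
	reserve_iova(&example_iovad, 0, 0);

	/* 16 contiguous page frames, naturally aligned, below the limit */
	iova = alloc_iova(&example_iovad, 16, limit_pfn, true);
	if (iova) {
		/* iova->pfn_lo..iova->pfn_hi describe the carved-out range */
		__free_iova(&example_iovad, iova);
	}

	put_iova_domain(&example_iovad);
}

Allocations whose limit_pfn equals the dma_32bit_pfn passed to init_iova_domain() reuse the cached32_node hint maintained by the helpers above, which is how the allocator keeps 32-bit allocations cheap.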
