Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c      |  29
-rw-r--r--  arch/x86/kernel/cpu/bugs.c              |  55
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c          |   6
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c   |  63
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c  |  65
-rw-r--r--  arch/x86/kernel/dumpstack.c             |  23
-rw-r--r--  arch/x86/kernel/head_64.S               |  16
-rw-r--r--  arch/x86/kernel/perf_regs.c             |  15
-rw-r--r--  arch/x86/kernel/process.c               |  12
-rw-r--r--  arch/x86/kernel/sev-es-shared.c         |  26
-rw-r--r--  arch/x86/kernel/sev-es.c                |  20
-rw-r--r--  arch/x86/kernel/sev_verify_cbit.S       |  89
-rw-r--r--  arch/x86/kernel/tboot.c                 |   8
13 files changed, 278 insertions(+), 149 deletions(-)
| diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 714233cee0b5..1b98f8c12b96 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -33,7 +33,7 @@ static union uvh_apicid		uvh_apicid;  static int			uv_node_id;  /* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ -static u8 uv_archtype[UV_AT_SIZE]; +static u8 uv_archtype[UV_AT_SIZE + 1];  static u8 oem_id[ACPI_OEM_ID_SIZE + 1];  static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; @@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from)  {  	/* Relies on 'to' being NULL chars so result will be NULL terminated */  	strncpy(to, from, len-1); + +	/* Trim trailing spaces */ +	(void)strim(to);  }  /* Find UV arch type entry in UVsystab */ @@ -317,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr)  	if (n > 0 && n < sizeof(uv_ate->archtype)) {  		pr_info("UV: UVarchtype received from BIOS\n"); -		uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype); +		uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);  		return 1;  	}  	return 0; @@ -366,7 +369,7 @@ static int __init early_get_arch_type(void)  	return ret;  } -static int __init uv_set_system_type(char *_oem_id) +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)  {  	/* Save OEM_ID passed from ACPI MADT */  	uv_stringify(sizeof(oem_id), oem_id, _oem_id); @@ -375,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id)  	if (!early_get_arch_type())  		/* If not use OEM ID for UVarchtype */ -		uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id); +		uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);  	/* Check if not hubbed */  	if (strncmp(uv_archtype, "SGI", 3) != 0) { @@ -386,13 +389,23 @@ static int __init uv_set_system_type(char *_oem_id)  			/* (Not hubless), not a UV */  			return 0; +		/* Is UV hubless system */ +		uv_hubless_system = 0x01; + +		/* UV5 Hubless */ +		if (strncmp(uv_archtype, "NSGI5", 5) == 0) +			uv_hubless_system |= 0x20; +  		/* UV4 Hubless: CH */ -		if (strncmp(uv_archtype, "NSGI4", 5) == 0) -			uv_hubless_system = 0x11; +		else if (strncmp(uv_archtype, "NSGI4", 5) == 0) +			uv_hubless_system |= 0x10;  		/* UV3 Hubless: UV300/MC990X w/o hub */  		else -			uv_hubless_system = 0x9; +			uv_hubless_system |= 0x8; + +		/* Copy APIC type */ +		uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);  		pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",  			oem_id, oem_table_id, uv_system_type, uv_hubless_system); @@ -456,7 +469,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)  	uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;  	/* If not UV, return. 
*/ -	if (likely(uv_set_system_type(_oem_id) == 0)) +	if (uv_set_system_type(_oem_id, _oem_table_id) == 0)  		return 0;  	/* Save and Decode OEM Table ID */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d3f0db463f96..d41b70fe4918 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -739,11 +739,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)  	if (boot_cpu_has(X86_FEATURE_IBPB)) {  		setup_force_cpu_cap(X86_FEATURE_USE_IBPB); +		spectre_v2_user_ibpb = mode;  		switch (cmd) {  		case SPECTRE_V2_USER_CMD_FORCE:  		case SPECTRE_V2_USER_CMD_PRCTL_IBPB:  		case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:  			static_branch_enable(&switch_mm_always_ibpb); +			spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;  			break;  		case SPECTRE_V2_USER_CMD_PRCTL:  		case SPECTRE_V2_USER_CMD_AUTO: @@ -757,8 +759,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)  		pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",  			static_key_enabled(&switch_mm_always_ibpb) ?  			"always-on" : "conditional"); - -		spectre_v2_user_ibpb = mode;  	}  	/* @@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)  	return 0;  } +static bool is_spec_ib_user_controlled(void) +{ +	return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || +		spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || +		spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || +		spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} +  static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)  {  	switch (ctrl) { @@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)  		if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&  		    spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)  			return 0; +  		/* -		 * Indirect branch speculation is always disabled in strict -		 * mode. It can neither be enabled if it was force-disabled -		 * by a  previous prctl call. +		 * With strict mode for both IBPB and STIBP, the instruction +		 * code paths avoid checking this task flag and instead, +		 * unconditionally run the instruction. However, STIBP and IBPB +		 * are independent and either can be set to conditionally +		 * enabled regardless of the mode of the other. +		 * +		 * If either is set to conditional, allow the task flag to be +		 * updated, unless it was force-disabled by a previous prctl +		 * call. Currently, this is possible on an AMD CPU which has the +		 * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the +		 * kernel is booted with 'spectre_v2_user=seccomp', then +		 * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and +		 * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.  		 
*/ -		if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || -		    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || -		    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || +		if (!is_spec_ib_user_controlled() ||  		    task_spec_ib_force_disable(task))  			return -EPERM; +  		task_clear_spec_ib_disable(task);  		task_update_spec_tif(task);  		break; @@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)  		if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&  		    spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)  			return -EPERM; -		if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || -		    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || -		    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + +		if (!is_spec_ib_user_controlled())  			return 0; +  		task_set_spec_ib_disable(task);  		if (ctrl == PR_SPEC_FORCE_DISABLE)  			task_set_spec_ib_force_disable(task); @@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task)  	if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&  	    spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)  		return PR_SPEC_ENABLE; -	else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || -	    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || -	    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) -		return PR_SPEC_DISABLE; -	else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || -	    spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || -	    spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || -	    spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { +	else if (is_spec_ib_user_controlled()) {  		if (task_spec_ib_force_disable(task))  			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;  		if (task_spec_ib_disable(task))  			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;  		return PR_SPEC_PRCTL | PR_SPEC_ENABLE; -	} else +	} else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || +	    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || +	    spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) +		return PR_SPEC_DISABLE; +	else  		return PR_SPEC_NOT_AFFECTED;  } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4102b866e7c0..32b7099e3511 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1384,8 +1384,10 @@ noinstr void do_machine_check(struct pt_regs *regs)  	 * When there's any problem use only local no_way_out state.  	 */  	if (!lmce) { -		if (mce_end(order) < 0) -			no_way_out = worst >= MCE_PANIC_SEVERITY; +		if (mce_end(order) < 0) { +			if (!no_way_out) +				no_way_out = worst >= MCE_PANIC_SEVERITY; +		}  	} else {  		/*  		 * If there was a fatal machine check we should have diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 6a99535d7f37..7e8e07bddd5f 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -100,53 +100,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev  	return find_matching_signature(mc, csig, cpf);  } -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. 
- * - * %true - if there's a match - * %false - otherwise - */ -static bool microcode_matches(struct microcode_header_intel *mc_header, -			      unsigned long sig) -{ -	unsigned long total_size = get_totalsize(mc_header); -	unsigned long data_size = get_datasize(mc_header); -	struct extended_sigtable *ext_header; -	unsigned int fam_ucode, model_ucode; -	struct extended_signature *ext_sig; -	unsigned int fam, model; -	int ext_sigcount, i; - -	fam   = x86_family(sig); -	model = x86_model(sig); - -	fam_ucode   = x86_family(mc_header->sig); -	model_ucode = x86_model(mc_header->sig); - -	if (fam == fam_ucode && model == model_ucode) -		return true; - -	/* Look for ext. headers: */ -	if (total_size <= data_size + MC_HEADER_SIZE) -		return false; - -	ext_header   = (void *) mc_header + data_size + MC_HEADER_SIZE; -	ext_sig      = (void *)ext_header + EXT_HEADER_SIZE; -	ext_sigcount = ext_header->count; - -	for (i = 0; i < ext_sigcount; i++) { -		fam_ucode   = x86_family(ext_sig->sig); -		model_ucode = x86_model(ext_sig->sig); - -		if (fam == fam_ucode && model == model_ucode) -			return true; - -		ext_sig++; -	} -	return false; -} -  static struct ucode_patch *memdup_patch(void *data, unsigned int size)  {  	struct ucode_patch *p; @@ -164,7 +117,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size)  	return p;  } -static void save_microcode_patch(void *data, unsigned int size) +static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)  {  	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;  	struct ucode_patch *iter, *tmp, *p = NULL; @@ -210,6 +163,9 @@ static void save_microcode_patch(void *data, unsigned int size)  	if (!p)  		return; +	if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) +		return; +  	/*  	 * Save for early loading. On 32-bit, that needs to be a physical  	 * address as the APs are running from physical addresses, before @@ -344,13 +300,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)  		size -= mc_size; -		if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { +		if (!find_matching_signature(data, uci->cpu_sig.sig, +					     uci->cpu_sig.pf)) {  			data += mc_size;  			continue;  		}  		if (save) { -			save_microcode_patch(data, mc_size); +			save_microcode_patch(uci, data, mc_size);  			goto next;  		} @@ -483,14 +440,14 @@ static void show_saved_mc(void)   * Save this microcode patch. It will be loaded early when a CPU is   * hot-added or resumes.   */ -static void save_mc_for_early(u8 *mc, unsigned int size) +static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)  {  	/* Synchronization during CPU hotplug. */  	static DEFINE_MUTEX(x86_cpu_microcode_mutex);  	mutex_lock(&x86_cpu_microcode_mutex); -	save_microcode_patch(mc, size); +	save_microcode_patch(uci, mc, size);  	show_saved_mc();  	mutex_unlock(&x86_cpu_microcode_mutex); @@ -935,7 +892,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)  	 * permanent memory. So it will be loaded early when a CPU is hot added  	 * or resumes.  	 
*/ -	save_mc_for_early(new_mc, new_mc_size); +	save_mc_for_early(uci, new_mc, new_mc_size);  	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",  		 cpu, new_rev, uci->cpu_sig.rev); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index af323e2e3100..6f4ca4bea625 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -507,6 +507,24 @@ unlock:  	return ret ?: nbytes;  } +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ +	kernfs_put(rdtgrp->kn); +	kfree(rdtgrp); +} +  struct task_move_callback {  	struct callback_head	work;  	struct rdtgroup		*rdtgrp; @@ -529,7 +547,7 @@ static void move_myself(struct callback_head *head)  	    (rdtgrp->flags & RDT_DELETED)) {  		current->closid = 0;  		current->rmid = 0; -		kfree(rdtgrp); +		rdtgroup_remove(rdtgrp);  	}  	if (unlikely(current->flags & PF_EXITING)) @@ -1769,7 +1787,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,  	if (IS_ERR(kn_subdir))  		return PTR_ERR(kn_subdir); -	kernfs_get(kn_subdir);  	ret = rdtgroup_kn_set_ugid(kn_subdir);  	if (ret)  		return ret; @@ -1792,7 +1809,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)  	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);  	if (IS_ERR(kn_info))  		return PTR_ERR(kn_info); -	kernfs_get(kn_info);  	ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);  	if (ret) @@ -1813,12 +1829,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)  			goto out_destroy;  	} -	/* -	 * This extra ref will be put in kernfs_remove() and guarantees -	 * that @rdtgrp->kn is always accessible. -	 */ -	kernfs_get(kn_info); -  	ret = rdtgroup_kn_set_ugid(kn_info);  	if (ret)  		goto out_destroy; @@ -1847,12 +1857,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,  	if (dest_kn)  		*dest_kn = kn; -	/* -	 * This extra ref will be put in kernfs_remove() and guarantees -	 * that @rdtgrp->kn is always accessible. 
-	 */ -	kernfs_get(kn); -  	ret = rdtgroup_kn_set_ugid(kn);  	if (ret)  		goto out_destroy; @@ -2079,8 +2083,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)  		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)  			rdtgroup_pseudo_lock_remove(rdtgrp);  		kernfs_unbreak_active_protection(kn); -		kernfs_put(rdtgrp->kn); -		kfree(rdtgrp); +		rdtgroup_remove(rdtgrp);  	} else {  		kernfs_unbreak_active_protection(kn);  	} @@ -2139,13 +2142,11 @@ static int rdt_get_tree(struct fs_context *fc)  					  &kn_mongrp);  		if (ret < 0)  			goto out_info; -		kernfs_get(kn_mongrp);  		ret = mkdir_mondata_all(rdtgroup_default.kn,  					&rdtgroup_default, &kn_mondata);  		if (ret < 0)  			goto out_mongrp; -		kernfs_get(kn_mondata);  		rdtgroup_default.mon.mon_data_kn = kn_mondata;  	} @@ -2357,7 +2358,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)  		if (atomic_read(&sentry->waitcount) != 0)  			sentry->flags = RDT_DELETED;  		else -			kfree(sentry); +			rdtgroup_remove(sentry);  	}  } @@ -2399,7 +2400,7 @@ static void rmdir_all_sub(void)  		if (atomic_read(&rdtgrp->waitcount) != 0)  			rdtgrp->flags = RDT_DELETED;  		else -			kfree(rdtgrp); +			rdtgroup_remove(rdtgrp);  	}  	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */  	update_closid_rmid(cpu_online_mask, &rdtgroup_default); @@ -2499,11 +2500,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,  	if (IS_ERR(kn))  		return PTR_ERR(kn); -	/* -	 * This extra ref will be put in kernfs_remove() and guarantees -	 * that kn is always accessible. -	 */ -	kernfs_get(kn);  	ret = rdtgroup_kn_set_ugid(kn);  	if (ret)  		goto out_destroy; @@ -2838,8 +2834,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,  	/*  	 * kernfs_remove() will drop the reference count on "kn" which  	 * will free it. But we still need it to stick around for the -	 * rdtgroup_kn_unlock(kn} call below. Take one extra reference -	 * here, which will be dropped inside rdtgroup_kn_unlock(). +	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here, +	 * which will be dropped by kernfs_put() in rdtgroup_remove().  	 
*/  	kernfs_get(kn); @@ -2880,6 +2876,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,  out_idfree:  	free_rmid(rdtgrp->mon.rmid);  out_destroy: +	kernfs_put(rdtgrp->kn);  	kernfs_remove(rdtgrp->kn);  out_free_rgrp:  	kfree(rdtgrp); @@ -2892,7 +2889,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)  {  	kernfs_remove(rgrp->kn);  	free_rmid(rgrp->mon.rmid); -	kfree(rgrp); +	rdtgroup_remove(rgrp);  }  /* @@ -3049,11 +3046,6 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,  	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));  	list_del(&rdtgrp->mon.crdtgrp_list); -	/* -	 * one extra hold on this, will drop when we kfree(rdtgrp) -	 * in rdtgroup_kn_unlock() -	 */ -	kernfs_get(kn);  	kernfs_remove(rdtgrp->kn);  	return 0; @@ -3065,11 +3057,6 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn,  	rdtgrp->flags = RDT_DELETED;  	list_del(&rdtgrp->rdtgroup_list); -	/* -	 * one extra hold on this, will drop when we kfree(rdtgrp) -	 * in rdtgroup_kn_unlock() -	 */ -	kernfs_get(kn);  	kernfs_remove(rdtgrp->kn);  	return 0;  } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 25c06b67e7e0..97aa900386cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -78,6 +78,9 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,  	if (!user_mode(regs))  		return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); +	/* The user space code from other tasks cannot be accessed. */ +	if (regs != task_pt_regs(current)) +		return -EPERM;  	/*  	 * Make sure userspace isn't trying to trick us into dumping kernel  	 * memory by pointing the userspace instruction pointer at it. @@ -85,6 +88,12 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,  	if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))  		return -EINVAL; +	/* +	 * Even if named copy_from_user_nmi() this can be invoked from +	 * other contexts and will not try to resolve a pagefault, which is +	 * the correct thing to do here as this code can be called from any +	 * context. +	 */  	return copy_from_user_nmi(buf, (void __user *)src, nbytes);  } @@ -115,13 +124,19 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)  	u8 opcodes[OPCODE_BUFSIZE];  	unsigned long prologue = regs->ip - PROLOGUE_SIZE; -	if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { -		printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", -		       loglvl, prologue); -	} else { +	switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { +	case 0:  		printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"  		       __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,  		       opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); +		break; +	case -EPERM: +		/* No access to the user space stack of other tasks. Ignore. */ +		break; +	default: +		printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", +		       loglvl, prologue); +		break;  	}  } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7eb2a1c87969..3c417734790f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -161,6 +161,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)  	/* Setup early boot stage 4-/5-level pagetables. */  	addq	phys_base(%rip), %rax + +	/* +	 * For SEV guests: Verify that the C-bit is correct. 
A malicious +	 * hypervisor could lie about the C-bit position to perform a ROP +	 * attack on the guest by writing to the unencrypted stack and wait for +	 * the next RET instruction. +	 * %rsi carries pointer to realmode data and is callee-clobbered. Save +	 * and restore it. +	 */ +	pushq	%rsi +	movq	%rax, %rdi +	call	sev_verify_cbit +	popq	%rsi + +	/* Switch to new page-table */  	movq	%rax, %cr3  	/* Ensure I am executing from virtual addresses */ @@ -279,6 +294,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)  SYM_CODE_END(secondary_startup_64)  #include "verify_cpu.S" +#include "sev_verify_cbit.S"  #ifdef CONFIG_HOTPLUG_CPU  /* diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b..f9e5352b3bef 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task)  }  void perf_get_regs_user(struct perf_regs *regs_user, -			struct pt_regs *regs, -			struct pt_regs *regs_user_copy) +			struct pt_regs *regs)  {  	regs_user->regs = task_pt_regs(current);  	regs_user->abi = perf_reg_abi(current); @@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task)  		return PERF_SAMPLE_REGS_ABI_64;  } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); +  void perf_get_regs_user(struct perf_regs *regs_user, -			struct pt_regs *regs, -			struct pt_regs *regs_user_copy) +			struct pt_regs *regs)  { +	struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);  	struct pt_regs *user_regs = task_pt_regs(current); +	if (!in_nmi()) { +		regs_user->regs = user_regs; +		regs_user->abi = perf_reg_abi(current); +		return; +	} +  	/*  	 * If we're in an NMI that interrupted task_pt_regs setup, then  	 * we can't sample user regs at all.  This check isn't really diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba4593a913fa..145a7ac0c19a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -685,7 +685,7 @@ void arch_cpu_idle(void)   */  void __cpuidle default_idle(void)  { -	safe_halt(); +	raw_safe_halt();  }  #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)  EXPORT_SYMBOL(default_idle); @@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy)  /*   * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power   * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing.   */  static void amd_e400_idle(void)  { @@ -757,9 +759,9 @@ static void amd_e400_idle(void)  	 * The switch back from broadcast mode needs to be called with  	 * interrupts disabled.  	 */ -	local_irq_disable(); +	raw_local_irq_disable();  	tick_broadcast_exit(); -	local_irq_enable(); +	raw_local_irq_enable();  }  /* @@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void)  		if (!need_resched())  			__sti_mwait(0, 0);  		else -			local_irq_enable(); +			raw_local_irq_enable();  	} else { -		local_irq_enable(); +		raw_local_irq_enable();  	}  	__current_clr_polling();  } diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 5f83ccaab877..7d04b356d44d 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)  		goto fail;  	regs->dx = val >> 32; +	/* +	 * This is a VC handler and the #VC is only raised when SEV-ES is +	 * active, which means SEV must be active too. 
Do sanity checks on the +	 * CPUID results to make sure the hypervisor does not trick the kernel +	 * into the no-sev path. This could map sensitive data unencrypted and +	 * make it accessible to the hypervisor. +	 * +	 * In particular, check for: +	 *	- Hypervisor CPUID bit +	 *	- Availability of CPUID leaf 0x8000001f +	 *	- SEV CPUID bit. +	 * +	 * The hypervisor might still report the wrong C-bit position, but this +	 * can't be checked here. +	 */ + +	if ((fn == 1 && !(regs->cx & BIT(31)))) +		/* Hypervisor bit */ +		goto fail; +	else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) +		/* SEV leaf check */ +		goto fail; +	else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) +		/* SEV bit */ +		goto fail; +  	/* Skip over the CPUID two-byte opcode */  	regs->ip += 2; diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 4a96726fbaf8..0bd1a0fc587e 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -374,8 +374,8 @@ fault:  	return ES_EXCEPTION;  } -static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, -				 unsigned long vaddr, phys_addr_t *paddr) +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, +					   unsigned long vaddr, phys_addr_t *paddr)  {  	unsigned long va = (unsigned long)vaddr;  	unsigned int level; @@ -394,15 +394,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,  		if (user_mode(ctxt->regs))  			ctxt->fi.error_code |= X86_PF_USER; -		return false; +		return ES_EXCEPTION;  	} +	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) +		/* Emulated MMIO to/from encrypted memory not supported */ +		return ES_UNSUPPORTED; +  	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;  	pa |= va & ~page_level_mask(level);  	*paddr = pa; -	return true; +	return ES_OK;  }  /* Include code shared with pre-decompression boot stage */ @@ -731,6 +735,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,  {  	u64 exit_code, exit_info_1, exit_info_2;  	unsigned long ghcb_pa = __pa(ghcb); +	enum es_result res;  	phys_addr_t paddr;  	void __user *ref; @@ -740,11 +745,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,  	exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; -	if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { -		if (!read) +	res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); +	if (res != ES_OK) { +		if (res == ES_EXCEPTION && !read)  			ctxt->fi.error_code |= X86_PF_WRITE; -		return ES_EXCEPTION; +		return res;  	}  	exit_info_1 = paddr; diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000000..ee04941a6546 --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + *	sev_verify_cbit.S - Code for verification of the C-bit position reported + *			    by the Hypervisor when running with SEV enabled. + * + *	Copyright (c) 2020  Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. 
If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. + * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT +	/* First check if a C-bit was detected */ +	movq	sme_me_mask(%rip), %rsi +	testq	%rsi, %rsi +	jz	3f + +	/* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ +	movq	sev_status(%rip), %rsi +	testq	%rsi, %rsi +	jz	3f + +	/* Save CR4 in %rsi */ +	movq	%cr4, %rsi + +	/* Disable Global Pages */ +	movq	%rsi, %rdx +	andq	$(~X86_CR4_PGE), %rdx +	movq	%rdx, %cr4 + +	/* +	 * Verified that running under SEV - now get a random value using +	 * RDRAND. This instruction is mandatory when running as an SEV guest. +	 * +	 * Don't bail out of the loop if RDRAND returns errors. It is better to +	 * prevent forward progress than to work with a non-random value here. +	 */ +1:	rdrand	%rdx +	jnc	1b + +	/* Store value to memory and keep it in %rdx */ +	movq	%rdx, sev_check_data(%rip) + +	/* Backup current %cr3 value to restore it later */ +	movq	%cr3, %rcx + +	/* Switch to new %cr3 - This might unmap the stack */ +	movq	%rdi, %cr3 + +	/* +	 * Compare value in %rdx with memory location. If C-bit is incorrect +	 * this would read the encrypted data and make the check fail. +	 */ +	cmpq	%rdx, sev_check_data(%rip) + +	/* Restore old %cr3 */ +	movq	%rcx, %cr3 + +	/* Restore previous CR4 */ +	movq	%rsi, %cr4 + +	/* Check CMPQ result */ +	je	3f + +	/* +	 * The check failed, prevent any forward progress to prevent ROP +	 * attacks, invalidate the stack and go into a hlt loop. +	 */ +	xorq	%rsp, %rsp +	subq	$0x1000, %rsp +2:	hlt +	jmp 2b +3: +#endif +	/* Return page-table pointer */ +	movq	%rdi, %rax +	ret +SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 992fb1415c0f..ae64f98ec2ab 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,16 +514,10 @@ int tboot_force_iommu(void)  	if (!tboot_enabled())  		return 0; -	if (intel_iommu_tboot_noforce) -		return 1; - -	if (no_iommu || swiotlb || dmar_disabled) +	if (no_iommu || dmar_disabled)  		pr_warn("Forcing Intel-IOMMU to enabled\n");  	dmar_disabled = 0; -#ifdef CONFIG_SWIOTLB -	swiotlb = 0; -#endif  	no_iommu = 0;  	return 1; | 
