Diffstat (limited to 'net/smc')
-rw-r--r--	net/smc/Kconfig		|   4
-rw-r--r--	net/smc/af_smc.c	| 118
-rw-r--r--	net/smc/smc.h		|   3
-rw-r--r--	net/smc/smc_cdc.c	|   8
-rw-r--r--	net/smc/smc_cdc.h	|   4
-rw-r--r--	net/smc/smc_clc.c	|  23
-rw-r--r--	net/smc/smc_clc.h	|   4
-rw-r--r--	net/smc/smc_close.c	|  31
-rw-r--r--	net/smc/smc_close.h	|   1
-rw-r--r--	net/smc/smc_core.c	| 417
-rw-r--r--	net/smc/smc_core.h	|  32
-rw-r--r--	net/smc/smc_ib.c	| 154
-rw-r--r--	net/smc/smc_ib.h	|  20
-rw-r--r--	net/smc/smc_llc.c	|   1
-rw-r--r--	net/smc/smc_llc.h	|   1
-rw-r--r--	net/smc/smc_pnet.c	|   5
-rw-r--r--	net/smc/smc_pnet.h	|   1
-rw-r--r--	net/smc/smc_rx.c	|   6
-rw-r--r--	net/smc/smc_rx.h	|   1
-rw-r--r--	net/smc/smc_tx.c	|  28
-rw-r--r--	net/smc/smc_tx.h	|   1
-rw-r--r--	net/smc/smc_wr.c	|  66
-rw-r--r--	net/smc/smc_wr.h	|   2
23 files changed, 611 insertions(+), 320 deletions(-)
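The bulk of this series (smc_core.c, smc_ib.c, smc_wr.c) replaces the PD-wide IB_PD_UNSAFE_GLOBAL_RKEY — the reason for the Kconfig warning that is removed below — with one memory region per RMB, allocated with ib_alloc_mr(), bound to the buffer's DMA-mapped SG list with ib_map_mr_sg(), and made effective by posting an IB_WR_REG_MR work request whose completion is awaited. As a reading aid, here is a condensed sketch of that verbs pattern; the helper name demo_reg_rmb and its parameters are illustrative, not part of the patch, and error handling is trimmed:

#include <rdma/ib_verbs.h>
#include <linux/scatterlist.h>

/* sketch only: register one buffer under its own MR/rkey */
static int demo_reg_rmb(struct ib_pd *pd, struct ib_qp *qp,
			struct scatterlist *sgl, int nents, u32 order)
{
	struct ib_reg_wr reg_wr = {};
	struct ib_send_wr *bad_wr;
	unsigned int offset = 0;
	struct ib_mr *mr;

	/* one MR per buffer instead of one unsafe global rkey per PD */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << order);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* bind the DMA-mapped SG list to the MR */
	if (ib_map_mr_sg(mr, sgl, nents, &offset, PAGE_SIZE) != nents) {
		ib_dereg_mr(mr);
		return -EINVAL;
	}

	/* post a REG_MR work request; its completion makes the rkey valid */
	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.wr.send_flags = IB_SEND_SIGNALED;
	reg_wr.mr = mr;
	reg_wr.key = mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
	return ib_post_send(qp, &reg_wr.wr, &bad_wr);
}

In the patch itself, the MR allocation and SG mapping live in smc_ib_get_memory_region(), the REG_MR work request is prepared in smc_wr_init_sge(), and smc_wr_reg_send() posts it and sleeps on wr_reg_wait until the send CQ reports the IB_WC_REG_MR completion.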
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index 33954852f3f8..c717ef0896aa 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -8,10 +8,6 @@ config SMC
 	  The Linux implementation of the SMC-R solution is designed as
 	  a separate socket family SMC.
 
-	  Warning: SMC will expose all memory for remote reads and writes
-	  once a connection is established.  Don't enable this option except
-	  for tightly controlled lab environment.
-
 	  Select this option if you want to run SMC socket applications
 
 config SMC_DIAG
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6793d7348cc8..6451c5013e06 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock,
 			 __be32 *subnet, u8 *prefix_len)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	struct in_device *in_dev;
 	struct sockaddr_in addr;
 	int rc = -ENOENT;
 	int len;
@@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock,
 	/* get address to which the internal TCP socket is bound */
 	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
 	/* analyze IPv4 specific data of net_device belonging to TCP socket */
-	for_ifa(dst->dev->ip_ptr) {
-		if (ifa->ifa_address != addr.sin_addr.s_addr)
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dst->dev);
+	for_ifa(in_dev) {
+		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
 			continue;
 		*prefix_len = inet_mask_len(ifa->ifa_mask);
 		*subnet = ifa->ifa_address & ifa->ifa_mask;
 		rc = 0;
 		break;
-	} endfor_ifa(dst->dev->ip_ptr);
+	} endfor_ifa(in_dev);
+	rcu_read_unlock();
 
 out_rel:
 	dst_release(dst);
@@ -338,6 +342,12 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
 		return SMC_CLC_DECL_INTERR;
 
 	smc_wr_remember_qp_attr(link);
+
+	rc = smc_wr_reg_send(link,
+			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+	if (rc)
+		return SMC_CLC_DECL_INTERR;
+
 	/* send CONFIRM LINK response over RoCE fabric */
 	rc = smc_llc_send_confirm_link(link,
 				       link->smcibdev->mac[link->ibport - 1],
@@ -380,6 +390,12 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	int rc = 0;
 	u8 ibport;
 
+	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
+		/* peer has not signalled SMC-capability */
+		smc->use_fallback = true;
+		goto out_connected;
+	}
+
 	/* IPSec connections opt out of SMC-R optimizations */
 	if (using_ipsec(smc)) {
 		reason_code = SMC_CLC_DECL_IPSEC;
@@ -430,12 +446,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	smc_conn_save_peer_info(smc, &aclc);
 
-	rc = smc_sndbuf_create(smc);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_MEM;
-		goto decline_rdma_unlock;
-	}
-	rc = smc_rmb_create(smc);
+	/* create send buffer and rmb */
+	rc = smc_buf_create(smc);
 	if (rc) {
 		reason_code = SMC_CLC_DECL_MEM;
 		goto decline_rdma_unlock;
@@ -459,7 +471,20 @@ static int smc_connect_rdma(struct smc_sock *smc)
 			reason_code = SMC_CLC_DECL_INTERR;
 			goto decline_rdma_unlock;
 		}
+	} else {
+		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
+
+		if (!buf_desc->reused) {
+			/* register memory region for new rmb */
+			rc = smc_wr_reg_send(link,
+					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
+			if (rc) {
+				reason_code = SMC_CLC_DECL_INTERR;
+				goto decline_rdma_unlock;
+			}
+		}
 	}
+	smc_rmb_sync_sg_for_device(&smc->conn);
 
 	rc = smc_clc_send_confirm(smc);
 	if (rc)
@@ -494,7 +519,7 @@ decline_rdma:
 	/* RDMA setup failed, switch back to TCP */
 	smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		rc = smc_clc_send_decline(smc, reason_code, 0);
+		rc = smc_clc_send_decline(smc, reason_code);
 		if (rc < sizeof(struct smc_clc_msg_decline))
 			goto out_err;
 	}
@@ -536,6 +561,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	}
 
 	smc_copy_sock_settings_to_clc(smc);
+	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 	if (rc)
 		goto out;
@@ -692,6 +718,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 	int rc;
 
 	link = &lgr->lnk[SMC_SINGLE_LINK];
+
+	rc = smc_wr_reg_send(link,
+			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+	if (rc)
+		return SMC_CLC_DECL_INTERR;
+
 	/* send CONFIRM LINK request to client over the RoCE fabric */
 	rc = smc_llc_send_confirm_link(link,
 				       link->smcibdev->mac[link->ibport - 1],
@@ -734,6 +766,12 @@ static void smc_listen_work(struct work_struct *work)
 	u8 prefix_len;
 	u8 ibport;
 
+	/* check if peer is smc capable */
+	if (!tcp_sk(newclcsock->sk)->syn_smc) {
+		new_smc->use_fallback = true;
+		goto out_connected;
+	}
+
 	/* do inband token exchange -
 	 *wait for and receive SMC Proposal CLC message
 	 */
@@ -779,46 +817,50 @@ static void smc_listen_work(struct work_struct *work)
 	mutex_lock(&smc_create_lgr_pending);
 	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
 					smcibdev, ibport, &pclc.lcl, 0);
-	if (local_contact == SMC_REUSE_CONTACT)
-		/* lock no longer needed, free it due to following
-		 * smc_clc_wait_msg() call
-		 */
-		mutex_unlock(&smc_create_lgr_pending);
 	if (local_contact < 0) {
 		rc = local_contact;
 		if (rc == -ENOMEM)
 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
-		else if (rc == -ENOLINK)
-			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
-		goto decline_rdma;
+		goto decline_rdma_unlock;
 	}
 	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
-	rc = smc_sndbuf_create(new_smc);
+	/* create send buffer and rmb */
+	rc = smc_buf_create(new_smc);
 	if (rc) {
 		reason_code = SMC_CLC_DECL_MEM;
-		goto decline_rdma;
-	}
-	rc = smc_rmb_create(new_smc);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_MEM;
-		goto decline_rdma;
+		goto decline_rdma_unlock;
 	}
 
 	smc_close_init(new_smc);
 	smc_rx_init(new_smc);
 
+	if (local_contact != SMC_FIRST_CONTACT) {
+		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
+
+		if (!buf_desc->reused) {
+			/* register memory region for new rmb */
+			rc = smc_wr_reg_send(link,
+					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
+			if (rc) {
+				reason_code = SMC_CLC_DECL_INTERR;
+				goto decline_rdma_unlock;
+			}
+		}
+	}
+	smc_rmb_sync_sg_for_device(&new_smc->conn);
+
 	rc = smc_clc_send_accept(new_smc, local_contact);
 	if (rc)
-		goto out_err;
+		goto out_err_unlock;
 
 	/* receive SMC Confirm CLC message */
 	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
 				       SMC_CLC_CONFIRM);
 	if (reason_code < 0)
-		goto out_err;
+		goto out_err_unlock;
 	if (reason_code > 0)
-		goto decline_rdma;
+		goto decline_rdma_unlock;
 	smc_conn_save_peer_info(new_smc, &cclc);
 	if (local_contact == SMC_FIRST_CONTACT)
 		smc_link_save_peer_info(link, &cclc);
@@ -826,35 +868,34 @@ static void smc_listen_work(struct work_struct *work)
 	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
 	if (rc) {
 		reason_code = SMC_CLC_DECL_INTERR;
-		goto decline_rdma;
+		goto decline_rdma_unlock;
 	}
 
 	if (local_contact == SMC_FIRST_CONTACT) {
 		rc = smc_ib_ready_link(link);
 		if (rc) {
 			reason_code = SMC_CLC_DECL_INTERR;
-			goto decline_rdma;
+			goto decline_rdma_unlock;
 		}
 		/* QP confirmation over RoCE fabric */
 		reason_code = smc_serv_conf_first_link(new_smc);
 		if (reason_code < 0) {
 			/* peer is not aware of a problem */
 			rc = reason_code;
-			goto out_err;
+			goto out_err_unlock;
 		}
 		if (reason_code > 0)
-			goto decline_rdma;
+			goto decline_rdma_unlock;
 	}
 
 	smc_tx_init(new_smc);
+	mutex_unlock(&smc_create_lgr_pending);
 
 out_connected:
 	sk_refcnt_debug_inc(newsmcsk);
 	if (newsmcsk->sk_state == SMC_INIT)
 		newsmcsk->sk_state = SMC_ACTIVE;
 enqueue:
-	if (local_contact == SMC_FIRST_CONTACT)
-		mutex_unlock(&smc_create_lgr_pending);
 	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 	if (lsmc->sk.sk_state == SMC_LISTEN) {
 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -868,17 +909,21 @@ enqueue:
 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
 	return;
 
+decline_rdma_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
 decline_rdma:
 	/* RDMA setup failed, switch back to TCP */
 	smc_conn_free(&new_smc->conn);
 	new_smc->use_fallback = true;
 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		rc = smc_clc_send_decline(new_smc, reason_code, 0);
+		rc = smc_clc_send_decline(new_smc, reason_code);
 		if (rc < sizeof(struct smc_clc_msg_decline))
 			goto out_err;
 	}
 	goto out_connected;
 
+out_err_unlock:
+	mutex_unlock(&smc_create_lgr_pending);
 out_err:
 	newsmcsk->sk_state = SMC_CLOSED;
 	smc_conn_free(&new_smc->conn);
@@ -935,6 +980,7 @@ static int smc_listen(struct socket *sock, int backlog)
 	 * them to the clc socket -- copy smc socket options to clc socket
 	 */
 	smc_copy_sock_settings_to_clc(smc);
+	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc)
@@ -1377,6 +1423,7 @@ static int __init smc_init(void)
 		goto out_sock;
 	}
 
+	static_branch_enable(&tcp_have_smc);
 	return 0;
 
 out_sock:
@@ -1401,6 +1448,7 @@ static void __exit smc_exit(void)
 		list_del_init(&lgr->list);
 		smc_lgr_free(lgr); /* free link group */
 	}
+	static_branch_disable(&tcp_have_smc);
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
 	proto_unregister(&smc_proto);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 6e44313e4467..0bee9d16cf29 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -149,7 +150,7 @@ struct smc_connection {
 	atomic_t		sndbuf_space;	/* remaining space in sndbuf */
 	u16			tx_cdc_seq;	/* sequence # for CDC send */
 	spinlock_t		send_lock;	/* protect wr_sends */
-	struct work_struct	tx_work;	/* retry of smc_cdc_msg_send */
+	struct delayed_work	tx_work;	/* retry of smc_cdc_msg_send */
 	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
 						 * .prod cf. TCP rcv_nxt
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index a7294edbc221..87f7bede6eab 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -62,10 +63,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 	bh_unlock_sock(&smc->sk);
 }
 
-int smc_cdc_get_free_slot(struct smc_link *link,
+int smc_cdc_get_free_slot(struct smc_connection *conn,
 			  struct smc_wr_buf **wr_buf,
 			  struct smc_cdc_tx_pend **pend)
 {
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
 	return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
 				       (struct smc_wr_tx_pend_priv **)pend);
 }
@@ -118,8 +121,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
-				   &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 	if (rc)
 		return rc;
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 8e1d76f26007..149ceda1b088 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -206,7 +207,8 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
 
 struct smc_cdc_tx_pend;
 
-int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+			  struct smc_wr_buf **wr_buf,
 			  struct smc_cdc_tx_pend **pend);
 void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 03ec058d18df..1800e16b2a02 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -95,9 +96,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 	if (clcm->type == SMC_CLC_DECLINE) {
 		reason_code = SMC_CLC_DECL_REPLY;
-		if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
-			== SMC_CLC_DECL_SYNCERR)
+		if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
 			smc->conn.lgr->sync_err = true;
+			smc_lgr_terminate(smc->conn.lgr);
+		}
 	}
 
 out:
@@ -105,8 +107,7 @@ out:
 }
 
 /* send CLC DECLINE message across internal TCP socket */
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
-			 u8 out_of_sync)
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
 {
 	struct smc_clc_msg_decline dclc;
 	struct msghdr msg;
@@ -118,7 +119,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
 	dclc.hdr.type = SMC_CLC_DECLINE;
 	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
 	dclc.hdr.version = SMC_CLC_V1;
-	dclc.hdr.flag = out_of_sync ? 1 : 0;
+	dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
 	memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
 	dclc.peer_diagnosis = htonl(peer_diag_info);
 	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
@@ -204,13 +205,13 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
 	hton24(cclc.qpn, link->roce_qp->qp_num);
 	cclc.rmb_rkey =
-		htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
 	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
 	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
 	cclc.rmbe_size = conn->rmbe_size_short;
-	cclc.rmb_dma_addr =
-		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+	cclc.rmb_dma_addr = cpu_to_be64(
+		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
 	hton24(cclc.psn, link->psn_initial);
 	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
@@ -256,13 +257,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
 	hton24(aclc.qpn, link->roce_qp->qp_num);
 	aclc.rmb_rkey =
-		htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
+		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
 	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
 	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	aclc.qp_mtu = link->path_mtu;
 	aclc.rmbe_size = conn->rmbe_size_short,
-	aclc.rmb_dma_addr =
-		cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+	aclc.rmb_dma_addr = cpu_to_be64(
+		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
 	hton24(aclc.psn, link->psn_initial);
 	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 13db8ce177c9..12a9af1539a2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -106,8 +107,7 @@ struct smc_ib_device;
 
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type);
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
-			 u8 out_of_sync);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
 int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
 			  u8 ibport);
 int smc_clc_send_confirm(struct smc_sock *smc);
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 3c2e166b5d22..48615d2ac4aa 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -174,15 +175,15 @@ int smc_close_active(struct smc_sock *smc)
 {
 	struct smc_cdc_conn_state_flags *txflags =
 		&smc->conn.local_tx_ctrl.conn_state_flags;
-	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
 	struct smc_connection *conn = &smc->conn;
 	struct sock *sk = &smc->sk;
 	int old_state;
+	long timeout;
 	int rc = 0;
 
-	if (sock_flag(sk, SOCK_LINGER) &&
-	    !(current->flags & PF_EXITING))
-		timeout = sk->sk_lingertime;
+	timeout = current->flags & PF_EXITING ?
+		  0 : sock_flag(sk, SOCK_LINGER) ?
+		      sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
 
 again:
 	old_state = sk->sk_state;
@@ -208,7 +209,7 @@ again:
 	case SMC_ACTIVE:
 		smc_close_stream_wait(smc, timeout);
 		release_sock(sk);
-		cancel_work_sync(&conn->tx_work);
+		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
 		if (sk->sk_state == SMC_ACTIVE) {
 			/* send close request */
@@ -234,7 +235,7 @@ again:
 		if (!smc_cdc_rxed_any_close(conn))
 			smc_close_stream_wait(smc, timeout);
 		release_sock(sk);
-		cancel_work_sync(&conn->tx_work);
+		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
 		if (sk->sk_err != ECONNABORTED) {
 			/* confirm close from peer */
@@ -263,7 +264,9 @@ again:
 		/* peer sending PeerConnectionClosed will cause transition */
 		break;
 	case SMC_PROCESSABORT:
-		cancel_work_sync(&conn->tx_work);
+		release_sock(sk);
+		cancel_delayed_work_sync(&conn->tx_work);
+		lock_sock(sk);
 		smc_close_abort(conn);
 		sk->sk_state = SMC_CLOSED;
 		smc_close_wait_tx_pends(smc);
@@ -358,7 +361,8 @@ static void smc_close_passive_work(struct work_struct *work)
 	case SMC_PEERCLOSEWAIT1:
 		if (rxflags->peer_done_writing)
 			sk->sk_state = SMC_PEERCLOSEWAIT2;
-		/* fall through to check for closing */
+		/* fall through */
+		/* to check for closing */
 	case SMC_PEERCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
 		if (!smc_cdc_rxed_any_close(&smc->conn))
@@ -411,13 +415,14 @@ void smc_close_sock_put_work(struct work_struct *work)
 int smc_close_shutdown_write(struct smc_sock *smc)
 {
 	struct smc_connection *conn = &smc->conn;
-	long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
 	struct sock *sk = &smc->sk;
 	int old_state;
+	long timeout;
 	int rc = 0;
 
-	if (sock_flag(sk, SOCK_LINGER))
-		timeout = sk->sk_lingertime;
+	timeout = current->flags & PF_EXITING ?
+		  0 : sock_flag(sk, SOCK_LINGER) ?
+		      sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
 
 again:
 	old_state = sk->sk_state;
@@ -425,7 +430,7 @@ again:
 	case SMC_ACTIVE:
 		smc_close_stream_wait(smc, timeout);
 		release_sock(sk);
-		cancel_work_sync(&conn->tx_work);
+		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
 		/* send close wr request */
 		rc = smc_close_wr(conn);
@@ -439,7 +444,7 @@ again:
 		if (!smc_cdc_rxed_any_close(conn))
 			smc_close_stream_wait(smc, timeout);
 		release_sock(sk);
-		cancel_work_sync(&conn->tx_work);
+		cancel_delayed_work_sync(&conn->tx_work);
 		lock_sock(sk);
 		/* confirm close from peer */
 		rc = smc_close_wr(conn);
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index 4a3d99a8d7cb..ed82506b1b0a 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 3ac09a629ea1..94f21116dac5 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -25,8 +26,9 @@
 #include "smc_cdc.h"
 #include "smc_close.h"
 
-#define SMC_LGR_NUM_INCR	256
-#define SMC_LGR_FREE_DELAY	(600 * HZ)
+#define SMC_LGR_NUM_INCR		256
+#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
+#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
 
 static u32 smc_lgr_num;			/* unique link group number */
@@ -107,8 +109,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
 		__smc_lgr_unregister_conn(conn);
 	}
 	write_unlock_bh(&lgr->conns_lock);
-	if (reduced && !lgr->conns_num)
-		schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+	if (!reduced || lgr->conns_num)
+		return;
+	/* client link group creation always follows the server link group
+	 * creation. For client use a somewhat higher removal delay time,
+	 * otherwise there is a risk of out-of-sync link groups.
+	 */
+	mod_delayed_work(system_wq, &lgr->free_work,
+			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
						 SMC_LGR_FREE_DELAY_SERV);
 }
 
 static void smc_lgr_free_work(struct work_struct *work)
@@ -175,7 +184,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	rc = smc_wr_alloc_link_mem(lnk);
 	if (rc)
 		goto free_lgr;
-	init_waitqueue_head(&lnk->wr_tx_wait);
 	rc = smc_ib_create_protection_domain(lnk);
 	if (rc)
 		goto free_link_mem;
@@ -207,17 +215,14 @@ out:
 	return rc;
 }
 
-static void smc_sndbuf_unuse(struct smc_connection *conn)
+static void smc_buf_unuse(struct smc_connection *conn)
 {
 	if (conn->sndbuf_desc) {
 		conn->sndbuf_desc->used = 0;
 		conn->sndbuf_size = 0;
 	}
-}
-
-static void smc_rmb_unuse(struct smc_connection *conn)
-{
 	if (conn->rmb_desc) {
+		conn->rmb_desc->reused = true;
 		conn->rmb_desc->used = 0;
 		conn->rmbe_size = 0;
 	}
@@ -232,8 +237,7 @@ void smc_conn_free(struct smc_connection *conn)
 		return;
 	smc_cdc_tx_dismiss_slots(conn);
 	smc_lgr_unregister_conn(conn);
-	smc_rmb_unuse(conn);
-	smc_sndbuf_unuse(conn);
+	smc_buf_unuse(conn);
 }
 
 static void smc_link_clear(struct smc_link *lnk)
@@ -246,48 +250,57 @@ static void smc_link_clear(struct smc_link *lnk)
 	smc_wr_free_link_mem(lnk);
 }
 
-static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
+			 bool is_rmb)
 {
-	struct smc_buf_desc *sndbuf_desc, *bf_desc;
-	int i;
-
-	for (i = 0; i < SMC_RMBE_SIZES; i++) {
-		list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
-					 list) {
-			list_del(&sndbuf_desc->list);
-			smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
-					 smc_uncompress_bufsize(i),
-					 sndbuf_desc, DMA_TO_DEVICE);
-			kfree(sndbuf_desc->cpu_addr);
-			kfree(sndbuf_desc);
-		}
+	if (is_rmb) {
+		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
+			smc_ib_put_memory_region(
+					buf_desc->mr_rx[SMC_SINGLE_LINK]);
+		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
+				    DMA_FROM_DEVICE);
+	} else {
+		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
+				    DMA_TO_DEVICE);
 	}
+	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
+	if (buf_desc->cpu_addr)
+		free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order);
+	kfree(buf_desc);
 }
 
-static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
 {
-	struct smc_buf_desc *rmb_desc, *bf_desc;
 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	struct smc_buf_desc *buf_desc, *bf_desc;
+	struct list_head *buf_list;
 	int i;
 
 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
-		list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
+		if (is_rmb)
+			buf_list = &lgr->rmbs[i];
+		else
+			buf_list = &lgr->sndbufs[i];
+		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
 					 list) {
-			list_del(&rmb_desc->list);
-			smc_ib_buf_unmap(lnk->smcibdev,
-					 smc_uncompress_bufsize(i),
-					 rmb_desc, DMA_FROM_DEVICE);
-			kfree(rmb_desc->cpu_addr);
-			kfree(rmb_desc);
+			list_del(&buf_desc->list);
+			smc_buf_free(buf_desc, lnk, is_rmb);
 		}
 	}
 }
 
+static void smc_lgr_free_bufs(struct smc_link_group *lgr)
+{
+	/* free send buffers */
+	__smc_lgr_free_bufs(lgr, false);
+	/* free rmbs */
+	__smc_lgr_free_bufs(lgr, true);
+}
+
 /* remove a link group */
 void smc_lgr_free(struct smc_link_group *lgr)
 {
-	smc_lgr_free_rmbs(lgr);
-	smc_lgr_free_sndbufs(lgr);
+	smc_lgr_free_bufs(lgr);
 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 	kfree(lgr);
 }
@@ -368,10 +381,14 @@ static int smc_link_determine_gid(struct smc_link_group *lgr)
 		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
 				 &gattr))
 			continue;
-		if (gattr.ndev &&
-		    (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
-			lnk->gid = gid;
-			return 0;
+		if (gattr.ndev) {
+			if (is_vlan_dev(gattr.ndev) &&
+			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
+				lnk->gid = gid;
+				dev_put(gattr.ndev);
+				return 0;
+			}
+			dev_put(gattr.ndev);
 		}
 	}
 	return -ENODEV;
@@ -452,45 +469,25 @@ out:
 	return rc ? rc : local_contact;
 }
 
-/* try to reuse a sndbuf description slot of the sndbufs list for a certain
- * buf_size; if not available, return NULL
+/* try to reuse a sndbuf or rmb description slot for a certain
+ * buffer size; if not available, return NULL
  */
 static inline
-struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
-					 int compressed_bufsize)
+struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr,
+				      int compressed_bufsize,
+				      rwlock_t *lock,
+				      struct list_head *buf_list)
 {
-	struct smc_buf_desc *sndbuf_slot;
-
-	read_lock_bh(&lgr->sndbufs_lock);
-	list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
-			    list) {
-		if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
-			read_unlock_bh(&lgr->sndbufs_lock);
-			return sndbuf_slot;
-		}
-	}
-	read_unlock_bh(&lgr->sndbufs_lock);
-	return NULL;
-}
+	struct smc_buf_desc *buf_slot;
 
-/* try to reuse an rmb description slot of the rmbs list for a certain
- * rmbe_size; if not available, return NULL
- */
-static inline
-struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
-				      int compressed_bufsize)
-{
-	struct smc_buf_desc *rmb_slot;
-
-	read_lock_bh(&lgr->rmbs_lock);
-	list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
-			    list) {
-		if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
-			read_unlock_bh(&lgr->rmbs_lock);
-			return rmb_slot;
+	read_lock_bh(lock);
+	list_for_each_entry(buf_slot, buf_list, list) {
+		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
+			read_unlock_bh(lock);
+			return buf_slot;
 		}
 	}
-	read_unlock_bh(&lgr->rmbs_lock);
+	read_unlock_bh(lock);
 	return NULL;
 }
 
@@ -503,136 +500,186 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size)
 	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
 }
 
-/* create the tx buffer for an SMC socket */
-int smc_sndbuf_create(struct smc_sock *smc)
+static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
+					       bool is_rmb, int bufsize)
 {
-	struct smc_connection *conn = &smc->conn;
-	struct smc_link_group *lgr = conn->lgr;
-	int tmp_bufsize, tmp_bufsize_short;
-	struct smc_buf_desc *sndbuf_desc;
+	struct smc_buf_desc *buf_desc;
+	struct smc_link *lnk;
 	int rc;
 
-	/* use socket send buffer size (w/o overhead) as start value */
-	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
-	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
-		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
-		/* check for reusable sndbuf_slot in the link group */
-		sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
-		if (sndbuf_desc) {
-			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
-			break; /* found reusable slot */
-		}
-		/* try to alloc a new send buffer */
-		sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
-		if (!sndbuf_desc)
-			break; /* give up with -ENOMEM */
-		sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
-						GFP_KERNEL | __GFP_NOWARN |
-						__GFP_NOMEMALLOC |
-						__GFP_NORETRY);
-		if (!sndbuf_desc->cpu_addr) {
-			kfree(sndbuf_desc);
-			sndbuf_desc = NULL;
-			/* if send buffer allocation has failed,
-			 * try a smaller one
-			 */
-			continue;
-		}
-		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
-				    tmp_bufsize, sndbuf_desc,
-				    DMA_TO_DEVICE);
+	/* try to alloc a new buffer */
+	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+	if (!buf_desc)
+		return ERR_PTR(-ENOMEM);
+
+	buf_desc->cpu_addr =
+		(void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
+					 __GFP_NOMEMALLOC |
+					 __GFP_NORETRY | __GFP_ZERO,
+					 get_order(bufsize));
+	if (!buf_desc->cpu_addr) {
+		kfree(buf_desc);
+		return ERR_PTR(-EAGAIN);
+	}
+	buf_desc->order = get_order(bufsize);
+
+	/* build the sg table from the pages */
+	lnk = &lgr->lnk[SMC_SINGLE_LINK];
+	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
+			    GFP_KERNEL);
+	if (rc) {
+		smc_buf_free(buf_desc, lnk, is_rmb);
+		return ERR_PTR(rc);
+	}
+	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
+		   buf_desc->cpu_addr, bufsize);
+
+	/* map sg table to DMA address */
+	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
+			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	/* SMC protocol depends on mapping to one DMA address only */
+	if (rc != 1)  {
+		smc_buf_free(buf_desc, lnk, is_rmb);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	/* create a new memory region for the RMB */
+	if (is_rmb) {
+		rc = smc_ib_get_memory_region(lnk->roce_pd,
+					      IB_ACCESS_REMOTE_WRITE |
+					      IB_ACCESS_LOCAL_WRITE,
+					      buf_desc);
 		if (rc) {
-			kfree(sndbuf_desc->cpu_addr);
-			kfree(sndbuf_desc);
-			sndbuf_desc = NULL;
-			continue; /* if mapping failed, try smaller one */
+			smc_buf_free(buf_desc, lnk, is_rmb);
+			return ERR_PTR(rc);
 		}
-		sndbuf_desc->used = 1;
-		write_lock_bh(&lgr->sndbufs_lock);
-		list_add(&sndbuf_desc->list,
-			 &lgr->sndbufs[tmp_bufsize_short]);
-		write_unlock_bh(&lgr->sndbufs_lock);
-		break;
-	}
-	if (sndbuf_desc && sndbuf_desc->cpu_addr) {
-		conn->sndbuf_desc = sndbuf_desc;
-		conn->sndbuf_size = tmp_bufsize;
-		smc->sk.sk_sndbuf = tmp_bufsize * 2;
-		atomic_set(&conn->sndbuf_space, tmp_bufsize);
-		return 0;
-	} else {
-		return -ENOMEM;
 	}
+
+	return buf_desc;
 }
 
-/* create the RMB for an SMC socket (even though the SMC protocol
- * allows more than one RMB-element per RMB, the Linux implementation
- * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
- * connection in a link group
- */
-int smc_rmb_create(struct smc_sock *smc)
+static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 {
 	struct smc_connection *conn = &smc->conn;
 	struct smc_link_group *lgr = conn->lgr;
-	int tmp_bufsize, tmp_bufsize_short;
-	struct smc_buf_desc *rmb_desc;
-	int rc;
+	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
+	struct list_head *buf_list;
+	int bufsize, bufsize_short;
+	int sk_buf_size;
+	rwlock_t *lock;
+
+	if (is_rmb)
+		/* use socket recv buffer size (w/o overhead) as start value */
+		sk_buf_size = smc->sk.sk_rcvbuf / 2;
+	else
+		/* use socket send buffer size (w/o overhead) as start value */
+		sk_buf_size = smc->sk.sk_sndbuf / 2;
+
+	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
+	     bufsize_short >= 0; bufsize_short--) {
+
+		if (is_rmb) {
+			lock = &lgr->rmbs_lock;
+			buf_list = &lgr->rmbs[bufsize_short];
+		} else {
+			lock = &lgr->sndbufs_lock;
+			buf_list = &lgr->sndbufs[bufsize_short];
+		}
+		bufsize = smc_uncompress_bufsize(bufsize_short);
+		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
+			continue;
 
-	/* use socket recv buffer size (w/o overhead) as start value */
-	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
-	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
-		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
-		/* check for reusable rmb_slot in the link group */
-		rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
-		if (rmb_desc) {
-			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+		/* check for reusable slot in the link group */
+		buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list);
+		if (buf_desc) {
+			memset(buf_desc->cpu_addr, 0, bufsize);
 			break; /* found reusable slot */
 		}
-		/* try to alloc a new RMB */
-		rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
-		if (!rmb_desc)
-			break; /* give up with -ENOMEM */
-		rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
-					     GFP_KERNEL | __GFP_NOWARN |
-					     __GFP_NOMEMALLOC |
-					     __GFP_NORETRY);
-		if (!rmb_desc->cpu_addr) {
-			kfree(rmb_desc);
-			rmb_desc = NULL;
-			/* if RMB allocation has failed,
-			 * try a smaller one
-			 */
+
+		buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
+		if (PTR_ERR(buf_desc) == -ENOMEM)
+			break;
+		if (IS_ERR(buf_desc))
 			continue;
-		}
-		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
-				    tmp_bufsize, rmb_desc,
-				    DMA_FROM_DEVICE);
-		if (rc) {
-			kfree(rmb_desc->cpu_addr);
-			kfree(rmb_desc);
-			rmb_desc = NULL;
-			continue; /* if mapping failed, try smaller one */
-		}
-		rmb_desc->rkey[SMC_SINGLE_LINK] =
-			lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey;
-		rmb_desc->used = 1;
-		write_lock_bh(&lgr->rmbs_lock);
-		list_add(&rmb_desc->list,
-			 &lgr->rmbs[tmp_bufsize_short]);
-		write_unlock_bh(&lgr->rmbs_lock);
-		break;
+
+		buf_desc->used = 1;
+		write_lock_bh(lock);
+		list_add(&buf_desc->list, buf_list);
+		write_unlock_bh(lock);
+		break; /* found */
 	}
-	if (rmb_desc && rmb_desc->cpu_addr) {
-		conn->rmb_desc = rmb_desc;
-		conn->rmbe_size = tmp_bufsize;
-		conn->rmbe_size_short = tmp_bufsize_short;
-		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+
+	if (IS_ERR(buf_desc))
+		return -ENOMEM;
+
+	if (is_rmb) {
+		conn->rmb_desc = buf_desc;
+		conn->rmbe_size = bufsize;
+		conn->rmbe_size_short = bufsize_short;
+		smc->sk.sk_rcvbuf = bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
-		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
-		return 0;
+		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
 	} else {
-		return -ENOMEM;
+		conn->sndbuf_desc = buf_desc;
+		conn->sndbuf_size = bufsize;
+		smc->sk.sk_sndbuf = bufsize * 2;
+		atomic_set(&conn->sndbuf_space, bufsize);
 	}
+	return 0;
+}
+
+void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+			       conn->sndbuf_desc, DMA_TO_DEVICE);
+}
+
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				  conn->sndbuf_desc, DMA_TO_DEVICE);
+}
+
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+			       conn->rmb_desc, DMA_FROM_DEVICE);
+}
+
+void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+				  conn->rmb_desc, DMA_FROM_DEVICE);
+}
+
+/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group
 */
+int smc_buf_create(struct smc_sock *smc)
+{
+	int rc;
+
+	/* create send buffer */
+	rc = __smc_buf_create(smc, false);
+	if (rc)
+		return rc;
+	/* create rmb */
+	rc = __smc_buf_create(smc, true);
+	if (rc)
+		smc_buf_free(smc->conn.sndbuf_desc,
+			     &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
+	return rc;
 }
 
 static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index b013cb43a327..fe691bf9af91 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -37,6 +38,14 @@ struct smc_wr_buf {
 	u8	raw[SMC_WR_BUF_SIZE];
 };
 
+#define SMC_WR_REG_MR_WAIT_TIME	(5 * HZ)/* wait time for ib_wr_reg_mr result */
+
+enum smc_wr_reg_state {
+	POSTED,		/* ib_wr_reg_mr request posted */
+	CONFIRMED,	/* ib_wr_reg_mr response: successful */
+	FAILED		/* ib_wr_reg_mr response: failure */
+};
+
 struct smc_link {
 	struct smc_ib_device	*smcibdev;	/* ib-device */
 	u8			ibport;		/* port - values 1 | 2 */
@@ -65,6 +74,10 @@ struct smc_link {
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 
+	struct ib_reg_wr	wr_reg;		/* WR register memory region */
+	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
+	enum smc_wr_reg_state	wr_reg_state;	/* state of wr_reg request */
+
 	union ib_gid		gid;		/* gid matching used vlan id */
 	u32			peer_qpn;	/* QP number of peer */
 	enum ib_mtu		path_mtu;	/* used mtu */
@@ -90,14 +103,15 @@ struct smc_link {
 /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
 struct smc_buf_desc {
 	struct list_head	list;
-	u64			dma_addr[SMC_LINKS_PER_LGR_MAX];
-						/* mapped address of buffer */
 	void			*cpu_addr;	/* virtual address of buffer */
-	u32			rkey[SMC_LINKS_PER_LGR_MAX];
-						/* for rmb only:
-						 * rkey provided to peer
+	struct sg_table		sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
+	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
+						/* for rmb only: memory region
+						 * incl. rkey provided to peer
 						 */
+	u32			order;		/* allocation order */
 	u32			used;		/* currently used / unused */
+	bool			reused;		/* new created / reused */
 };
 
 struct smc_rtoken {				/* address/key of remote RMB */
@@ -173,9 +187,11 @@ struct smc_clc_msg_accept_confirm;
 
 void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
-int smc_sndbuf_create(struct smc_sock *smc);
-int smc_rmb_create(struct smc_sock *smc);
+int smc_buf_create(struct smc_sock *smc);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
 			    struct smc_clc_msg_accept_confirm *clc);
-
+void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
+void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
 #endif
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index b31715505a35..90f1a7f9085c 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -13,6 +14,7 @@
 
 #include <linux/random.h>
 #include <linux/workqueue.h>
+#include <linux/scatterlist.h>
 #include <rdma/ib_verbs.h>
 
 #include "smc_pnet.h"
@@ -192,8 +194,7 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
 {
 	int rc;
 
-	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev,
-				   IB_PD_UNSAFE_GLOBAL_RKEY);
+	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
 	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
 	if (IS_ERR(lnk->roce_pd))
 		lnk->roce_pd = NULL;
@@ -232,10 +233,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 		.recv_cq = lnk->smcibdev->roce_cq_recv,
 		.srq = NULL,
 		.cap = {
-			.max_send_wr = SMC_WR_BUF_CNT,
 				/* include unsolicited rdma_writes as well,
 				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
 				 */
+			.max_send_wr = SMC_WR_BUF_CNT * 3,
 			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
 			.max_recv_sge = 1,
@@ -254,56 +255,132 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 	return rc;
 }
 
-/* map a new TX or RX buffer to DMA */
-int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
-		   struct smc_buf_desc *buf_slot,
-		   enum dma_data_direction data_direction)
+void smc_ib_put_memory_region(struct ib_mr *mr)
 {
-	int rc = 0;
+	ib_dereg_mr(mr);
+}
 
-	if (buf_slot->dma_addr[SMC_SINGLE_LINK])
-		return rc; /* already mapped */
-	buf_slot->dma_addr[SMC_SINGLE_LINK] =
-		ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
-				  buf_size, data_direction);
-	if (ib_dma_mapping_error(smcibdev->ibdev,
-				 buf_slot->dma_addr[SMC_SINGLE_LINK]))
-		rc = -EIO;
-	return rc;
+static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot)
+{
+	unsigned int offset = 0;
+	int sg_num;
+
+	/* map the largest prefix of a dma mapped SG list */
+	sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK],
+			      buf_slot->sgt[SMC_SINGLE_LINK].sgl,
+			      buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+			      &offset, PAGE_SIZE);
+
+	return sg_num;
+}
+
+/* Allocate a memory region and map the dma mapped SG list of buf_slot */
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct smc_buf_desc *buf_slot)
+{
+	if (buf_slot->mr_rx[SMC_SINGLE_LINK])
+		return 0; /* already done */
+
+	buf_slot->mr_rx[SMC_SINGLE_LINK] =
+		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
+	if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) {
+		int rc;
+
+		rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]);
+		buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL;
+		return rc;
+	}
+
+	if (smc_ib_map_mr_sg(buf_slot) != 1)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* synchronize buffer usage for cpu access */
+void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
+			    struct smc_buf_desc *buf_slot,
+			    enum dma_data_direction data_direction)
+{
+	struct scatterlist *sg;
+	unsigned int i;
+
+	/* for now there is just one DMA address */
+	for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
+		    buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
+		if (!sg_dma_len(sg))
+			break;
+		ib_dma_sync_single_for_cpu(smcibdev->ibdev,
+					   sg_dma_address(sg),
+					   sg_dma_len(sg),
+					   data_direction);
+	}
+}
+
+/* synchronize buffer usage for device access */
+void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
+			       struct smc_buf_desc *buf_slot,
+			       enum dma_data_direction data_direction)
+{
+	struct scatterlist *sg;
+	unsigned int i;
+
+	/* for now there is just one DMA address */
+	for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
+		    buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
+		if (!sg_dma_len(sg))
+			break;
+		ib_dma_sync_single_for_device(smcibdev->ibdev,
+					      sg_dma_address(sg),
+					      sg_dma_len(sg),
+					      data_direction);
+	}
 }
 
-void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+/* Map a new TX or RX buffer SG-table to DMA */
+int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
 		      struct smc_buf_desc *buf_slot,
 		      enum dma_data_direction data_direction)
 {
-	if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
+	int mapped_nents;
+
+	mapped_nents = ib_dma_map_sg(smcibdev->ibdev,
+				     buf_slot->sgt[SMC_SINGLE_LINK].sgl,
+				     buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+				     data_direction);
+	if (!mapped_nents)
+		return -ENOMEM;
+
+	return mapped_nents;
+}
+
+void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
+			 struct smc_buf_desc *buf_slot,
+			 enum dma_data_direction data_direction)
+{
+	if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address)
 		return; /* already unmapped */
-	ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
-			    data_direction);
-	buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+
+	ib_dma_unmap_sg(smcibdev->ibdev,
+			buf_slot->sgt[SMC_SINGLE_LINK].sgl,
+			buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+			data_direction);
+	buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
 }
 
 static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
 {
-	struct net_device *ndev;
+	struct ib_gid_attr gattr;
 	int rc;
 
 	rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
-			  &smcibdev->gid[ibport - 1], NULL);
-	/* the SMC protocol requires specification of the roce MAC address;
-	 * if net_device cannot be determined, it can be derived from gid 0
-	 */
-	ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
-	if (ndev) {
-		memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
-	} else if (!rc) {
-		memcpy(&smcibdev->mac[ibport - 1][0],
-		       &smcibdev->gid[ibport - 1].raw[8], 3);
-		memcpy(&smcibdev->mac[ibport - 1][3],
-		       &smcibdev->gid[ibport - 1].raw[13], 3);
-		smcibdev->mac[ibport - 1][0] &= ~0x02;
-	}
-	return rc;
+			  &smcibdev->gid[ibport - 1], &gattr);
+	if (rc || !gattr.ndev)
+		return -ENODEV;
+
+	memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
+	dev_put(gattr.ndev);
+	return 0;
 }
 
 /* Create an identifier unique for this instance of SMC-R.
@@ -334,6 +411,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
 			   &smcibdev->pattr[ibport - 1]);
 	if (rc)
 		goto out;
+	/* the SMC protocol requires specification of the RoCE MAC address */
 	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
 	if (rc)
 		goto out;
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index b567152a526d..e90630dadf8e 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -51,12 +52,12 @@ int smc_ib_register_client(void) __init;
 void smc_ib_unregister_client(void);
 bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
-		   struct smc_buf_desc *buf_slot,
-		   enum dma_data_direction data_direction);
-void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
+int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
 		      struct smc_buf_desc *buf_slot,
 		      enum dma_data_direction data_direction);
+void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
+			 struct smc_buf_desc *buf_slot,
+			 enum dma_data_direction data_direction);
 void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
 int smc_ib_create_protection_domain(struct smc_link *lnk);
 void smc_ib_destroy_queue_pair(struct smc_link *lnk);
@@ -65,6 +66,13 @@ int smc_ib_ready_link(struct smc_link *lnk);
 int smc_ib_modify_qp_rts(struct smc_link *lnk);
 int smc_ib_modify_qp_reset(struct smc_link *lnk);
 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
-
-
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct smc_buf_desc *buf_slot);
+void smc_ib_put_memory_region(struct ib_mr *mr);
+void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
+			    struct smc_buf_desc *buf_slot,
+			    enum dma_data_direction data_direction);
+void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
+			       struct smc_buf_desc *buf_slot,
+			       enum dma_data_direction data_direction);
 #endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index c2f9165d13ef..92fe4cc8c82c 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index b472f853953a..51b27ce90dbd 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 78f7af28ae4f..74568cdbca70 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -181,8 +182,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
 			     sizeof(new_pnetelem->ndev->name)) ||
 		    smc_pnet_same_ibname(pnetelem,
 					 new_pnetelem->smcibdev->ibdev->name,
-					 new_pnetelem->ib_port))
+					 new_pnetelem->ib_port)) {
+			dev_put(pnetelem->ndev);
 			goto found;
+		}
 	}
 	list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
 	rc = 0;
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index c4f1bccd4358..5a29519db976 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index f0c8b089f770..cbf58637ee14 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
@@ -148,6 +149,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 				read_done = sock_intr_errno(timeo);
 				break;
 			}
+			if (!timeo)
+				return -EAGAIN;
 		}
 
 		if (!atomic_read(&conn->bytes_to_rcv)) {
@@ -170,6 +173,7 @@ copy:
 				  copylen, conn->rmbe_size - cons.count);
 		chunk_len_sum = chunk_len;
 		chunk_off = cons.count;
+		smc_rmb_sync_sg_for_cpu(conn);
 		for (chunk = 0; chunk < 2; chunk++) {
 			if (!(flags & MSG_TRUNC)) {
 				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
@@ -177,6 +181,7 @@ copy:
 				if (rc) {
 					if (!read_done)
 						read_done = -EFAULT;
+					smc_rmb_sync_sg_for_device(conn);
 					goto out;
 				}
 			}
@@ -190,6 +195,7 @@ copy:
 			chunk_len_sum += chunk_len;
 			chunk_off = 0; /* modulo offset in recv ring buffer */
 		}
+		smc_rmb_sync_sg_for_device(conn);
 
 		/* update cursors */
 		if (!(flags & MSG_PEEK)) {
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index b5b80e1f8b0f..3a32b59bf06c 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 21ec1832ab51..c48dc2d5fd3a 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
@@ -24,6 +25,8 @@
 #include "smc_cdc.h"
 #include "smc_tx.h"
 
+#define SMC_TX_WORK_DELAY	HZ
+
 /***************************** sndbuf producer *******************************/
 
 /* callback implementation for sk.sk_write_space()
@@ -174,10 +177,12 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 				  copylen, conn->sndbuf_size - tx_cnt_prep);
 		chunk_len_sum = chunk_len;
 		chunk_off = tx_cnt_prep;
+		smc_sndbuf_sync_sg_for_cpu(conn);
 		for (chunk = 0; chunk < 2; chunk++) {
 			rc = memcpy_from_msg(sndbuf_base + chunk_off,
 					     msg, chunk_len);
 			if (rc) {
+				smc_sndbuf_sync_sg_for_device(conn);
 				if (send_done)
 					return send_done;
 				goto out_err;
@@ -192,6 +197,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 			chunk_len_sum += chunk_len;
 			chunk_off = 0; /* modulo offset in send ring buffer */
 		}
+		smc_sndbuf_sync_sg_for_device(conn);
 		/* update cursors */
 		smc_curs_add(conn->sndbuf_size, &prep, copylen);
 		smc_curs_write(&conn->tx_curs_prep,
@@ -277,6 +283,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	struct smc_link_group *lgr = conn->lgr;
 	int to_send, rmbespace;
 	struct smc_link *link;
+	dma_addr_t dma_addr;
 	int num_sges;
 	int rc;
 
@@ -334,12 +341,11 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		src_len = conn->sndbuf_size - sent.count;
 	}
 	src_len_sum = src_len;
+	dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
 	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
 		num_sges = 0;
 		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
-			sges[srcchunk].addr =
-				conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
-				src_off;
+			sges[srcchunk].addr = dma_addr + src_off;
 			sges[srcchunk].length = src_len;
 			sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
 			num_sges++;
@@ -391,8 +397,7 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 	int rc;
 
 	spin_lock_bh(&conn->send_lock);
-	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
-				   &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
@@ -403,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 				goto out_unlock;
 			}
 			rc = 0;
-			schedule_work(&conn->tx_work);
+			schedule_delayed_work(&conn->tx_work,
+					      SMC_TX_WORK_DELAY);
 		}
 		goto out_unlock;
 	}
@@ -427,7 +433,7 @@ out_unlock:
 */
 static void smc_tx_work(struct work_struct *work)
 {
-	struct smc_connection *conn = container_of(work,
+	struct smc_connection *conn = container_of(to_delayed_work(work),
						   struct smc_connection,
						   tx_work);
 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
@@ -460,12 +466,12 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmbe_size / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
-		rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
-					   &wr_buf, &pend);
+		rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 		if (!rc)
 			rc = smc_cdc_msg_send(conn, wr_buf, pend);
 		if (rc < 0) {
-			schedule_work(&conn->tx_work);
+			schedule_delayed_work(&conn->tx_work,
+					      SMC_TX_WORK_DELAY);
 			return;
 		}
 		smc_curs_write(&conn->rx_curs_confirmed,
@@ -484,6 +490,6 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 void smc_tx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_write_space = smc_tx_write_space;
-	INIT_WORK(&smc->conn.tx_work, smc_tx_work);
+	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
 	spin_lock_init(&smc->conn.send_lock);
 }
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 1d6a0dcdcfe6..78255964fa4d 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 874ee9f9d796..de4537f66832 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
@@ -68,6 +69,16 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 	int i;
 
 	link = wc->qp->qp_context;
+
+	if (wc->opcode == IB_WC_REG_MR) {
+		if (wc->status)
+			link->wr_reg_state = FAILED;
+		else
+			link->wr_reg_state = CONFIRMED;
+		wake_up(&link->wr_reg_wait);
+		return;
+	}
+
 	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
 	if (pnd_snd_idx == link->wr_tx_cnt)
 		return;
@@ -234,7 +245,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
 	int rc;
 
 	ib_req_notify_cq(link->smcibdev->roce_cq_send,
-			 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
 	pend = container_of(priv, struct smc_wr_tx_pend, priv);
 	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
 			  &failed_wr);
@@ -243,6 +254,52 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
 	return rc;
 }
 
+/* Register a memory region and wait for result. */
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
+{
+	struct ib_send_wr *failed_wr = NULL;
+	int rc;
+
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	link->wr_reg_state = POSTED;
+	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
+	link->wr_reg.mr = mr;
+	link->wr_reg.key = mr->rkey;
+	failed_wr = &link->wr_reg.wr;
+	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, &failed_wr);
+	WARN_ON(failed_wr != &link->wr_reg.wr);
+	if (rc)
+		return rc;
+
+	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
+					      (link->wr_reg_state != POSTED),
+					      SMC_WR_REG_MR_WAIT_TIME);
+	if (!rc) {
+		/* timeout - terminate connections */
+		struct smc_link_group *lgr;
+
+		lgr = container_of(link, struct smc_link_group,
+				   lnk[SMC_SINGLE_LINK]);
+		smc_lgr_terminate(lgr);
+		return -EPIPE;
+	}
+	if (rc == -ERESTARTSYS)
+		return -EINTR;
+	switch (link->wr_reg_state) {
+	case CONFIRMED:
+		rc = 0;
+		break;
+	case FAILED:
+		rc = -EIO;
+		break;
+	case POSTED:
+		rc = -EPIPE;
+		break;
+	}
+	return rc;
+}
+
 void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
 			     smc_wr_tx_filter filter,
 			     smc_wr_tx_dismisser dismisser,
@@ -458,6 +515,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
 		lnk->wr_rx_ibs[i].num_sge = 1;
 	}
+	lnk->wr_reg.wr.next = NULL;
+	lnk->wr_reg.wr.num_sge = 0;
+	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
+	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
+	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
 void smc_wr_free_link(struct smc_link *lnk)
@@ -602,6 +664,8 @@ int smc_wr_create_link(struct smc_link *lnk)
 	smc_wr_init_sge(lnk);
 	memset(lnk->wr_tx_mask, 0,
 	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+	init_waitqueue_head(&lnk->wr_tx_wait);
+	init_waitqueue_head(&lnk->wr_reg_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 0b9beeda6053..2acf12b06063 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
@@ -102,5 +103,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
 int smc_wr_rx_post_init(struct smc_link *link);
 void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
 
 #endif /* SMC_WR_H */
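A second recurring change in this series is the explicit DMA sync bracket around every CPU copy into or out of the now SG-mapped buffers: smc_rmb_sync_sg_for_cpu()/smc_rmb_sync_sg_for_device() around the receive copy in smc_rx.c, and the sndbuf equivalents around memcpy_from_msg() in smc_tx.c. Below is a minimal sketch of the pattern, assuming a single-entry mapping as the patch enforces; the helper name demo_copy_from_rmb and its parameters are illustrative, not part of the patch:

#include <rdma/ib_verbs.h>
#include <linux/string.h>

/* sketch only: bracket a CPU read of an RDMA receive buffer */
static void demo_copy_from_rmb(struct ib_device *ibdev, dma_addr_t dma,
			       size_t len, void *cpu_addr, void *dst)
{
	/* make device (RDMA write) data visible to the CPU before reading */
	ib_dma_sync_single_for_cpu(ibdev, dma, len, DMA_FROM_DEVICE);
	memcpy(dst, cpu_addr, len);
	/* hand ownership back to the device for subsequent RDMA writes */
	ib_dma_sync_single_for_device(ibdev, dma, len, DMA_FROM_DEVICE);
}

On cache-coherent platforms these syncs are cheap or no-ops, but with the buffers now mapped through ib_dma_map_sg() the bracket is what keeps the code correct on non-coherent DMA as well.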
