From bedbdd8bada194a690d2901801bf8451965086b3 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 10 Jun 2008 08:40:35 -0400 Subject: knfsd: Replace lock_kernel with a mutex for nfsd thread startup/shutdown locking. This removes the BKL from the RPC service creation codepath. The BKL really isn't adequate for this job since some of this info needs protection across sleeps. Also, add some comments to try and clarify how the locking should work and to make it clear that the BKL isn't necessary as long as there is adequate locking between tasks when touching the svc_serv fields. Signed-off-by: Neil Brown Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 01c7e311b90..7bffaff2a3a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -461,7 +461,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, EXPORT_SYMBOL(svc_create_pooled); /* - * Destroy an RPC service. Should be called with the BKL held + * Destroy an RPC service. Should be called with appropriate locking to + * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. */ void svc_destroy(struct svc_serv *serv) @@ -578,9 +579,10 @@ out_enomem: EXPORT_SYMBOL(svc_prepare_thread); /* - * Create a thread in the given pool. Caller must hold BKL. - * On a NUMA or SMP machine, with a multi-pool serv, the thread - * will be restricted to run on the cpus belonging to the pool. + * Create a thread in the given pool. Caller must hold BKL or another lock to + * serialize access to the svc_serv struct. On a NUMA or SMP machine, with a + * multi-pool serv, the thread will be restricted to run on the cpus belonging + * to the pool. */ static int __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, @@ -674,7 +676,7 @@ found_pool: * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Must be called with a svc_get() reference and - * the BKL held. + * the BKL or another lock to protect access to svc_serv fields. * * Destroying threads relies on the service threads filling in * rqstp->rq_task, which only the nfs ones do. Assumes the serv @@ -722,7 +724,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold BKL. + * Called from a server thread as it's exiting. Caller must hold the BKL or + * the "service mutex", whichever is appropriate for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) -- cgit v1.2.3 From 9867d76ca16b3f455f9ca83861f4ce5c94a25928 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:38 -0400 Subject: knfsd: convert knfsd to kthread API This patch is rather large, but I couldn't figure out a way to break it up that would remain bisectable. It does several things: - change svc_thread_fn typedef to better match what kthread_create expects - change svc_pool_map_set_cpumask to be more kthread friendly. Make it take a task arg and and get rid of the "oldmask" - have svc_set_num_threads call kthread_create directly - eliminate __svc_create_thread Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 45 ++++++++++++-------- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 100 +++++++++++++++------------------------------ 3 files changed, 64 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6339cb70a08..9e215681371 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,7 @@ #define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) extern struct svc_program nfsd_program; -static void nfsd(struct svc_rqst *rqstp); +static int nfsd(void *vrqstp); struct timeval nfssvc_boot; static atomic_t nfsd_busy; static unsigned long nfsd_last_call; @@ -407,18 +408,19 @@ update_thread_usage(int busy_threads) /* * This is the NFS server kernel thread */ -static void -nfsd(struct svc_rqst *rqstp) +static int +nfsd(void *vrqstp) { + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct fs_struct *fsp; - int err; sigset_t shutdown_mask, allowed_mask; + int err, preverr = 0; + unsigned int signo; /* Lock module and set up kernel thread */ mutex_lock(&nfsd_mutex); - daemonize("nfsd"); - /* After daemonize() this kernel thread shares current->fs + /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ fsp = copy_fs_struct(current->fs); @@ -433,14 +435,18 @@ nfsd(struct svc_rqst *rqstp) siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); siginitsetinv(&allowed_mask, ALLOWED_SIGS); + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that matter + */ + for (signo = 1; signo <= _NSIG; signo++) { + if (!sigismember(&shutdown_mask, signo)) + allow_signal(signo); + } nfsdstats.th_cnt++; - - rqstp->rq_task = current; - mutex_unlock(&nfsd_mutex); - /* * We want less throttling in balance_dirty_pages() so that nfs to * localhost doesn't cause nfsd to lock up due to all the client's @@ -462,15 +468,25 @@ nfsd(struct svc_rqst *rqstp) */ while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; - if (err < 0) + if (err == -EINTR) break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + update_thread_usage(atomic_read(&nfsd_busy)); atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); - /* Process request with signals blocked. */ + /* Process request with signals blocked. 
*/ sigprocmask(SIG_SETMASK, &allowed_mask, NULL); svc_process(rqstp); @@ -481,14 +497,10 @@ nfsd(struct svc_rqst *rqstp) atomic_dec(&nfsd_busy); } - if (err != -EINTR) - printk(KERN_WARNING "nfsd: terminating on error %d\n", -err); - /* Clear signals before calling svc_exit_thread() */ flush_signals(current); mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; out: @@ -498,6 +510,7 @@ out: /* Release module */ mutex_unlock(&nfsd_mutex); module_put_and_exit(0); + return 0; } static __be32 map_new_errors(u32 vers, __be32 nfserr) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4b54c5fdcfd..011d6d8100d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -22,7 +22,7 @@ /* * This is the RPC server thread function prototype */ -typedef void (*svc_thread_fn)(struct svc_rqst *); +typedef int (*svc_thread_fn)(void *); /* * diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7bffaff2a3a..03a9f1a9e75 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -291,15 +292,14 @@ svc_pool_map_put(void) /* - * Set the current thread's cpus_allowed mask so that it + * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. - * - * Returns 1 and fills in oldmask iff a cpumask was applied. */ -static inline int -svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +static inline void +svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; + unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which @@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) */ BUG_ON(m->count == 0); - switch (m->mode) - { - default: - return 0; + switch (m->mode) { case SVC_POOL_PERCPU: { - unsigned int cpu = m->pool_to[pidx]; - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - return 1; + set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + break; } case SVC_POOL_PERNODE: { - unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, nodecpumask); - return 1; + set_cpus_allowed_ptr(task, nodecpumask); + break; } } } @@ -578,47 +569,6 @@ out_enomem: } EXPORT_SYMBOL(svc_prepare_thread); -/* - * Create a thread in the given pool. Caller must hold BKL or another lock to - * serialize access to the svc_serv struct. On a NUMA or SMP machine, with a - * multi-pool serv, the thread will be restricted to run on the cpus belonging - * to the pool. 
- */ -static int -__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, - struct svc_pool *pool) -{ - struct svc_rqst *rqstp; - int error = -ENOMEM; - int have_oldmask = 0; - cpumask_t uninitialized_var(oldmask); - - rqstp = svc_prepare_thread(serv, pool); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - goto out; - } - - if (serv->sv_nrpools > 1) - have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); - - error = kernel_thread((int (*)(void *)) func, rqstp, 0); - - if (have_oldmask) - set_cpus_allowed(current, oldmask); - - if (error < 0) - goto out_thread; - svc_sock_update_bufs(serv); - error = 0; -out: - return error; - -out_thread: - svc_exit_thread(rqstp); - goto out; -} - /* * Choose a pool in which to create a new thread, for svc_set_num_threads */ @@ -688,7 +638,9 @@ found_pool: int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct task_struct *victim; + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; @@ -704,18 +656,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* create new threads */ while (nrservs > 0) { nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + rqstp = svc_prepare_thread(serv, chosen_pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + break; + } + __module_get(serv->sv_module); - error = __svc_create_thread(serv->sv_function, serv, - choose_pool(serv, pool, &state)); - if (error < 0) { + task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + if (IS_ERR(task)) { + error = PTR_ERR(task); module_put(serv->sv_module); + svc_exit_thread(rqstp); break; } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); } /* destroy old threads */ while (nrservs < 0 && - (victim = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, victim, 1); + (task = choose_victim(serv, pool, &state)) != NULL) { + send_sig(serv->sv_kill_signal, task, 1); nrservs++; } -- cgit v1.2.3 From a75c5d01e4235a7dd785548ac756f248b1b40107 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:39 -0400 Subject: sunrpc: remove sv_kill_signal field from svc_serv struct Since we no longer make any distinction between shutdown signals with nfsd, then it becomes easier to just standardize on a particular signal to use to bring it down (SIGINT, in this case). Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 3 +-- include/linux/sunrpc/svc.h | 5 ++--- net/sunrpc/svc.c | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9e215681371..26c81149d49 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -236,8 +236,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - nfsd_last_thread, nfsd, SIGINT, - THIS_MODULE); + nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 011d6d8100d..dc69068d94c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -80,7 +80,6 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ - int sv_kill_signal; /* signal to kill threads */ }; /* @@ -388,8 +387,8 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv*), - svc_thread_fn, int sig, struct module *); + void (*shutdown)(struct svc_serv*), svc_thread_fn, + struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 03a9f1a9e75..5a32cb7c4bb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -434,7 +434,7 @@ EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, void (*shutdown)(struct svc_serv *serv), - svc_thread_fn func, int sig, struct module *mod) + svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); @@ -443,7 +443,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, if (serv != NULL) { serv->sv_function = func; - serv->sv_kill_signal = sig; serv->sv_module = mod; } @@ -683,7 +682,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* destroy old threads */ while (nrservs < 0 && (task = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, task, 1); + send_sig(SIGINT, task, 1); nrservs++; } -- cgit v1.2.3 From d00953a53e9a2edbe005c1e596f1e96a8a293401 Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Wed, 30 Apr 2008 12:45:53 -0400 Subject: gss_krb5: create a define for token header size and clean up ptr location cleanup: Document token header size with a #define instead of open-coding it. Don't needlessly increment "ptr" past the beginning of the header which makes the values passed to functions more understandable and eliminates the need for the extra "krb5_hdr" pointer. Clean up some white-space issues flagged by checkpatch.pl. This leaves the checksum length hard-coded at 8 for DES. A later patch cleans that up. Signed-off-by: Kevin Coffman Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/gss_krb5.h | 3 +++ net/sunrpc/auth_gss/gss_krb5_seal.c | 26 +++++++++--------- net/sunrpc/auth_gss/gss_krb5_unseal.c | 16 +++++------ net/sunrpc/auth_gss/gss_krb5_wrap.c | 50 +++++++++++++++++------------------ 4 files changed, 49 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index a10f1fb0bf7..e7bbdba474d 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -51,6 +51,9 @@ struct krb5_ctx { extern spinlock_t krb5_seq_lock; +/* The length of the Kerberos GSS token header */ +#define GSS_KRB5_TOK_HDR_LEN (16) + #define KG_TOK_MIC_MSG 0x0101 #define KG_TOK_WRAP_MSG 0x0201 diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 5f1d36dfbcf..b8f42ef7178 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; u32 seq_send; @@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, now = get_seconds(); - token->len = g_token_size(&ctx->mech_used, 24); + token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ptr = token->data; - g_make_token_header(&ctx->mech_used, 24, &ptr); + g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); - if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send++; spin_unlock(&krb5_seq_lock); if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8)) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, + ptr + 8)) return GSS_S_FAILURE; return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index d91a5d00480..066ec73c84d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, read_token->len)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || - (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? 
*/ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_NONE) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum)) + if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, /* do sequencing checks */ - if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum)) + if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) return GSS_S_FAILURE; if ((ctx->initiate && direction != 0xff) || diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index b00b1b42630..283cb25c623 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -122,7 +122,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, char cksumdata[16]; struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; int blocksize = 0, plainlen; - unsigned char *ptr, *krb5_hdr, *msg_start; + unsigned char *ptr, *msg_start; s32 now; int headlen; struct page **tmp_pages; @@ -149,26 +149,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, buf->len += headlen; BUG_ON((buf->len - offset - headlen) % blocksize); - g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr); + g_make_token_header(&kctx->mech_used, + GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); - *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); - *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr - 2; - msg_start = krb5_hdr + 24; + msg_start = ptr + 24; - *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); - memset(krb5_hdr + 4, 0xff, 4); - *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES); + *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); + memset(ptr + 4, 0xff, 4); + *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); make_confounder(msg_start, blocksize); /* XXXJBF: UGH!: */ tmp_pages = buf->pages; buf->pages = pages; - if (make_checksum("md5", krb5_hdr, 8, buf, + if (make_checksum("md5", ptr, 8, buf, offset + headlen - blocksize, &md5cksum)) return GSS_S_FAILURE; buf->pages = tmp_pages; @@ -176,7 +176,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); spin_lock(&krb5_seq_lock); seq_send = kctx->seq_send++; @@ -185,7 +185,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset, /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 
0 : 0xff, - seq_send, krb5_hdr + 16, krb5_hdr + 8))) + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) return GSS_S_FAILURE; if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, @@ -219,38 +219,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) buf->len - offset)) return GSS_S_DEFECTIVE_TOKEN; - if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || - (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) return GSS_S_DEFECTIVE_TOKEN; /* XXX sanity-check bodysize?? */ /* get the sign and seal algorithms */ - signalg = ptr[0] + (ptr[1] << 8); + signalg = ptr[2] + (ptr[3] << 8); if (signalg != SGN_ALG_DES_MAC_MD5) return GSS_S_DEFECTIVE_TOKEN; - sealalg = ptr[2] + (ptr[3] << 8); + sealalg = ptr[4] + (ptr[5] << 8); if (sealalg != SEAL_ALG_DES) return GSS_S_DEFECTIVE_TOKEN; - if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) return GSS_S_DEFECTIVE_TOKEN; if (gss_decrypt_xdr_buf(kctx->enc, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) return GSS_S_DEFECTIVE_TOKEN; - if (make_checksum("md5", ptr - 2, 8, buf, - ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) + if (make_checksum("md5", ptr, 8, buf, + ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) return GSS_S_FAILURE; if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, md5cksum.data, md5cksum.len)) return GSS_S_FAILURE; - if (memcmp(md5cksum.data + 8, ptr + 14, 8)) + if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) return GSS_S_BAD_SIG; /* it got through unscathed. Make sure the context is unexpired */ @@ -262,8 +262,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) /* do sequencing checks */ - if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, - &seqnum)) + if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, + &direction, &seqnum)) return GSS_S_BAD_SIG; if ((kctx->initiate && direction != 0xff) || @@ -274,7 +274,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) * better to copy and encrypt at the same time. */ blocksize = crypto_blkcipher_blocksize(kctx->enc); - data_start = ptr + 22 + blocksize; + data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; orig_start = buf->head[0].iov_base + offset; data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; memmove(orig_start, data_start, data_len); -- cgit v1.2.3 From db8add57898751b9c0ff93ead25f20d21da5ddd0 Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Wed, 30 Apr 2008 12:45:58 -0400 Subject: gss_krb5: move gss_krb5_crypto into the krb5 module The gss_krb5_crypto.o object belongs in the rpcsec_gss_krb5 module. Also, there is no need to export symbols from gss_krb5_crypto.c Signed-off-by: Kevin Coffman Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/Makefile | 4 ++-- net/sunrpc/auth_gss/gss_krb5_crypto.c | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index f3431a7e33d..4de8bcf26fa 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,12 +5,12 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ - gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o + gss_mech_switch.o svcauth_gss.o obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o + gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 1d52308ca32..c93fca20455 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -83,8 +83,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_encrypt); - u32 krb5_decrypt( struct crypto_blkcipher *tfm, @@ -118,8 +116,6 @@ out: return ret; } -EXPORT_SYMBOL(krb5_decrypt); - static int checksummer(struct scatterlist *sg, void *data) { @@ -161,8 +157,6 @@ out: return err ? GSS_S_FAILURE : 0; } -EXPORT_SYMBOL(make_checksum); - struct encryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return ret; } -EXPORT_SYMBOL(gss_encrypt_xdr_buf); - struct decryptor_desc { u8 iv[8]; /* XXX hard-coded blocksize */ struct blkcipher_desc desc; @@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); } - -EXPORT_SYMBOL(gss_decrypt_xdr_buf); -- cgit v1.2.3 From 863a24882ed0a57ff25daaf39885f3a47b706e4b Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Wed, 30 Apr 2008 12:46:08 -0400 Subject: gss_krb5: Use random value to initialize confounder Initialize the value used for the confounder to a random value rather than starting from zero. Allow for confounders of length 8 or 16 (which will be needed for AES). Signed-off-by: Kevin Coffman Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_wrap.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 283cb25c623..ae8e69b59c4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -87,8 +87,8 @@ out: return 0; } -static inline void -make_confounder(char *p, int blocksize) +static void +make_confounder(char *p, u32 conflen) { static u64 i = 0; u64 *q = (u64 *)p; @@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize) * uniqueness would mean worrying about atomicity and rollover, and I * don't care enough. */ - BUG_ON(blocksize != 8); - *q = i++; + /* initialize to random value */ + if (i == 0) { + i = random32(); + i = (i << 32) | random32(); + } + + switch (conflen) { + case 16: + *q++ = i++; + /* fall through */ + case 8: + *q++ = i++; + break; + default: + BUG(); + } } /* Assumptions: the head and tail of inbuf are ours to play with. 
-- cgit v1.2.3 From ab96dddbedf4bb8a7a0fe44012efc1d99598c36f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 13:54:04 -0500 Subject: svcrdma: Add a type for keeping NFS RPC mapping Create a new data structure to hold the remote client address space to local server address space mapping. Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 27 +++++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma.c | 19 +++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 26 ++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 05eb4664d0d..bd8749cc808 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -86,6 +86,31 @@ struct svc_rdma_op_ctxt { struct page *pages[RPCSVC_MAXPAGES]; }; +/* + * NFS_ requests are mapped on the client side by the chunk lists in + * the RPCRDMA header. During the fetching of the RPC from the client + * and the writing of the reply to the client, the memory in the + * client and the memory in the server must be mapped as contiguous + * vaddr/len for access by the hardware. These data strucures keep + * these mappings. + * + * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the + * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the + * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' + * mapping of the reply. + */ +struct svc_rdma_chunk_sge { + int start; /* sge no for this chunk */ + int count; /* sge count for this chunk */ +}; +struct svc_rdma_req_map { + unsigned long count; + union { + struct kvec sge[RPCSVC_MAXPAGES]; + struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; + }; +}; + #define RDMACTXT_F_LAST_CTXT 2 struct svcxprt_rdma { @@ -173,6 +198,8 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); +extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); extern void svc_sq_reap(struct svcxprt_rdma *); extern void svc_rq_reap(struct svcxprt_rdma *); extern struct svc_xprt_class svc_rdma_class; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 88c0ca20bb1..171f2053e90 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -69,6 +69,9 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; +/* Temporary NFS request map cache */ +struct kmem_cache *svc_rdma_map_cachep; + /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. 
Any write to the file @@ -241,6 +244,7 @@ void svc_rdma_cleanup(void) svcrdma_table_header = NULL; } svc_unreg_xprt_class(&svc_rdma_class); + kmem_cache_destroy(svc_rdma_map_cachep); } int svc_rdma_init(void) @@ -255,9 +259,24 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + /* Create the temporary map cache */ + svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", + sizeof(struct svc_rdma_req_map), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_map_cachep) { + printk(KERN_INFO "Could not allocate map cache.\n"); + goto err; + } + /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); return 0; + + err: + unregister_sysctl_table(svcrdma_table_header); + return -ENOMEM; } MODULE_AUTHOR("Tom Tucker "); MODULE_DESCRIPTION("SVC RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e132509d1db..ae90758d8e9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -173,6 +173,32 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) atomic_dec(&xprt->sc_ctxt_used); } +/* Temporary NFS request map cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_map_cachep; + +/* + * Temporary NFS req mappings are shared across all transport + * instances. These are short lived and should be bounded by the number + * of concurrent server threads * depth of the SQ. + */ +struct svc_rdma_req_map *svc_rdma_get_req_map(void) +{ + struct svc_rdma_req_map *map; + while (1) { + map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); + if (map) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; + return map; +} + +void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +{ + kmem_cache_free(svc_rdma_map_cachep, map); +} + /* ib_cq event handler */ static void cq_event_handler(struct ib_event *event, void *context) { -- cgit v1.2.3 From 34d16e42a6ab74a4a4389c061dfa3c6609e08fa0 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 2 Jul 2008 14:56:13 -0500 Subject: svcrdma: Use RPC reply map for RDMA_WRITE processing Use the new svc_rdma_req_map data type for mapping the client side memory to the server side memory. Move the DMA mapping to the context pointed to by each WR individually so that it is unmapped after the WR completes. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 163 ++++++++++++++----------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 +- 2 files changed, 80 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fb82b1b683f..bdc11a30e93 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -63,52 +63,44 @@ * SGE[2..sge_count-2] data from xdr->pages[] * SGE[sge_count-1] data from xdr->tail. * + * The max SGE we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES + * reserves a page for both the request and the reply header, and this + * array is only concerned with the reply we are assured that we have + * on extra page for the RPCRMDA header. 
*/ -static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct ib_sge *sge, - int *sge_count) +static void xdr_to_sge(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { - /* Max we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header - */ int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; - u32 byte_count = xdr->len; u32 sge_bytes; u32 page_bytes; - int page_off; + u32 page_off; int page_no; + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; /* Head SGE */ - sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, - xdr->head[0].iov_base, - xdr->head[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; sge_no++; /* pages SGE */ page_no = 0; page_bytes = xdr->page_len; page_off = xdr->page_base; - while (byte_count && page_bytes) { - sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - xdr->pages[page_no], page_off, - sge_bytes, DMA_TO_DEVICE); - sge_bytes = min(sge_bytes, page_bytes); - byte_count -= sge_bytes; + while (page_bytes) { + vec->sge[sge_no].iov_base = + page_address(xdr->pages[page_no]) + page_off; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); page_bytes -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + vec->sge[sge_no].iov_len = sge_bytes; sge_no++; page_no++; @@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, } /* Tail SGE */ - if (byte_count && xdr->tail[0].iov_len) { - sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - xdr->tail[0].iov_base, - xdr->tail[0].iov_len, - DMA_TO_DEVICE); - sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); - byte_count -= sge_bytes; - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + if (xdr->tail[0].iov_len) { + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; sge_no++; } BUG_ON(sge_no > sge_max); - BUG_ON(byte_count != 0); - - *sge_count = sge_no; - return sge; + vec->count = sge_no; } - /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, u32 rmr, u64 to, u32 xdr_off, int write_len, - struct ib_sge *xdr_sge, int sge_count) + struct svc_rdma_req_map *vec) { - struct svc_rdma_op_ctxt *tmp_sge_ctxt; struct ib_send_wr write_wr; struct ib_sge *sge; int xdr_sge_no; @@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, int sge_off; int bc; struct svc_rdma_op_ctxt *ctxt; - int ret = 0; - BUG_ON(sge_count > RPCSVC_MAXPAGES); + BUG_ON(vec->count > RPCSVC_MAXPAGES); dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, xdr_sge=%p, sge_count=%d\n", + "write_len=%d, vec->sge=%p, vec->count=%lu\n", rmr, (unsigned long long)to, xdr_off, - write_len, xdr_sge, sge_count); + write_len, vec->sge, vec->count); ctxt = svc_rdma_get_context(xprt); - ctxt->count = 0; - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; + ctxt->direction = 
DMA_TO_DEVICE; + sge = ctxt->sge; /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; xdr_sge_no++) { - if (xdr_sge[xdr_sge_no].length > bc) + if (vec->sge[xdr_sge_no].iov_len > bc) break; - bc -= xdr_sge[xdr_sge_no].length; + bc -= vec->sge[xdr_sge_no].iov_len; } sge_off = bc; @@ -180,21 +158,27 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_no = 0; /* Copy the remaining SGE */ - while (bc != 0 && xdr_sge_no < sge_count) { - sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; - sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; + while (bc != 0 && xdr_sge_no < vec->count) { + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; sge_bytes = min((size_t)bc, - (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); + (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); sge[sge_no].length = sge_bytes; - + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *) + vec->sge[xdr_sge_no].iov_base + sge_off, + sge_bytes, DMA_TO_DEVICE); + if (dma_mapping_error(sge[sge_no].addr)) + goto err; sge_off = 0; sge_no++; + ctxt->count++; xdr_sge_no++; bc -= sge_bytes; } BUG_ON(bc != 0); - BUG_ON(xdr_sge_no > sge_count); + BUG_ON(xdr_sge_no > vec->count); /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); @@ -209,21 +193,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, /* Post It */ atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr)) { - svc_rdma_put_context(ctxt, 1); - /* Fatal error, close transport */ - ret = -EIO; - } - svc_rdma_put_context(tmp_sge_ctxt, 0); - return ret; + if (svc_rdma_send(xprt, &write_wr)) + goto err; + return 0; + err: + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; } static int send_write_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; int write_len; @@ -269,8 +252,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -292,8 +274,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_argp, struct rpcrdma_msg *rdma_resp, struct svc_rqst *rqstp, - struct ib_sge *sge, - int sge_count) + struct svc_rdma_req_map *vec) { u32 xfer_len = rqstp->rq_res.len; int write_len; @@ -341,8 +322,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, rs_offset + chunk_off, xdr_off, this_write, - sge, - sge_count); + vec); if (ret) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); @@ -380,7 +360,7 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_op_ctxt *ctxt, - int sge_count, + struct svc_rdma_req_map *vec, int byte_count) { struct ib_send_wr send_wr; @@ -413,10 +393,15 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; /* Determine how many of our SGE are to be transmitted */ - for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { - sge_bytes = min((size_t)ctxt->sge[sge_no].length, - (size_t)byte_count); + for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 
byte_count -= sge_bytes; + ctxt->sge[sge_no].addr = + ib_dma_map_single(rdma->sc_cm_id->device, + vec->sge[sge_no].iov_base, + sge_bytes, DMA_TO_DEVICE); + ctxt->sge[sge_no].length = sge_bytes; + ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey; } BUG_ON(byte_count != 0); @@ -428,8 +413,10 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; + /* If there are more pages than SGE, terminate SGE list */ + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; } - BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -473,20 +460,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) enum rpcrdma_proc reply_type; int ret; int inline_bytes; - struct ib_sge *sge; - int sge_count = 0; struct page *res_page; struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); /* Get the RDMA request header. */ rdma_argp = xdr_start(&rqstp->rq_arg); - /* Build an SGE for the XDR */ + /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); + vec = svc_rdma_get_req_map(); + xdr_to_sge(rdma, &rqstp->rq_res, vec); inline_bytes = rqstp->rq_res.len; @@ -503,7 +490,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Send any write-chunk data and build resp write-list */ ret = send_write_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", ret); @@ -513,7 +500,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Send any reply-list data and update resp reply-list */ ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, - rqstp, sge, sge_count); + rqstp, vec); if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", ret); @@ -521,11 +508,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } inline_bytes -= ret; - ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); + svc_rdma_put_req_map(vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; error: + svc_rdma_put_req_map(vec); svc_rdma_put_context(ctxt, 0); put_page(res_page); return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ae90758d8e9..fc86338bcbb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -387,10 +387,13 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) switch (ctxt->wr_op) { case IB_WR_SEND: - case IB_WR_RDMA_WRITE: svc_rdma_put_context(ctxt, 1); break; + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; -- cgit v1.2.3 From f820c57ebf5493d4602cc00577c8b0fadd27a7b8 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 27 May 2008 17:03:14 -0500 Subject: svcrdma: Use reply and chunk map for RDMA_READ processing Modify the RDMA_READ processing to use the reply and chunk list mapping data types. Also add a special purpose 'hdr_count' field in in the context to hold the header page count instead of overloading the SGE length field and corrupting the DMA map length. 
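
In outline, the read side's use of the request-map type added earlier in this series looks roughly like the following (a condensed sketch of the hunks below, with error handling and the per-chunk RDMA_READ posting elided):

    struct svc_rdma_req_map *rpl_map, *chl_map;

    /* temporary maps, drawn from the shared map cache */
    rpl_map = svc_rdma_get_req_map();
    chl_map = svc_rdma_get_req_map();

    /* build a kvec view of the read list; chl_map->ch[n] records which
     * range of rpl_map->sge[] belongs to chunk n */
    sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
                                rpl_map, chl_map, ch_count, byte_count);

    /* ... post RDMA_READ WRs per chunk, DMA-mapping the kvecs into each
     * WR's own context via rdma_set_ctxt_sge() ... */

    svc_rdma_put_req_map(rpl_map);
    svc_rdma_put_req_map(chl_map);
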
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 83 +++++++++++++++------------------ 2 files changed, 39 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index bd8749cc808..fd5e8a1c17d 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -72,6 +72,7 @@ extern atomic_t rdma_stat_sq_prod; */ struct svc_rdma_op_ctxt { struct svc_rdma_op_ctxt *read_hdr; + int hdr_count; struct list_head free_list; struct xdr_buf arg; struct list_head dto_q; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 06ab4841537..d25971b42a7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -struct chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; - /* Encode a read-chunk-list as an array of IB SGE * * Assumptions: @@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, struct rpcrdma_msg *rmsgp, - struct ib_sge *sge, - struct chunk_sge *ch_sge_ary, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, int ch_count, int byte_count) { @@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; head->arg.pages = &head->pages[head->count]; - head->sge[0].length = head->count; /* save count of hdr pages */ + head->hdr_count = head->count; /* save count of hdr pages */ head->arg.page_base = 0; head->arg.page_len = ch_bytes; head->arg.len = rqstp->rq_arg.len + ch_bytes; head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; head->count++; - ch_sge_ary[0].start = 0; + chl_map->ch[0].start = 0; while (byte_count) { + rpl_map->sge[sge_no].iov_base = + page_address(rqstp->rq_arg.pages[page_no]) + page_off; sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - sge[sge_no].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], - page_off, sge_bytes, - DMA_FROM_DEVICE); - sge[sge_no].length = sge_bytes; - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + rpl_map->sge[sge_no].iov_len = sge_bytes; /* * Don't bump head->count here because the same page * may be used by multiple SGE. 
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, * SGE, move to the next SGE */ if (ch_bytes == 0) { - ch_sge_ary[ch_no].count = - sge_no - ch_sge_ary[ch_no].start; + chl_map->ch[ch_no].count = + sge_no - chl_map->ch[ch_no].start; ch_no++; ch++; - ch_sge_ary[ch_no].start = sge_no; + chl_map->ch[ch_no].start = sge_no; ch_bytes = ch->rc_target.rs_length; /* If bytes remaining account for next chunk */ if (byte_count) { @@ -220,18 +211,24 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, return sge_no; } -static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, - struct ib_sge *sge, +static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct kvec *vec, u64 *sgl_offset, int count) { int i; ctxt->count = count; + ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { - ctxt->sge[i].addr = sge[i].addr; - ctxt->sge[i].length = sge[i].length; - *sgl_offset = *sgl_offset + sge[i].length; + ctxt->sge[i].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + vec[i].iov_base, vec[i].iov_len, + DMA_FROM_DEVICE); + ctxt->sge[i].length = vec[i].iov_len; + ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey; + *sgl_offset = *sgl_offset + vec[i].iov_len; } } @@ -282,34 +279,29 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, struct ib_send_wr read_wr; int err = 0; int ch_no; - struct ib_sge *sge; int ch_count; int byte_count; int sge_count; u64 sgl_offset; struct rpcrdma_read_chunk *ch; struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_op_ctxt *tmp_sge_ctxt; - struct svc_rdma_op_ctxt *tmp_ch_ctxt; - struct chunk_sge *ch_sge_ary; + struct svc_rdma_req_map *rpl_map; + struct svc_rdma_req_map *chl_map; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); if (!ch) return 0; - /* Allocate temporary contexts to keep SGE */ - BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); - tmp_sge_ctxt = svc_rdma_get_context(xprt); - sge = tmp_sge_ctxt->sge; - tmp_ch_ctxt = svc_rdma_get_context(xprt); - ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; + /* Allocate temporary reply and chunk maps */ + rpl_map = svc_rdma_get_req_map(); + chl_map = svc_rdma_get_req_map(); svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, - sge, ch_sge_ary, + rpl_map, chl_map, ch_count, byte_count); sgl_offset = 0; ch_no = 0; @@ -331,14 +323,15 @@ next_sge: read_wr.wr.rdma.remote_addr = get_unaligned(&(ch->rc_target.rs_offset)) + sgl_offset; - read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; + read_wr.sg_list = ctxt->sge; read_wr.num_sge = - rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); - rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], + rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); + rdma_set_ctxt_sge(xprt, ctxt, + &rpl_map->sge[chl_map->ch[ch_no].start], &sgl_offset, read_wr.num_sge); if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == ch_sge_ary[ch_no].count)) { + (read_wr.num_sge == chl_map->ch[ch_no].count)) { /* * Mark the last RDMA_READ with a bit to * indicate all RPC data has been fetched from @@ -358,9 +351,9 @@ next_sge: } atomic_inc(&rdma_stat_read); - if (read_wr.num_sge < ch_sge_ary[ch_no].count) { - ch_sge_ary[ch_no].count -= read_wr.num_sge; - ch_sge_ary[ch_no].start += read_wr.num_sge; + if (read_wr.num_sge < chl_map->ch[ch_no].count) { + chl_map->ch[ch_no].count -= read_wr.num_sge; + chl_map->ch[ch_no].start += read_wr.num_sge; goto next_sge; } sgl_offset = 0; @@ -368,8 +361,8 
@@ next_sge: } out: - svc_rdma_put_context(tmp_sge_ctxt, 0); - svc_rdma_put_context(tmp_ch_ctxt, 0); + svc_rdma_put_req_map(rpl_map); + svc_rdma_put_req_map(chl_map); /* Detach arg pages. svc_recv will replenish them */ for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) @@ -399,7 +392,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; -- cgit v1.2.3 From e6ab9143719ff76f0b95f0866c4d0f6c743ad2e0 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 12:08:48 -0500 Subject: svcrdma: Move the DMA unmap logic to the CQ handler Separate DMA unmap from context destruction and perform DMA unmapping in the SQ/RQ CQ reap functions. This is necessary to support software based RDMA implementations that actually copy the data in their ib_dma_unmap callback functions and architectures that don't have cache coherent I/O busses. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fc86338bcbb..7e8ee66458e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -150,6 +150,18 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) return ctxt; } +static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +{ + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } +} + void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { struct svcxprt_rdma *xprt; @@ -161,12 +173,6 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - for (i = 0; i < ctxt->count; i++) - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - spin_lock_bh(&xprt->sc_ctxt_lock); list_add(&ctxt->free_list, &xprt->sc_ctxt_free); spin_unlock_bh(&xprt->sc_ctxt_lock); @@ -328,6 +334,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; ctxt->byte_len = wc.byte_len; + svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); @@ -377,6 +384,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; xprt = ctxt->xprt; + svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); -- cgit v1.2.3 From 87295b6c5c7fd7bbc0ce3e7f42d2adbbac7352b9 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 13:17:44 -0500 Subject: svcrdma: Add dma map count and WARN_ON Add a dma map count in order to verify that all DMA mapping resources have been freed when the transport is closed. 
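
The accounting pattern the hunks below add is, in sketch form (call sites abbreviated; the full set is in the diff):

    /* at each ib_dma_map_single()/ib_dma_map_page() call site */
    atomic_inc(&xprt->sc_dma_used);
    ctxt->sge[i].addr = ib_dma_map_single(xprt->sc_cm_id->device,
                                          vec[i].iov_base, vec[i].iov_len,
                                          DMA_FROM_DEVICE);

    /* svc_rdma_unmap_dma(): one decrement per mapped sge */
    for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
        atomic_dec(&xprt->sc_dma_used);
        ib_dma_unmap_single(xprt->sc_cm_id->device, ctxt->sge[i].addr,
                            ctxt->sge[i].length, ctxt->direction);
    }

    /* __svc_rdma_free(): all references are gone, so the count must
     * have returned to zero */
    WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
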
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 3 +++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 +++++ 4 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index fd5e8a1c17d..ab93afc03c4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -130,6 +130,7 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; + atomic_t sc_dma_used; atomic_t sc_ctxt_used; struct list_head sc_ctxt_free; int sc_ctxt_cnt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index d25971b42a7..b4b17f44cb2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -222,6 +222,7 @@ static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, ctxt->count = count; ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { + atomic_inc(&xprt->sc_dma_used); ctxt->sge[i].addr = ib_dma_map_single(xprt->sc_cm_id->device, vec[i].iov_base, vec[i].iov_len, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index bdc11a30e93..a19b22b452a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -163,6 +163,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_bytes = min((size_t)bc, (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); sge[sge_no].length = sge_bytes; + atomic_inc(&xprt->sc_dma_used); sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, (void *) @@ -385,6 +386,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ + atomic_inc(&rdma->sc_dma_used); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_TO_DEVICE); @@ -396,6 +398,7 @@ static int send_reply(struct svcxprt_rdma *rdma, for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; + atomic_inc(&rdma->sc_dma_used); ctxt->sge[sge_no].addr = ib_dma_map_single(rdma->sc_cm_id->device, vec->sge[sge_no].iov_base, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 7e8ee66458e..6fddd588c03 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -155,6 +155,7 @@ static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) struct svcxprt_rdma *xprt = ctxt->xprt; int i; for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_single(xprt->sc_cm_id->device, ctxt->sge[i].addr, ctxt->sge[i].length, @@ -519,6 +520,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, cma_xprt->sc_max_requests = svcrdma_max_requests; cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; atomic_set(&cma_xprt->sc_sq_count, 0); + atomic_set(&cma_xprt->sc_ctxt_used, 0); if (!listener) { int reqs = cma_xprt->sc_max_requests; @@ -569,6 +571,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; + atomic_inc(&xprt->sc_dma_used); pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); @@ -1049,6 +1052,7 @@ static void __svc_rdma_free(struct work_struct *work) /* Warn if we leaked a resource or under-referenced 
*/ WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1169,6 +1173,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ + atomic_inc(&xprt->sc_dma_used); sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); sge.lkey = xprt->sc_phys_mr->lkey; -- cgit v1.2.3 From 94dba4918d4570bfa98776e54a5fa527c848dc78 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 13:20:24 -0500 Subject: svcrdma: Remove unneeded spin locks from __svc_rdma_free At the time __svc_rdma_free is called, we are guaranteed that all references to this transport are gone. There is, therefore, no need to protect the resource lists with a spin lock. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6fddd588c03..7647789a1f6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1027,7 +1027,6 @@ static void __svc_rdma_free(struct work_struct *work) * cm_id because the device ptr is needed to unmap the dma in * svc_rdma_put_context. */ - spin_lock_bh(&rdma->sc_read_complete_lock); while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_read_complete_q.next, @@ -1036,10 +1035,8 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_read_complete_lock); /* Destroy queued, but not processed recv completions */ - spin_lock_bh(&rdma->sc_rq_dto_lock); while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_entry(rdma->sc_rq_dto_q.next, @@ -1048,7 +1045,6 @@ static void __svc_rdma_free(struct work_struct *work) list_del_init(&ctxt->dto_q); svc_rdma_put_context(ctxt, 1); } - spin_unlock_bh(&rdma->sc_rq_dto_lock); /* Warn if we leaked a resource or under-referenced */ WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); -- cgit v1.2.3 From 36ef25e464dbc5820c22a9b2f619b787eda594df Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Mon, 19 May 2008 19:00:24 -0500 Subject: svcrdma: Limit ORD based on client's advertised IRD When adapters have differing IRD limits, the RDMA transport will fail to connect properly. The RDMA transport should use the client's advertised inbound read limit when computing its outbound read limit. For iWARP transports, there is currently no standard for exchanging IRD/ORD during connection establishment so the 'responder_resources' field in the connect event is the local device's limit. The RDMA transport can be configured to use a smaller ORD by writing the desired number to the /proc/sys/sunrpc/svc_rdma/max_outbound_read_requests file. 
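Concretely, the accepted connection's outbound read depth becomes the minimum of three values: the IRD the client advertised in the connect request, the local device's max_qp_rd_atom, and the administrator-configured svcrdma limit. A hedged sketch of that clamp as a standalone helper (parameter names are illustrative, not the exact svcrdma variables):

    #include <linux/kernel.h>

    /*
     * Clamp the outbound RDMA read depth (ORD): never exceed what the
     * peer advertised, what the local device supports, or what the
     * administrator configured via the sysctl file above.
     */
    static unsigned int compute_ord(unsigned int client_ird,
                                    unsigned int dev_max_qp_rd_atom,
                                    unsigned int configured_max)
    {
            unsigned int ord = client_ird;

            ord = min(ord, dev_max_qp_rd_atom);
            ord = min(ord, configured_max);
            return ord;
    }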
Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 7647789a1f6..80104f4999d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -606,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -623,6 +623,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = client_ird; + /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); @@ -659,7 +662,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); - handle_connect_req(cma_id); + handle_connect_req(cma_id, + event->param.conn.responder_resources); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -833,8 +837,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; - newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, - (size_t)svcrdma_ord); + /* + * Limit ORD based on client limit, local device limit, and + * configured svcrdma limit. + */ + newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); if (IS_ERR(newxprt->sc_pd)) { -- cgit v1.2.3 From 902a94e0889be1f9fcefc0e1b602b06136e01812 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 13:57:05 -0500 Subject: svcrdma: Add flush_scheduled_work to module exit function Make certain all transports pending free are flushed from the wq before unloading the module. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 171f2053e90..527acfd3828 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -239,6 +239,7 @@ static ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + flush_scheduled_work(); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; -- cgit v1.2.3 From bf5927d84e70d522f234ca247b27d27c63878b93 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 14:05:54 -0500 Subject: svcrdma: Create a kmem cache for the WR contexts Create a kmem cache to hold WR contexts. Next we will convert the WR context get and put services to use this kmem cache. 
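The lifecycle is the standard one for a module-owned slab cache, and it dovetails with the flush_scheduled_work() fix above: the exit path must first drain any deferred transport-free work, then tear the cache down. A minimal sketch under those assumptions, with a hypothetical context struct and cache name standing in for the svcrdma ones:

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct my_ctxt {
            int count;
            /* ... per-WR state ... */
    };

    static struct kmem_cache *my_ctxt_cachep;

    static int __init my_init(void)
    {
            /* One cache shared by all transports; objects cache-line aligned. */
            my_ctxt_cachep = kmem_cache_create("my_ctxt_cache",
                                               sizeof(struct my_ctxt),
                                               0, SLAB_HWCACHE_ALIGN, NULL);
            if (!my_ctxt_cachep)
                    return -ENOMEM;
            return 0;
    }

    static void __exit my_exit(void)
    {
            /* Let deferred transport teardown finish before the cache goes. */
            flush_scheduled_work();
            kmem_cache_destroy(my_ctxt_cachep);
    }

    module_init(my_init);
    module_exit(my_exit);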
Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 527acfd3828..87101177825 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -69,8 +69,9 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map cache */ +/* Temporary NFS request map and context caches */ struct kmem_cache *svc_rdma_map_cachep; +struct kmem_cache *svc_rdma_ctxt_cachep; /* * This function implements reading and resetting an atomic_t stat @@ -246,6 +247,7 @@ void svc_rdma_cleanup(void) } svc_unreg_xprt_class(&svc_rdma_class); kmem_cache_destroy(svc_rdma_map_cachep); + kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) @@ -268,14 +270,27 @@ int svc_rdma_init(void) NULL); if (!svc_rdma_map_cachep) { printk(KERN_INFO "Could not allocate map cache.\n"); - goto err; + goto err0; + } + + /* Create the temporary context cache */ + svc_rdma_ctxt_cachep = + kmem_cache_create("svc_rdma_ctxt_cache", + sizeof(struct svc_rdma_op_ctxt), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_ctxt_cachep) { + printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); + goto err1; } /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); return 0; - - err: + err1: + kmem_cache_destroy(svc_rdma_map_cachep); + err0: unregister_sysctl_table(svcrdma_table_header); return -ENOMEM; } -- cgit v1.2.3 From 8948896c9e098c6fd31a6a698a598a7cbd7fa40e Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 28 May 2008 15:14:02 -0500 Subject: svcrdma: Change WR context get/put to use the kmem cache Change the WR context pool to be shared across mount points. This reduces the RDMA transport memory footprint significantly since idle mounts don't consume WR context memory. 
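The conversion keeps the old never-fail semantics of the get path: rather than propagating an allocation failure, it sleeps briefly and retries, much as the per-transport pool used to when it could not grow. A sketch of that get/put pairing against the shared cache, reusing the hypothetical names from the sketch above:

    #include <linux/sched.h>
    #include <linux/jiffies.h>
    #include <linux/slab.h>

    /* Allocate a WR context, backing off under memory pressure. */
    static struct my_ctxt *my_get_ctxt(void)
    {
            struct my_ctxt *ctxt;

            while (1) {
                    ctxt = kmem_cache_alloc(my_ctxt_cachep, GFP_KERNEL);
                    if (ctxt)
                            break;
                    /* No memory right now; wait 500ms and retry. */
                    schedule_timeout_uninterruptible(msecs_to_jiffies(500));
            }
            ctxt->count = 0;
            return ctxt;
    }

    /* Return a context to the shared cache. */
    static void my_put_ctxt(struct my_ctxt *ctxt)
    {
            kmem_cache_free(my_ctxt_cachep, ctxt);
    }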
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 6 -- net/sunrpc/xprtrdma/svc_rdma_transport.c | 121 +++---------------------------- 2 files changed, 12 insertions(+), 115 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d8d74c4ab50..ef2e3a20bf3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -73,7 +73,6 @@ extern atomic_t rdma_stat_sq_prod; struct svc_rdma_op_ctxt { struct svc_rdma_op_ctxt *read_hdr; int hdr_count; - struct list_head free_list; struct xdr_buf arg; struct list_head dto_q; enum ib_wr_opcode wr_op; @@ -131,11 +130,6 @@ struct svcxprt_rdma { atomic_t sc_dma_used; atomic_t sc_ctxt_used; - struct list_head sc_ctxt_free; - int sc_ctxt_cnt; - int sc_ctxt_bump; - int sc_ctxt_max; - spinlock_t sc_ctxt_lock; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 80104f4999d..19ddc382b77 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -84,69 +84,23 @@ struct svc_xprt_class svc_rdma_class = { .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; -static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) -{ - int target; - int at_least_one = 0; - struct svc_rdma_op_ctxt *ctxt; - - target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, - xprt->sc_ctxt_max); - - spin_lock_bh(&xprt->sc_ctxt_lock); - while (xprt->sc_ctxt_cnt < target) { - xprt->sc_ctxt_cnt++; - spin_unlock_bh(&xprt->sc_ctxt_lock); - - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - - spin_lock_bh(&xprt->sc_ctxt_lock); - if (ctxt) { - at_least_one = 1; - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - } else { - /* kmalloc failed...give up for now */ - xprt->sc_ctxt_cnt--; - break; - } - } - spin_unlock_bh(&xprt->sc_ctxt_lock); - dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", - xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); - return at_least_one; -} +/* WR context cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_ctxt_cachep; struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt; while (1) { - spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(list_empty(&xprt->sc_ctxt_free))) { - /* Try to bump my cache. 
*/ - spin_unlock_bh(&xprt->sc_ctxt_lock); - - if (rdma_bump_context_cache(xprt)) - continue; - - printk(KERN_INFO "svcrdma: sleeping waiting for " - "context memory on xprt=%p\n", - xprt); - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - continue; - } - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - spin_unlock_bh(&xprt->sc_ctxt_lock); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); - ctxt->count = 0; - atomic_inc(&xprt->sc_ctxt_used); - break; + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); + if (ctxt) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + atomic_inc(&xprt->sc_ctxt_used); return ctxt; } @@ -174,9 +128,7 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); atomic_dec(&xprt->sc_ctxt_used); } @@ -461,40 +413,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) tasklet_schedule(&dto_tasklet); } -static void create_context_cache(struct svcxprt_rdma *xprt, - int ctxt_count, int ctxt_bump, int ctxt_max) -{ - struct svc_rdma_op_ctxt *ctxt; - int i; - - xprt->sc_ctxt_max = ctxt_max; - xprt->sc_ctxt_bump = ctxt_bump; - xprt->sc_ctxt_cnt = 0; - atomic_set(&xprt->sc_ctxt_used, 0); - - INIT_LIST_HEAD(&xprt->sc_ctxt_free); - for (i = 0; i < ctxt_count; i++) { - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt) { - INIT_LIST_HEAD(&ctxt->free_list); - list_add(&ctxt->free_list, &xprt->sc_ctxt_free); - xprt->sc_ctxt_cnt++; - } - } -} - -static void destroy_context_cache(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_ctxt_free)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(xprt->sc_ctxt_free.next, - struct svc_rdma_op_ctxt, - free_list); - list_del_init(&ctxt->free_list); - kfree(ctxt); - } -} - static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, int listener) { @@ -511,7 +429,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_read_complete_lock); - spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -522,20 +439,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, atomic_set(&cma_xprt->sc_sq_count, 0); atomic_set(&cma_xprt->sc_ctxt_used, 0); - if (!listener) { - int reqs = cma_xprt->sc_max_requests; - create_context_cache(cma_xprt, - reqs << 1, /* starting size */ - reqs, /* bump amount */ - reqs + - cma_xprt->sc_sq_depth + - RPCRDMA_MAX_THREADS + 1); /* max */ - if (list_empty(&cma_xprt->sc_ctxt_free)) { - kfree(cma_xprt); - return NULL; - } - clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); - } else + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); return cma_xprt; @@ -1077,7 +981,6 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); - destroy_context_cache(rdma); kfree(rdma); } -- cgit v1.2.3
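Taken together, the series leaves the completion path with a simple shape: the CQ reap unmaps DMA while the device pointer is still at hand, then recycles the context into the shared slab cache. A rough sketch of that flow, reusing the hypothetical helpers from the sketches above (addr, len, and dir stand in for the SGE bookkeeping the real code keeps in the context):

    /* CQ-reap teardown: unmap first, then recycle the context. */
    static void my_reap_one(struct my_xprt *x, struct my_ctxt *ctxt,
                            u64 addr, size_t len,
                            enum dma_data_direction dir)
    {
            /*
             * Unmapping at reap time (not at context free) lets software
             * RDMA devices that copy data in their unmap callback see
             * the data before the buffer is reused.
             */
            my_dma_unmap(x, addr, len, dir);
            my_put_ctxt(ctxt);
    }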