author     Linus Torvalds <torvalds@linux-foundation.org>  2026-04-16 08:25:04 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2026-04-16 08:25:04 -0700
commit     3cd8b194bf3428dfa53120fee47e827a7c495815 (patch)
tree       804f42c1a41ce5579a255e9e373fbfcb1419089b /fs
parent     d3d9443f8bac799340bb04db51ef4ababc4f7267 (diff)
parent     d09a040c186a2083b1cfa9c3c112782ce4b1f6d4 (diff)
Merge tag 'v7.1-rc-part1-smbdirect-fixes' of git://git.samba.org/ksmbd (HEAD, master)
Pull smbdirect updates from Steve French:
 "Move smbdirect server and client code to common directory:

   - temporary use of smbdirect_all_c_files.c to allow micro steps
   - factor out common functions into a smbdirect.ko
   - convert cifs.ko to use smbdirect.ko
   - convert ksmbd.ko to use smbdirect.ko
   - let smbdirect.ko use global workqueues
   - move ib_client logic from ksmbd.ko into smbdirect.ko
   - remove smbdirect_all_c_files.c hack again
   - some locking and teardown related fixes on top"

* tag 'v7.1-rc-part1-smbdirect-fixes' of git://git.samba.org/ksmbd: (145 commits)
  smb: smbdirect: let smbdirect_connection_deregister_mr_io unlock while waiting
  smb: smbdirect: fix the logic in smbdirect_socket_destroy_sync() without an error
  smb: smbdirect: fix copyright header of smbdirect.h
  smb: smbdirect: change smbdirect_socket_parameters.{initiator_depth,responder_resources} to __u16
  smb: smbdirect: remove unused SMBDIRECT_USE_INLINE_C_FILES logic
  smb: server: no longer use smbdirect_socket_set_custom_workqueue()
  smb: client: no longer use smbdirect_socket_set_custom_workqueue()
  smb: smbdirect: introduce global workqueues
  smb: smbdirect: prepare use of dedicated workqueues for different steps
  smb: smbdirect: remove unused smbdirect_connection_mr_io_recovery_work()
  smb: smbdirect: wrap rdma_disconnect() in rdma_[un]lock_handler()
  smb: server: make use of smbdirect_netdev_rdma_capable_mode_type()
  smb: smbdirect: introduce smbdirect_netdev_rdma_capable_mode_type()
  smb: server: make use of smbdirect.ko
  smb: server: remove unused ksmbd_transport_ops.prepare()
  smb: server: make use of smbdirect_socket_{listen,accept}()
  smb: server: only use public smbdirect functions
  smb: server: make use of smbdirect_socket_create_accepting()/smbdirect_socket_release()
  smb: server: make use of smbdirect_{socket_init_accepting,connection_wait_for_connected}()
  smb: server: make use of smbdirect_connection_send_iter() and related functions
  ...
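As a rough illustration of the new layering, a server-side consumer of the
common module might drive the public API along these lines (a minimal sketch
only: the function names are taken from the shortlog above, but the exact
signatures, types, and error handling are assumptions, not shown in this diff):

/* Hypothetical ksmbd-style accept path on top of smbdirect.ko. */
static int example_accept_one(struct smbdirect_socket *listener)
{
	struct smbdirect_socket *sc;
	int rc;

	/* Take one incoming RDMA connection from the listening socket. */
	sc = smbdirect_socket_accept(listener);
	if (IS_ERR(sc))
		return PTR_ERR(sc);

	/* Wait until the SMBDirect negotiation has completed. */
	rc = smbdirect_connection_wait_for_connected(sc);
	if (rc) {
		smbdirect_socket_release(sc);
		return rc;
	}

	/* The socket is now ready for smbdirect_connection_send_iter(). */
	return 0;
}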
Diffstat (limited to 'fs')
 fs/smb/Kconfig                                 |    1
 fs/smb/client/Kconfig                          |    4
 fs/smb/client/cifs_debug.c                     |   67
 fs/smb/client/smb2pdu.c                        |    9
 fs/smb/client/smbdirect.c                      | 3116
 fs/smb/client/smbdirect.h                      |   19
 fs/smb/common/Makefile                         |    1
 fs/smb/common/smbdirect/Kconfig                |    9
 fs/smb/common/smbdirect/Makefile               |   18
 fs/smb/common/smbdirect/smbdirect.h            |   14
 fs/smb/common/smbdirect/smbdirect_accept.c     |  857
 fs/smb/common/smbdirect/smbdirect_connect.c    |  925
 fs/smb/common/smbdirect/smbdirect_connection.c | 2181
 fs/smb/common/smbdirect/smbdirect_debug.c      |   88
 fs/smb/common/smbdirect/smbdirect_devices.c    |  277
 fs/smb/common/smbdirect/smbdirect_internal.h   |  141
 fs/smb/common/smbdirect/smbdirect_listen.c     |  308
 fs/smb/common/smbdirect/smbdirect_main.c       |  121
 fs/smb/common/smbdirect/smbdirect_mr.c         |  493
 fs/smb/common/smbdirect/smbdirect_pdu.h        |    4
 fs/smb/common/smbdirect/smbdirect_public.h     |  148
 fs/smb/common/smbdirect/smbdirect_rw.c         |  255
 fs/smb/common/smbdirect/smbdirect_socket.c     |  743
 fs/smb/common/smbdirect/smbdirect_socket.h     |  183
 fs/smb/server/Kconfig                          |    5
 fs/smb/server/connection.c                     |    5
 fs/smb/server/connection.h                     |    1
 fs/smb/server/smb2pdu.c                        |    1
 fs/smb/server/transport_rdma.c                 | 2956
 fs/smb/server/transport_rdma.h                 |    4
 30 files changed, 7125 insertions(+), 5829 deletions(-)
diff --git a/fs/smb/Kconfig b/fs/smb/Kconfig
index 85f7ad5fbc5e..b4b2cfdc2a6b 100644
--- a/fs/smb/Kconfig
+++ b/fs/smb/Kconfig
@@ -4,6 +4,7 @@
source "fs/smb/client/Kconfig"
source "fs/smb/server/Kconfig"
+source "fs/smb/common/smbdirect/Kconfig"
config SMBFS
tristate
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index d112da38c881..63831242fddf 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -180,7 +180,9 @@ if CIFS
config CIFS_SMB_DIRECT
bool "SMB Direct support"
- depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
+ depends on CIFS && INFINIBAND && INFINIBAND_ADDR_TRANS
+ depends on CIFS=m || INFINIBAND=y
+ select SMB_COMMON_SMBDIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 217444e3e6d0..0691d2a3e04b 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -23,7 +23,6 @@
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
-#include "../common/smbdirect/smbdirect_pdu.h"
#endif
#include "cifs_swn.h"
#include "cached_dir.h"
@@ -452,11 +451,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-#ifdef CONFIG_CIFS_SMB_DIRECT
- struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
-#endif
-
/* channel info will be printed as a part of sessions below */
if (SERVER_IS_CHAN(server))
continue;
@@ -471,66 +465,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nClientGUID: %pUL", server->client_guid);
spin_unlock(&server->srv_lock);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (!server->rdma)
- goto skip_rdma;
-
- if (!server->smbd_conn) {
- seq_printf(m, "\nSMBDirect transport not available");
- goto skip_rdma;
- }
- sc = &server->smbd_conn->socket;
- sp = &sc->parameters;
-
- seq_printf(m, "\nSMBDirect protocol version: 0x%x "
- "transport status: %s (%u)",
- SMBDIRECT_V1,
- smbdirect_socket_status_string(sc->status),
- sc->status);
- seq_printf(m, "\nConn receive_credit_max: %u "
- "send_credit_target: %u max_send_size: %u",
- sp->recv_credit_max,
- sp->send_credit_target,
- sp->max_send_size);
- seq_printf(m, "\nConn max_fragmented_recv_size: %u "
- "max_fragmented_send_size: %u max_receive_size:%u",
- sp->max_fragmented_recv_size,
- sp->max_fragmented_send_size,
- sp->max_recv_size);
- seq_printf(m, "\nConn keep_alive_interval: %u "
- "max_readwrite_size: %u rdma_readwrite_threshold: %u",
- sp->keepalive_interval_msec * 1000,
- sp->max_read_write_size,
- server->rdma_readwrite_threshold);
- seq_printf(m, "\nDebug count_get_receive_buffer: %llu "
- "count_put_receive_buffer: %llu count_send_empty: %llu",
- sc->statistics.get_receive_buffer,
- sc->statistics.put_receive_buffer,
- sc->statistics.send_empty);
- seq_printf(m, "\nRead Queue "
- "count_enqueue_reassembly_queue: %llu "
- "count_dequeue_reassembly_queue: %llu "
- "reassembly_data_length: %u "
- "reassembly_queue_length: %u",
- sc->statistics.enqueue_reassembly_queue,
- sc->statistics.dequeue_reassembly_queue,
- sc->recv_io.reassembly.data_length,
- sc->recv_io.reassembly.queue_length);
- seq_printf(m, "\nCurrent Credits send_credits: %u "
- "receive_credits: %u receive_credit_target: %u",
- atomic_read(&sc->send_io.credits.count),
- atomic_read(&sc->recv_io.credits.count),
- sc->recv_io.credits.target);
- seq_printf(m, "\nPending send_pending: %u ",
- atomic_read(&sc->send_io.pending.count));
- seq_printf(m, "\nMR responder_resources: %u "
- "max_frmr_depth: %u mr_type: 0x%x",
- sp->responder_resources,
- sp->max_frmr_depth,
- sc->mr_io.type);
- seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u",
- atomic_read(&sc->mr_io.ready.count),
- atomic_read(&sc->mr_io.used.count));
-skip_rdma:
+ smbd_debug_proc_show(server, m);
#endif
seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x",
server->credits,
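The hunk above collapses the inline /proc dump into a single helper. A sketch
of how such a helper can wrap the removed statistics output (the real
smbd_debug_proc_show() lives in fs/smb/client/smbdirect.c and is not part of
this hunk; treat this as an outline, not the actual implementation):

/* Sketch: dump SMBDirect state for /proc/fs/cifs/DebugData. */
void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
{
	struct smbdirect_socket *sc;

	if (!server->rdma)
		return;

	if (!server->smbd_conn) {
		seq_printf(m, "\nSMBDirect transport not available");
		return;
	}

	sc = &server->smbd_conn->socket;
	seq_printf(m, "\nSMBDirect protocol version: 0x%x transport status: %s (%u)",
		   SMBDIRECT_V1,
		   smbdirect_socket_status_string(sc->status),
		   sc->status);
	/* ... remaining seq_printf() lines as removed above ... */
}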
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 59d7418cc480..957aca2222b5 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -36,7 +36,6 @@
#include "../common/smb2status.h"
#include "smb2glob.h"
#include "cifs_spnego.h"
-#include "../common/smbdirect/smbdirect.h"
#include "smbdirect.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -4554,9 +4553,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->ReadChannelInfoLength =
cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
- v1->offset = cpu_to_le64(rdata->mr->mr->iova);
- v1->token = cpu_to_le32(rdata->mr->mr->rkey);
- v1->length = cpu_to_le32(rdata->mr->mr->length);
+ smbd_mr_fill_buffer_descriptor(rdata->mr, v1);
*total_len += sizeof(*v1) - 1;
}
@@ -5155,9 +5152,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
req->WriteChannelInfoLength =
cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
- v1->offset = cpu_to_le64(wdata->mr->mr->iova);
- v1->token = cpu_to_le32(wdata->mr->mr->rkey);
- v1->length = cpu_to_le32(wdata->mr->mr->length);
+ smbd_mr_fill_buffer_descriptor(wdata->mr, v1);
rqst.rq_iov[0].iov_len += sizeof(*v1);
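Both hunks replace the same open-coded triplet with one helper, so the new
function presumably reduces to the removed lines (a sketch inferred from this
diff; the real definition is introduced elsewhere in the series):

/* Sketch: fill an [MS-SMBD] buffer descriptor v1 from a registered MR. */
static inline void
smbd_mr_fill_buffer_descriptor(struct smbd_mr *mr,
			       struct smbdirect_buffer_descriptor_v1 *v1)
{
	v1->offset = cpu_to_le64(mr->mr->iova);
	v1->token = cpu_to_le32(mr->mr->rkey);
	v1->length = cpu_to_le32(mr->mr->length);
}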
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 461658105013..9e67adcdc7d3 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -4,60 +4,12 @@
*
* Author(s): Long Li <longli@microsoft.com>
*/
-#include <linux/module.h>
-#include <linux/highmem.h>
-#include <linux/folio_queue.h>
-#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smbd_disconnect_rdma_connection(__sc)
-#include "../common/smbdirect/smbdirect_pdu.h"
+
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"
-
-const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
-{
- struct smbdirect_socket *sc = &conn->socket;
-
- return &sc->parameters;
-}
-
-static struct smbdirect_recv_io *get_receive_buffer(
- struct smbdirect_socket *sc);
-static void put_receive_buffer(
- struct smbdirect_socket *sc,
- struct smbdirect_recv_io *response);
-static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
-static void destroy_receive_buffers(struct smbdirect_socket *sc);
-
-static void enqueue_reassembly(
- struct smbdirect_socket *sc,
- struct smbdirect_recv_io *response, int data_length);
-static struct smbdirect_recv_io *_get_first_reassembly(
- struct smbdirect_socket *sc);
-
-static int smbd_post_send(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- struct smbdirect_send_io *request);
-
-static int smbd_post_recv(
- struct smbdirect_socket *sc,
- struct smbdirect_recv_io *response);
-
-static int smbd_post_send_empty(struct smbdirect_socket *sc);
-
-static void destroy_mr_list(struct smbdirect_socket *sc);
-static int allocate_mr_list(struct smbdirect_socket *sc);
-
-struct smb_extract_to_rdma {
- struct ib_sge *sge;
- unsigned int nr_sge;
- unsigned int max_sge;
- struct ib_device *device;
- u32 local_dma_lkey;
- enum dma_data_direction direction;
-};
-static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
- struct smb_extract_to_rdma *rdma);
+#include "../common/smbdirect/smbdirect_public.h"
/* Port numbers for SMBD transport */
#define SMB_PORT 445
@@ -72,21 +24,12 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
/* The timeout to wait for a keepalive message from peer in seconds */
#define KEEPALIVE_RECV_TIMEOUT 5
-/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
-#define SMBD_MIN_RECEIVE_SIZE 128
-#define SMBD_MIN_FRAGMENTED_SIZE 131072
-
/*
* Default maximum number of RDMA read/write outstanding on this connection
* This value is possibly decreased during QP creation, depending on hardware limits
*/
#define SMBD_CM_RESPONDER_RESOURCES 32
-/* Maximum number of retries on data transfer operations */
-#define SMBD_CM_RETRY 6
-/* No need to retry on Receiver Not Ready since SMBD manages credits */
-#define SMBD_CM_RNR_RETRY 0
-
/*
* User configurable initial values per SMBD transport connection
* as defined in [MS-SMBD] 3.1.1.1
@@ -162,6 +105,43 @@ module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
"Logging level for SMBD transport, 0 (default): error, 1: info");
+static bool smbd_logging_needed(struct smbdirect_socket *sc,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls)
+{
+#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x)
+ BUILD_BUG_SAME(ERR);
+ BUILD_BUG_SAME(INFO);
+#undef BUILD_BUG_SAME
+#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x)
+ BUILD_BUG_SAME(LOG_OUTGOING);
+ BUILD_BUG_SAME(LOG_INCOMING);
+ BUILD_BUG_SAME(LOG_READ);
+ BUILD_BUG_SAME(LOG_WRITE);
+ BUILD_BUG_SAME(LOG_RDMA_SEND);
+ BUILD_BUG_SAME(LOG_RDMA_RECV);
+ BUILD_BUG_SAME(LOG_KEEP_ALIVE);
+ BUILD_BUG_SAME(LOG_RDMA_EVENT);
+ BUILD_BUG_SAME(LOG_RDMA_MR);
+#undef BUILD_BUG_SAME
+
+ if (lvl <= smbd_logging_level || cls & smbd_logging_class)
+ return true;
+ return false;
+}
+
+static void smbd_logging_vaprintf(struct smbdirect_socket *sc,
+ const char *func,
+ unsigned int line,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls,
+ struct va_format *vaf)
+{
+ cifs_dbg(VFS, "%s:%u %pV", func, line, vaf);
+}
+
#define log_rdma(level, class, fmt, args...) \
do { \
if (level <= smbd_logging_level || class & smbd_logging_class) \
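The two callbacks above reproduce the filter used by the log_rdma() macro, so
messages generated inside the common smbdirect code end up in cifs_dbg() with
the same level/class gating. A call site inside smbdirect.ko might look
roughly like this (purely illustrative; the registration interface and field
names are assumptions, not visible in this diff):

/* Hypothetical dispatch inside the common code. */
static void smbdirect_log(struct smbdirect_socket *sc, unsigned int lvl,
			  unsigned int cls, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (!sc->logging.needed_fn(sc, sc->logging.private_ptr, lvl, cls))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	sc->logging.vaprintf_fn(sc, __func__, __LINE__,
				sc->logging.private_ptr, lvl, cls, &vaf);
	va_end(args);
}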
@@ -185,1703 +165,34 @@ do { \
#define log_rdma_mr(level, fmt, args...) \
log_rdma(level, LOG_RDMA_MR, fmt, ##args)
-static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
-{
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- */
- wake_up_all(&sc->status_wait);
- wake_up_all(&sc->send_io.lcredits.wait_queue);
- wake_up_all(&sc->send_io.credits.wait_queue);
- wake_up_all(&sc->send_io.pending.dec_wait_queue);
- wake_up_all(&sc->send_io.pending.zero_wait_queue);
- wake_up_all(&sc->recv_io.reassembly.wait_queue);
- wake_up_all(&sc->mr_io.ready.wait_queue);
- wake_up_all(&sc->mr_io.cleanup.wait_queue);
-}
-
-static void smbd_disconnect_rdma_work(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, disconnect_work);
-
- if (sc->first_error == 0)
- sc->first_error = -ECONNABORTED;
-
- /*
- * make sure this and other work is not queued again
- * but here we don't block and avoid
- * disable[_delayed]_work_sync()
- */
- disable_work(&sc->disconnect_work);
- disable_work(&sc->recv_io.posted.refill_work);
- disable_work(&sc->mr_io.recovery_work);
- disable_work(&sc->idle.immediate_work);
- disable_delayed_work(&sc->idle.timer_work);
-
- switch (sc->status) {
- case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
- case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
- case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
- case SMBDIRECT_SOCKET_CONNECTED:
- case SMBDIRECT_SOCKET_ERROR:
- sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
- rdma_disconnect(sc->rdma.cm_id);
- break;
-
- case SMBDIRECT_SOCKET_CREATED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
- /*
- * rdma_connect() never reached
- * RDMA_CM_EVENT_ESTABLISHED
- */
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- break;
-
- case SMBDIRECT_SOCKET_DISCONNECTING:
- case SMBDIRECT_SOCKET_DISCONNECTED:
- case SMBDIRECT_SOCKET_DESTROYED:
- break;
- }
-
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- */
- smbd_disconnect_wake_up_all(sc);
-}
-
-static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
-{
- if (sc->first_error == 0)
- sc->first_error = -ECONNABORTED;
-
- /*
- * make sure other work (than disconnect_work) is
- * not queued again but here we don't block and avoid
- * disable[_delayed]_work_sync()
- */
- disable_work(&sc->recv_io.posted.refill_work);
- disable_work(&sc->mr_io.recovery_work);
- disable_work(&sc->idle.immediate_work);
- disable_delayed_work(&sc->idle.timer_work);
-
- switch (sc->status) {
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
- case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
- case SMBDIRECT_SOCKET_ERROR:
- case SMBDIRECT_SOCKET_DISCONNECTING:
- case SMBDIRECT_SOCKET_DISCONNECTED:
- case SMBDIRECT_SOCKET_DESTROYED:
- /*
- * Keep the current error status
- */
- break;
-
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
- case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_CREATED:
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- break;
-
- case SMBDIRECT_SOCKET_CONNECTED:
- sc->status = SMBDIRECT_SOCKET_ERROR;
- break;
- }
-
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- */
- smbd_disconnect_wake_up_all(sc);
-
- queue_work(sc->workqueue, &sc->disconnect_work);
-}
-
-/* Upcall from RDMA CM */
-static int smbd_conn_upcall(
- struct rdma_cm_id *id, struct rdma_cm_event *event)
-{
- struct smbdirect_socket *sc = id->context;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- const char *event_name = rdma_event_msg(event->event);
- u8 peer_initiator_depth;
- u8 peer_responder_resources;
-
- log_rdma_event(INFO, "event=%s status=%d\n",
- event_name, event->status);
-
- switch (event->event) {
- case RDMA_CM_EVENT_ADDR_RESOLVED:
- if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING))
- break;
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
- wake_up(&sc->status_wait);
- break;
-
- case RDMA_CM_EVENT_ROUTE_RESOLVED:
- if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING))
- break;
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
- wake_up(&sc->status_wait);
- break;
-
- case RDMA_CM_EVENT_ADDR_ERROR:
- log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
- smbd_disconnect_rdma_work(&sc->disconnect_work);
- break;
-
- case RDMA_CM_EVENT_ROUTE_ERROR:
- log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
- smbd_disconnect_rdma_work(&sc->disconnect_work);
- break;
-
- case RDMA_CM_EVENT_ESTABLISHED:
- log_rdma_event(INFO, "connected event=%s\n", event_name);
-
- /*
- * Here we work around an inconsistency between
- * iWarp and other devices (at least rxe and irdma using RoCEv2)
- */
- if (rdma_protocol_iwarp(id->device, id->port_num)) {
- /*
- * iWarp devices report the peer's values
- * with the perspective of the peer here.
- * Tested with siw and irdma (in iwarp mode)
- * We need to change to our perspective here,
- * so we need to switch the values.
- */
- peer_initiator_depth = event->param.conn.responder_resources;
- peer_responder_resources = event->param.conn.initiator_depth;
- } else {
- /*
- * Non iWarp devices report the peer's values
- * already changed to our perspective here.
- * Tested with rxe and irdma (in roce mode).
- */
- peer_initiator_depth = event->param.conn.initiator_depth;
- peer_responder_resources = event->param.conn.responder_resources;
- }
- if (rdma_protocol_iwarp(id->device, id->port_num) &&
- event->param.conn.private_data_len == 8) {
- /*
- * Legacy clients with only iWarp MPA v1 support
- * need a private blob in order to negotiate
- * the IRD/ORD values.
- */
- const __be32 *ird_ord_hdr = event->param.conn.private_data;
- u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
- u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
-
- /*
- * cifs.ko sends the legacy IRD/ORD negotiation
- * even if iWarp MPA v2 was used.
- *
- * Here we check that the values match and only
- * mark the client as legacy if they don't match.
- */
- if ((u32)event->param.conn.initiator_depth != ird32 ||
- (u32)event->param.conn.responder_resources != ord32) {
- /*
- * There are broken clients (old cifs.ko)
- * using little endian and also
- * struct rdma_conn_param only uses u8
- * for initiator_depth and responder_resources,
- * so we truncate the value to U8_MAX.
- *
- * smb_direct_accept_client() will then
- * do the real negotiation in order to
- * select the minimum between client and
- * server.
- */
- ird32 = min_t(u32, ird32, U8_MAX);
- ord32 = min_t(u32, ord32, U8_MAX);
-
- sc->rdma.legacy_iwarp = true;
- peer_initiator_depth = (u8)ird32;
- peer_responder_resources = (u8)ord32;
- }
- }
-
- /*
- * negotiate the values by using the minimum
- * between client and server if the client provided
- * non-zero values.
- */
- if (peer_initiator_depth != 0)
- sp->initiator_depth =
- min_t(u8, sp->initiator_depth,
- peer_initiator_depth);
- if (peer_responder_resources != 0)
- sp->responder_resources =
- min_t(u8, sp->responder_resources,
- peer_responder_resources);
-
- if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
- break;
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
- wake_up(&sc->status_wait);
- break;
-
- case RDMA_CM_EVENT_CONNECT_ERROR:
- case RDMA_CM_EVENT_UNREACHABLE:
- case RDMA_CM_EVENT_REJECTED:
- log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
- smbd_disconnect_rdma_work(&sc->disconnect_work);
- break;
-
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- case RDMA_CM_EVENT_DISCONNECTED:
- /* This happens when we fail the negotiation */
- if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
- log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
- }
-
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- smbd_disconnect_rdma_work(&sc->disconnect_work);
- break;
-
- default:
- log_rdma_event(ERR, "unexpected event=%s status=%d\n",
- event_name, event->status);
- break;
- }
-
- return 0;
-}
-
-/* Upcall from RDMA QP */
-static void
-smbd_qp_async_error_upcall(struct ib_event *event, void *context)
-{
- struct smbdirect_socket *sc = context;
-
- log_rdma_event(ERR, "%s on device %s socket %p\n",
- ib_event_msg(event->event), event->device->name, sc);
-
- switch (event->event) {
- case IB_EVENT_CQ_ERR:
- case IB_EVENT_QP_FATAL:
- smbd_disconnect_rdma_connection(sc);
- break;
-
- default:
- break;
- }
-}
-
-static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
-{
- return (void *)request->packet;
-}
-
-static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
-{
- return (void *)response->packet;
-}
-
-static struct smbdirect_send_io *smbd_alloc_send_io(struct smbdirect_socket *sc)
-{
- struct smbdirect_send_io *msg;
-
- msg = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
- if (!msg)
- return ERR_PTR(-ENOMEM);
- msg->socket = sc;
- INIT_LIST_HEAD(&msg->sibling_list);
- msg->num_sge = 0;
-
- return msg;
-}
-
-static void smbd_free_send_io(struct smbdirect_send_io *msg)
-{
- struct smbdirect_socket *sc = msg->socket;
- size_t i;
-
- /*
- * The list needs to be empty!
- * The caller should take care of it.
- */
- WARN_ON_ONCE(!list_empty(&msg->sibling_list));
-
- /*
- * Note we call ib_dma_unmap_page(), even if some sges are mapped using
- * ib_dma_map_single().
- *
- * The difference between _single() and _page() only matters for the
- * ib_dma_map_*() case.
- *
- * For the ib_dma_unmap_*() case it does not matter as both take the
- * dma_addr_t and dma_unmap_single_attrs() is just an alias to
- * dma_unmap_page_attrs().
- */
- for (i = 0; i < msg->num_sge; i++)
- ib_dma_unmap_page(sc->ib.dev,
- msg->sge[i].addr,
- msg->sge[i].length,
- DMA_TO_DEVICE);
-
- mempool_free(msg, sc->send_io.mem.pool);
-}
-
-/* Called when a RDMA send is done */
-static void send_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_send_io *request =
- container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
- struct smbdirect_socket *sc = request->socket;
- struct smbdirect_send_io *sibling, *next;
- int lcredits = 0;
-
- log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
- request, ib_wc_status_msg(wc->status));
-
- if (unlikely(!(request->wr.send_flags & IB_SEND_SIGNALED))) {
- /*
- * This happens when smbdirect_send_io is a sibling
- * before the final message, it is signaled on
- * error anyway, so we need to skip
- * smbdirect_connection_free_send_io here,
- * otherwise it will destroy the memory
- * of the siblings too, which will cause
- * use after free problems for the others
- * triggered from ib_drain_qp().
- */
- if (wc->status != IB_WC_SUCCESS)
- goto skip_free;
-
- /*
- * This should not happen!
- * But we better just close the
- * connection...
- */
- log_rdma_send(ERR,
- "unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status, wc->opcode);
- smbd_disconnect_rdma_connection(sc);
- return;
- }
-
- /*
- * Free possible siblings and then the main send_io
- */
- list_for_each_entry_safe(sibling, next, &request->sibling_list, sibling_list) {
- list_del_init(&sibling->sibling_list);
- smbd_free_send_io(sibling);
- lcredits += 1;
- }
- /* Note this frees wc->wr_cqe, but not wc */
- smbd_free_send_io(request);
- lcredits += 1;
-
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
-skip_free:
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->opcode);
- smbd_disconnect_rdma_connection(sc);
- return;
- }
-
- atomic_add(lcredits, &sc->send_io.lcredits.count);
- wake_up(&sc->send_io.lcredits.wait_queue);
-
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
-
- wake_up(&sc->send_io.pending.dec_wait_queue);
-}
-
-static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
-{
- log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
- resp->min_version, resp->max_version,
- resp->negotiated_version, resp->credits_requested,
- resp->credits_granted, resp->status,
- resp->max_readwrite_size, resp->preferred_send_size,
- resp->max_receive_size, resp->max_fragmented_size);
-}
-
-/*
- * Process a negotiation response message, according to [MS-SMBD]3.1.5.7
- * response, packet_length: the negotiation response message
- * return value: true if negotiation is a success, false if failed
- */
-static bool process_negotiation_response(
- struct smbdirect_recv_io *response, int packet_length)
-{
- struct smbdirect_socket *sc = response->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
-
- if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
- log_rdma_event(ERR,
- "error: packet_length=%d\n", packet_length);
- return false;
- }
-
- if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
- log_rdma_event(ERR, "error: negotiated_version=%x\n",
- le16_to_cpu(packet->negotiated_version));
- return false;
- }
-
- if (packet->credits_requested == 0) {
- log_rdma_event(ERR, "error: credits_requested==0\n");
- return false;
- }
- sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
- sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
-
- if (packet->credits_granted == 0) {
- log_rdma_event(ERR, "error: credits_granted==0\n");
- return false;
- }
- atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
- atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
-
- if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
- log_rdma_event(ERR, "error: preferred_send_size=%d\n",
- le32_to_cpu(packet->preferred_send_size));
- return false;
- }
- sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
-
- if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
- log_rdma_event(ERR, "error: max_receive_size=%d\n",
- le32_to_cpu(packet->max_receive_size));
- return false;
- }
- sp->max_send_size = min_t(u32, sp->max_send_size,
- le32_to_cpu(packet->max_receive_size));
-
- if (le32_to_cpu(packet->max_fragmented_size) <
- SMBD_MIN_FRAGMENTED_SIZE) {
- log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
- le32_to_cpu(packet->max_fragmented_size));
- return false;
- }
- sp->max_fragmented_send_size =
- le32_to_cpu(packet->max_fragmented_size);
-
-
- sp->max_read_write_size = min_t(u32,
- le32_to_cpu(packet->max_readwrite_size),
- sp->max_frmr_depth * PAGE_SIZE);
- sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
-
- atomic_set(&sc->send_io.bcredits.count, 1);
- sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
- return true;
-}
-
-static void smbd_post_send_credits(struct work_struct *work)
-{
- int rc;
- struct smbdirect_recv_io *response;
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
- int posted = 0;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- return;
- }
-
- if (sc->recv_io.credits.target >
- atomic_read(&sc->recv_io.credits.count)) {
- while (true) {
- response = get_receive_buffer(sc);
- if (!response)
- break;
-
- response->first_segment = false;
- rc = smbd_post_recv(sc, response);
- if (rc) {
- log_rdma_recv(ERR,
- "post_recv failed rc=%d\n", rc);
- put_receive_buffer(sc, response);
- break;
- }
-
- atomic_inc(&sc->recv_io.posted.count);
- posted += 1;
- }
- }
-
- atomic_add(posted, &sc->recv_io.credits.available);
-
- /*
- * If the last send credit is waiting for credits
- * it can grant, we need to wake it up
- */
- if (posted &&
- atomic_read(&sc->send_io.bcredits.count) == 0 &&
- atomic_read(&sc->send_io.credits.count) == 0)
- wake_up(&sc->send_io.credits.wait_queue);
-
- /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
- if (atomic_read(&sc->recv_io.credits.count) <
- sc->recv_io.credits.target - 1) {
- log_keep_alive(INFO, "schedule send of an empty message\n");
- queue_work(sc->workqueue, &sc->idle.immediate_work);
- }
-}
-
-/* Called from softirq, when recv is done */
-static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_data_transfer *data_transfer;
- struct smbdirect_recv_io *response =
- container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
- struct smbdirect_socket *sc = response->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int current_recv_credits;
- u16 old_recv_credit_target;
- u32 data_offset = 0;
- u32 data_length = 0;
- u32 remaining_data_length = 0;
- bool negotiate_done = false;
-
- log_rdma_recv(INFO,
- "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
- response, sc->recv_io.expected,
- ib_wc_status_msg(wc->status), wc->opcode,
- wc->byte_len, wc->pkey_index);
-
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->opcode);
- goto error;
- }
-
- ib_dma_sync_single_for_cpu(
- wc->qp->device,
- response->sge.addr,
- response->sge.length,
- DMA_FROM_DEVICE);
-
- /*
- * Reset timer to the keepalive interval in
- * order to trigger our next keepalive message.
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
-
- switch (sc->recv_io.expected) {
- /* SMBD negotiation response */
- case SMBDIRECT_EXPECT_NEGOTIATE_REP:
- dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
- sc->recv_io.reassembly.full_packet_received = true;
- negotiate_done =
- process_negotiation_response(response, wc->byte_len);
- put_receive_buffer(sc, response);
- if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_RUNNING))
- negotiate_done = false;
- if (!negotiate_done) {
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
- smbd_disconnect_rdma_connection(sc);
- } else {
- sc->status = SMBDIRECT_SOCKET_CONNECTED;
- wake_up(&sc->status_wait);
- }
-
- return;
-
- /* SMBD data transfer packet */
- case SMBDIRECT_EXPECT_DATA_TRANSFER:
- data_transfer = smbdirect_recv_io_payload(response);
-
- if (wc->byte_len <
- offsetof(struct smbdirect_data_transfer, padding))
- goto error;
-
- remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
- data_offset = le32_to_cpu(data_transfer->data_offset);
- data_length = le32_to_cpu(data_transfer->data_length);
- if (wc->byte_len < data_offset ||
- (u64)wc->byte_len < (u64)data_offset + data_length)
- goto error;
-
- if (remaining_data_length > sp->max_fragmented_recv_size ||
- data_length > sp->max_fragmented_recv_size ||
- (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
- goto error;
-
- if (data_length) {
- if (sc->recv_io.reassembly.full_packet_received)
- response->first_segment = true;
-
- if (le32_to_cpu(data_transfer->remaining_data_length))
- sc->recv_io.reassembly.full_packet_received = false;
- else
- sc->recv_io.reassembly.full_packet_received = true;
- }
-
- atomic_dec(&sc->recv_io.posted.count);
- current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
-
- old_recv_credit_target = sc->recv_io.credits.target;
- sc->recv_io.credits.target =
- le16_to_cpu(data_transfer->credits_requested);
- sc->recv_io.credits.target =
- min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
- sc->recv_io.credits.target =
- max_t(u16, sc->recv_io.credits.target, 1);
- if (le16_to_cpu(data_transfer->credits_granted)) {
- atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &sc->send_io.credits.count);
- /*
- * We have new send credits granted from remote peer
- * If any sender is waiting for credits, unblock it
- */
- wake_up(&sc->send_io.credits.wait_queue);
- }
-
- log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
- le16_to_cpu(data_transfer->flags),
- le32_to_cpu(data_transfer->data_offset),
- le32_to_cpu(data_transfer->data_length),
- le32_to_cpu(data_transfer->remaining_data_length));
-
- /* Send an immediate response right away if requested */
- if (le16_to_cpu(data_transfer->flags) &
- SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
- log_keep_alive(INFO, "schedule send of immediate response\n");
- queue_work(sc->workqueue, &sc->idle.immediate_work);
- }
-
- /*
- * If this is a packet with data payload, place the data in the
- * reassembly queue and wake up the reading thread
- */
- if (data_length) {
- if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
- sc->recv_io.credits.target > old_recv_credit_target)
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
-
- enqueue_reassembly(sc, response, data_length);
- wake_up(&sc->recv_io.reassembly.wait_queue);
- } else
- put_receive_buffer(sc, response);
-
- return;
-
- case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
- /* Only server... */
- break;
- }
-
- /*
- * This is an internal error!
- */
- log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
- WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
-error:
- put_receive_buffer(sc, response);
- smbd_disconnect_rdma_connection(sc);
-}
-
-static struct rdma_cm_id *smbd_create_id(
- struct smbdirect_socket *sc,
- struct sockaddr *dstaddr, int port)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct rdma_cm_id *id;
- u8 node_type = RDMA_NODE_UNSPECIFIED;
- int rc;
- __be16 *sport;
-
- id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
- RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(id)) {
- rc = PTR_ERR(id);
- log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
- return id;
- }
-
- switch (port) {
- case SMBD_PORT:
- /*
- * only allow iWarp devices
- * for port 5445.
- */
- node_type = RDMA_NODE_RNIC;
- break;
- case SMB_PORT:
- /*
- * only allow InfiniBand, RoCEv1 or RoCEv2
- * devices for port 445.
- *
- * (Basically don't allow iWarp devices)
- */
- node_type = RDMA_NODE_IB_CA;
- break;
- }
- rc = rdma_restrict_node_type(id, node_type);
- if (rc) {
- log_rdma_event(ERR, "rdma_restrict_node_type(%u) failed %i\n",
- node_type, rc);
- goto out;
- }
-
- if (dstaddr->sa_family == AF_INET6)
- sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
- else
- sport = &((struct sockaddr_in *)dstaddr)->sin_port;
-
- *sport = htons(port);
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
- rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
- sp->resolve_addr_timeout_msec);
- if (rc) {
- log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
- goto out;
- }
- rc = wait_event_interruptible_timeout(
- sc->status_wait,
- sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
- msecs_to_jiffies(sp->resolve_addr_timeout_msec));
- /* e.g. if interrupted returns -ERESTARTSYS */
- if (rc < 0) {
- log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
- goto out;
- }
- if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
- rc = -ETIMEDOUT;
- log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
- goto out;
- }
- if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
- rc = -EHOSTUNREACH;
- log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
- goto out;
- }
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
- rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
- if (rc) {
- log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
- goto out;
- }
- rc = wait_event_interruptible_timeout(
- sc->status_wait,
- sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
- msecs_to_jiffies(sp->resolve_route_timeout_msec));
- /* e.g. if interrupted returns -ERESTARTSYS */
- if (rc < 0) {
- log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
- goto out;
- }
- if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
- rc = -ETIMEDOUT;
- log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
- goto out;
- }
- if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
- rc = -ENETUNREACH;
- log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
- goto out;
- }
-
- return id;
-
-out:
- rdma_destroy_id(id);
- return ERR_PTR(rc);
-}
-
-/*
- * Test if FRWR (Fast Registration Work Requests) is supported on the device
- * This implementation requires FRWR on RDMA read/write
- * return value: true if it is supported
- */
-static bool frwr_is_supported(struct ib_device_attr *attrs)
-{
- if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
- return false;
- if (attrs->max_fast_reg_page_list_len == 0)
- return false;
- return true;
-}
-
-static int smbd_ia_open(
- struct smbdirect_socket *sc,
- struct sockaddr *dstaddr, int port)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int rc;
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
-
- sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
- if (IS_ERR(sc->rdma.cm_id)) {
- rc = PTR_ERR(sc->rdma.cm_id);
- goto out1;
- }
- sc->ib.dev = sc->rdma.cm_id->device;
-
- if (!frwr_is_supported(&sc->ib.dev->attrs)) {
- log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
- log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
- sc->ib.dev->attrs.device_cap_flags,
- sc->ib.dev->attrs.max_fast_reg_page_list_len);
- rc = -EPROTONOSUPPORT;
- goto out2;
- }
- sp->max_frmr_depth = min_t(u32,
- sp->max_frmr_depth,
- sc->ib.dev->attrs.max_fast_reg_page_list_len);
- sc->mr_io.type = IB_MR_TYPE_MEM_REG;
- if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
- sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
-
- return 0;
-
-out2:
- rdma_destroy_id(sc->rdma.cm_id);
- sc->rdma.cm_id = NULL;
-
-out1:
- return rc;
-}
-
-/*
- * Send a negotiation request message to the peer
- * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
- * After negotiation, the transport is connected and ready for
- * carrying upper layer SMB payload
- */
-static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int rc;
- struct smbdirect_send_io *request;
- struct smbdirect_negotiate_req *packet;
-
- request = smbd_alloc_send_io(sc);
- if (IS_ERR(request))
- return PTR_ERR(request);
-
- packet = smbdirect_send_io_payload(request);
- packet->min_version = cpu_to_le16(SMBDIRECT_V1);
- packet->max_version = cpu_to_le16(SMBDIRECT_V1);
- packet->reserved = 0;
- packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
- packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
- packet->max_fragmented_size =
- cpu_to_le32(sp->max_fragmented_recv_size);
-
- request->sge[0].addr = ib_dma_map_single(
- sc->ib.dev, (void *)packet,
- sizeof(*packet), DMA_TO_DEVICE);
- if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
- rc = -EIO;
- goto dma_mapping_failed;
- }
- request->num_sge = 1;
-
- request->sge[0].length = sizeof(*packet);
- request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
- rc = smbd_post_send(sc, NULL, request);
- if (!rc)
- return 0;
-
- if (rc == -EAGAIN)
- rc = -EIO;
-
-dma_mapping_failed:
- smbd_free_send_io(request);
- return rc;
-}
-
-/*
- * Extend the credits to remote peer
- * This implements [MS-SMBD] 3.1.5.9
- * The idea is that we should extend credits to the remote peer as quickly
- * as allowed, to maintain data flow. We allocate as many receive
- * buffers as possible, and extend the receive credits to the remote peer.
- * return value: the new credits being granted.
- */
-static int manage_credits_prior_sending(struct smbdirect_socket *sc)
-{
- int missing;
- int available;
- int new_credits;
-
- if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
- return 0;
-
- missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
- available = atomic_xchg(&sc->recv_io.credits.available, 0);
- new_credits = (u16)min3(U16_MAX, missing, available);
- if (new_credits <= 0) {
- /*
- * If credits are available but not granted,
- * we need to re-add them.
- */
- if (available)
- atomic_add(available, &sc->recv_io.credits.available);
- return 0;
- }
-
- if (new_credits < available) {
- /*
- * Re-add the remaining available credits.
- */
- available -= new_credits;
- atomic_add(available, &sc->recv_io.credits.available);
- }
-
- /*
- * Remember we granted the credits
- */
- atomic_add(new_credits, &sc->recv_io.credits.count);
- return new_credits;
-}
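To make the credit arithmetic above concrete, a worked example with
illustrative numbers (not taken from a trace):

/*
 * Example: recv_io.credits.target = 16, recv_io.credits.count = 4,
 * recv_io.credits.available = 20 freshly posted receive buffers.
 *
 *   missing     = 16 - 4                = 12
 *   available   = atomic_xchg(...)      = 20
 *   new_credits = min3(U16_MAX, 12, 20) = 12
 *
 * The 8 leftover available credits are re-added for a later send,
 * count grows to 16, and the peer is granted 12 new credits in the
 * next packet's credits_granted field.
 */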
-
-/*
- * Check if we need to send a KEEP_ALIVE message
- * The idle connection timer triggers a KEEP_ALIVE message when expires
- * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have peer send
- * back a response.
- * return value:
- * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
- * 0: otherwise
- */
-static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
-
- if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
- /*
- * Now use the keepalive timeout (instead of keepalive interval)
- * in order to wait for a response
- */
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_timeout_msec));
- return 1;
- }
- return 0;
-}
-
-static int smbd_ib_post_send(struct smbdirect_socket *sc,
- struct ib_send_wr *wr)
-{
- int ret;
-
- atomic_inc(&sc->send_io.pending.count);
- ret = ib_post_send(sc->ib.qp, wr, NULL);
- if (ret) {
- pr_err("failed to post send: %d\n", ret);
- smbd_disconnect_rdma_connection(sc);
- ret = -EAGAIN;
- }
- return ret;
-}
-
-/* Post the send request */
-static int smbd_post_send(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- struct smbdirect_send_io *request)
-{
- int i;
-
- for (i = 0; i < request->num_sge; i++) {
- log_rdma_send(INFO,
- "rdma_request sge[%d] addr=0x%llx length=%u\n",
- i, request->sge[i].addr, request->sge[i].length);
- ib_dma_sync_single_for_device(
- sc->ib.dev,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- }
-
- request->cqe.done = send_done;
- request->wr.next = NULL;
- request->wr.sg_list = request->sge;
- request->wr.num_sge = request->num_sge;
- request->wr.opcode = IB_WR_SEND;
-
- if (batch) {
- request->wr.wr_cqe = NULL;
- request->wr.send_flags = 0;
- if (!list_empty(&batch->msg_list)) {
- struct smbdirect_send_io *last;
-
- last = list_last_entry(&batch->msg_list,
- struct smbdirect_send_io,
- sibling_list);
- last->wr.next = &request->wr;
- }
- list_add_tail(&request->sibling_list, &batch->msg_list);
- batch->wr_cnt++;
- return 0;
- }
-
- request->wr.wr_cqe = &request->cqe;
- request->wr.send_flags = IB_SEND_SIGNALED;
- return smbd_ib_post_send(sc, &request->wr);
-}
-
-static void smbd_send_batch_init(struct smbdirect_send_batch *batch,
- bool need_invalidate_rkey,
- unsigned int remote_key)
-{
- INIT_LIST_HEAD(&batch->msg_list);
- batch->wr_cnt = 0;
- batch->need_invalidate_rkey = need_invalidate_rkey;
- batch->remote_key = remote_key;
- batch->credit = 0;
-}
-
-static int smbd_send_batch_flush(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- bool is_last)
-{
- struct smbdirect_send_io *first, *last;
- int ret = 0;
-
- if (list_empty(&batch->msg_list))
- goto release_credit;
-
- first = list_first_entry(&batch->msg_list,
- struct smbdirect_send_io,
- sibling_list);
- last = list_last_entry(&batch->msg_list,
- struct smbdirect_send_io,
- sibling_list);
-
- if (batch->need_invalidate_rkey) {
- first->wr.opcode = IB_WR_SEND_WITH_INV;
- first->wr.ex.invalidate_rkey = batch->remote_key;
- batch->need_invalidate_rkey = false;
- batch->remote_key = 0;
- }
-
- last->wr.send_flags = IB_SEND_SIGNALED;
- last->wr.wr_cqe = &last->cqe;
-
- /*
- * Remove last from batch->msg_list
- * and splice the rest of batch->msg_list
- * to last->sibling_list.
- *
- * batch->msg_list is a valid empty list
- * at the end.
- */
- list_del_init(&last->sibling_list);
- list_splice_tail_init(&batch->msg_list, &last->sibling_list);
- batch->wr_cnt = 0;
-
- ret = smbd_ib_post_send(sc, &first->wr);
- if (ret) {
- struct smbdirect_send_io *sibling, *next;
-
- list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
- list_del_init(&sibling->sibling_list);
- smbd_free_send_io(sibling);
- }
- smbd_free_send_io(last);
- }
-
-release_credit:
- if (is_last && !ret && batch->credit) {
- atomic_add(batch->credit, &sc->send_io.bcredits.count);
- batch->credit = 0;
- wake_up(&sc->send_io.bcredits.wait_queue);
- }
-
- return ret;
-}
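Taken together, the helpers above follow an init/post/flush pattern: sends
queued into a batch are chained via wr.next and left unsignaled until the
flush posts the whole chain with a single signaled completion. A condensed
sketch of the lifecycle (error handling elided):

/* Sketch: one completion for several chained sends. */
struct smbdirect_send_batch batch;

smbd_send_batch_init(&batch, false, 0);
smbd_post_send(sc, &batch, req1);          /* queued, unsignaled */
smbd_post_send(sc, &batch, req2);          /* chained via wr.next */
smbd_send_batch_flush(sc, &batch, true);   /* posts chain, signals last */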
-
-static int wait_for_credits(struct smbdirect_socket *sc,
- wait_queue_head_t *waitq, atomic_t *total_credits,
- int needed)
-{
- int ret;
-
- do {
- if (atomic_sub_return(needed, total_credits) >= 0)
- return 0;
-
- atomic_add(needed, total_credits);
- ret = wait_event_interruptible(*waitq,
- atomic_read(total_credits) >= needed ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return -ENOTCONN;
- else if (ret < 0)
- return ret;
- } while (true);
-}
-
-static int wait_for_send_bcredit(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch)
-{
- int ret;
-
- if (batch->credit)
- return 0;
-
- ret = wait_for_credits(sc,
- &sc->send_io.bcredits.wait_queue,
- &sc->send_io.bcredits.count,
- 1);
- if (ret)
- return ret;
-
- batch->credit = 1;
- return 0;
-}
-
-static int wait_for_send_lcredit(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch)
-{
- if (batch && (atomic_read(&sc->send_io.lcredits.count) <= 1)) {
- int ret;
-
- ret = smbd_send_batch_flush(sc, batch, false);
- if (ret)
- return ret;
- }
-
- return wait_for_credits(sc,
- &sc->send_io.lcredits.wait_queue,
- &sc->send_io.lcredits.count,
- 1);
-}
-
-static int wait_for_send_credits(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch)
-{
- if (batch &&
- (batch->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
- int ret;
-
- ret = smbd_send_batch_flush(sc, batch, false);
- if (ret)
- return ret;
- }
-
- return wait_for_credits(sc,
- &sc->send_io.credits.wait_queue,
- &sc->send_io.credits.count,
- 1);
-}
-
-static int smbd_post_send_iter(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- struct iov_iter *iter,
- int *_remaining_data_length)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int rc;
- int header_length;
- int data_length;
- struct smbdirect_send_io *request;
- struct smbdirect_data_transfer *packet;
- int new_credits = 0;
- struct smbdirect_send_batch _batch;
-
- if (!batch) {
- smbd_send_batch_init(&_batch, false, 0);
- batch = &_batch;
- }
-
- rc = wait_for_send_bcredit(sc, batch);
- if (rc) {
- log_outgoing(ERR, "disconnected not sending on wait_bcredit\n");
- rc = -EAGAIN;
- goto err_wait_bcredit;
- }
-
- rc = wait_for_send_lcredit(sc, batch);
- if (rc) {
- log_outgoing(ERR, "disconnected not sending on wait_lcredit\n");
- rc = -EAGAIN;
- goto err_wait_lcredit;
- }
-
- rc = wait_for_send_credits(sc, batch);
- if (rc) {
- log_outgoing(ERR, "disconnected not sending on wait_credit\n");
- rc = -EAGAIN;
- goto err_wait_credit;
- }
-
- new_credits = manage_credits_prior_sending(sc);
- if (new_credits == 0 &&
- atomic_read(&sc->send_io.credits.count) == 0 &&
- atomic_read(&sc->recv_io.credits.count) == 0) {
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
- rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
- atomic_read(&sc->send_io.credits.count) >= 1 ||
- atomic_read(&sc->recv_io.credits.available) >= 1 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- rc = -ENOTCONN;
- if (rc < 0) {
- log_outgoing(ERR, "disconnected not sending on last credit\n");
- rc = -EAGAIN;
- goto err_wait_credit;
- }
-
- new_credits = manage_credits_prior_sending(sc);
- }
-
- request = smbd_alloc_send_io(sc);
- if (IS_ERR(request)) {
- rc = PTR_ERR(request);
- goto err_alloc;
- }
-
- memset(request->sge, 0, sizeof(request->sge));
-
- /* Map the packet to DMA */
- header_length = sizeof(struct smbdirect_data_transfer);
- /* If this is a packet without payload, don't send padding */
- if (!iter)
- header_length = offsetof(struct smbdirect_data_transfer, padding);
-
- packet = smbdirect_send_io_payload(request);
- request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
- (void *)packet,
- header_length,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
- rc = -EIO;
- goto err_dma;
- }
-
- request->sge[0].length = header_length;
- request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
- request->num_sge = 1;
-
- /* Fill in the data payload to find out how much data we can add */
- if (iter) {
- struct smb_extract_to_rdma extract = {
- .nr_sge = request->num_sge,
- .max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
- .sge = request->sge,
- .device = sc->ib.dev,
- .local_dma_lkey = sc->ib.pd->local_dma_lkey,
- .direction = DMA_TO_DEVICE,
- };
- size_t payload_len = umin(*_remaining_data_length,
- sp->max_send_size - sizeof(*packet));
-
- rc = smb_extract_iter_to_rdma(iter, payload_len,
- &extract);
- if (rc < 0)
- goto err_dma;
- data_length = rc;
- request->num_sge = extract.nr_sge;
- *_remaining_data_length -= data_length;
- } else {
- data_length = 0;
- }
-
- /* Fill in the packet header */
- packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- packet->credits_granted = cpu_to_le16(new_credits);
-
- packet->flags = 0;
- if (manage_keep_alive_before_sending(sc))
- packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
-
- packet->reserved = 0;
- if (!data_length)
- packet->data_offset = 0;
- else
- packet->data_offset = cpu_to_le32(24);
- packet->data_length = cpu_to_le32(data_length);
- packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
- packet->padding = 0;
-
- log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
- le16_to_cpu(packet->credits_requested),
- le16_to_cpu(packet->credits_granted),
- le32_to_cpu(packet->data_offset),
- le32_to_cpu(packet->data_length),
- le32_to_cpu(packet->remaining_data_length));
-
- rc = smbd_post_send(sc, batch, request);
- if (!rc) {
- /*
- * From here request is moved to batch
- * and we should not free it explicitly.
- */
-
- if (batch != &_batch)
- return 0;
-
- rc = smbd_send_batch_flush(sc, batch, true);
- if (!rc)
- return 0;
-
- goto err_flush;
- }
-
-err_dma:
- smbd_free_send_io(request);
-
-err_flush:
-err_alloc:
- atomic_inc(&sc->send_io.credits.count);
- wake_up(&sc->send_io.credits.wait_queue);
-
-err_wait_credit:
- atomic_inc(&sc->send_io.lcredits.count);
- wake_up(&sc->send_io.lcredits.wait_queue);
-
-err_wait_lcredit:
- atomic_add(batch->credit, &sc->send_io.bcredits.count);
- batch->credit = 0;
- wake_up(&sc->send_io.bcredits.wait_queue);
-
-err_wait_bcredit:
- return rc;
-}
-
-/*
- * Send an empty message
- * An empty message is used to extend credits to the peer for keepalive
- * while there is no upper layer payload to send at the time
- */
-static int smbd_post_send_empty(struct smbdirect_socket *sc)
-{
- int remaining_data_length = 0;
-
- sc->statistics.send_empty++;
- return smbd_post_send_iter(sc, NULL, NULL, &remaining_data_length);
-}
-
static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
struct smbdirect_send_batch *batch,
struct iov_iter *iter,
- int *_remaining_data_length)
+ u32 remaining_data_length)
{
- int rc = 0;
+ int bytes = 0;
/*
- * smbd_post_send_iter() respects the
+ * smbdirect_connection_send_single_iter() respects the
* negotiated max_send_size, so we need to
* loop until the full iter is posted
*/
while (iov_iter_count(iter) > 0) {
- rc = smbd_post_send_iter(sc, batch, iter, _remaining_data_length);
- if (rc < 0)
- break;
- }
-
- return rc;
-}
-
-/*
- * Post a receive request to the transport
- * The remote peer can only send data when a receive request is posted
- * The interaction is controlled by send/receive credit system
- */
-static int smbd_post_recv(
- struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct ib_recv_wr recv_wr;
- int rc = -EIO;
-
- response->sge.addr = ib_dma_map_single(
- sc->ib.dev, response->packet,
- sp->max_recv_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
- return rc;
-
- response->sge.length = sp->max_recv_size;
- response->sge.lkey = sc->ib.pd->local_dma_lkey;
-
- response->cqe.done = recv_done;
-
- recv_wr.wr_cqe = &response->cqe;
- recv_wr.next = NULL;
- recv_wr.sg_list = &response->sge;
- recv_wr.num_sge = 1;
-
- rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
- if (rc) {
- ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
- response->sge.length, DMA_FROM_DEVICE);
- response->sge.length = 0;
- smbd_disconnect_rdma_connection(sc);
- log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
- }
-
- return rc;
-}
-
-/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
-static int smbd_negotiate(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int rc;
- struct smbdirect_recv_io *response = get_receive_buffer(sc);
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
-
- sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
- rc = smbd_post_recv(sc, response);
- log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
- rc, response->sge.addr,
- response->sge.length, response->sge.lkey);
- if (rc) {
- put_receive_buffer(sc, response);
- return rc;
- }
-
- rc = smbd_post_send_negotiate_req(sc);
- if (rc)
- return rc;
-
- rc = wait_event_interruptible_timeout(
- sc->status_wait,
- sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
- msecs_to_jiffies(sp->negotiate_timeout_msec));
- log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
-
- if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
- return 0;
-
- if (rc == 0)
- rc = -ETIMEDOUT;
- else if (rc == -ERESTARTSYS)
- rc = -EINTR;
- else
- rc = -ENOTCONN;
-
- return rc;
-}
-
-/*
- * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
- * This is a queue for reassembling upper layer payload and presenting it to
- * the upper layer. All incoming payloads go to the reassembly queue, whether
- * or not reassembly is required. The upper layer code reads from the queue for all
- * incoming payloads.
- * Put a received packet to the reassembly queue
- * response: the packet received
- * data_length: the size of payload in this packet
- */
-static void enqueue_reassembly(
- struct smbdirect_socket *sc,
- struct smbdirect_recv_io *response,
- int data_length)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- list_add_tail(&response->list, &sc->recv_io.reassembly.list);
- sc->recv_io.reassembly.queue_length++;
- /*
- * Make sure reassembly_data_length is updated after list and
- * reassembly_queue_length are updated. On the dequeue side
- * reassembly_data_length is checked without a lock to determine
- * if reassembly_queue_length and the list are up to date
- */
- virt_wmb();
- sc->recv_io.reassembly.data_length += data_length;
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- sc->statistics.enqueue_reassembly_queue++;
-}
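The virt_wmb() above pairs with a virt_rmb() on the lock-free dequeue side (see the removed smbd_recv() body further down). A condensed sketch of the pairing, using the field names from this file; illustrative only, not part of the patch:

	/* producer (enqueue side), under reassembly.lock */
	list_add_tail(&response->list, &sc->recv_io.reassembly.list);
	sc->recv_io.reassembly.queue_length++;
	virt_wmb();	/* publish list/queue_length before data_length */
	sc->recv_io.reassembly.data_length += data_length;

	/* consumer (dequeue side), checked without the lock */
	if (sc->recv_io.reassembly.data_length >= size) {
		virt_rmb();	/* pairs with the virt_wmb() above */
		queue_length = sc->recv_io.reassembly.queue_length;
		/* ... now safe to walk the front of the list ... */
	}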
-
-/*
- * Get the first entry at the front of reassembly queue
- * Caller is responsible for locking
- * return value: the first entry if any, NULL if queue is empty
- */
-static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *ret = NULL;
-
- if (!list_empty(&sc->recv_io.reassembly.list)) {
- ret = list_first_entry(
- &sc->recv_io.reassembly.list,
- struct smbdirect_recv_io, list);
- }
- return ret;
-}
-
-/*
- * Get a receive buffer
- * For each remote send, we need to post a receive. The receive buffers are
- * pre-allocated in advance.
- * return value: the receive buffer, NULL if none is available
- */
-static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *ret = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&sc->recv_io.free.lock, flags);
- if (!list_empty(&sc->recv_io.free.list)) {
- ret = list_first_entry(
- &sc->recv_io.free.list,
- struct smbdirect_recv_io, list);
- list_del(&ret->list);
- sc->statistics.get_receive_buffer++;
- }
- spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
-
- return ret;
-}
-
-/*
- * Return a receive buffer
- * Upon returning of a receive buffer, we can post new receive and extend
- * more receive credits to remote peer. This is done immediately after a
- * receive buffer is returned.
- */
-static void put_receive_buffer(
- struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
-{
- unsigned long flags;
-
- if (likely(response->sge.length != 0)) {
- ib_dma_unmap_single(sc->ib.dev,
- response->sge.addr,
- response->sge.length,
- DMA_FROM_DEVICE);
- response->sge.length = 0;
- }
-
- spin_lock_irqsave(&sc->recv_io.free.lock, flags);
- list_add_tail(&response->list, &sc->recv_io.free.list);
- sc->statistics.put_receive_buffer++;
- spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
-
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
-}
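Together with smbd_post_recv() above, this closes the receive-credit loop: the peer may only transmit while receives are posted, so every consumed buffer is recycled promptly. A minimal sketch of the cycle (names from this file; illustrative):

	/* after the payload has been consumed by the upper layer */
	put_receive_buffer(sc, response);
	/*
	 * put_receive_buffer() queues recv_io.posted.refill_work, whose
	 * handler (smbd_post_send_credits, see the INIT_WORK in the removed
	 * _smbd_get_connection() below) posts fresh receives via
	 * smbd_post_recv() and grants the new credits to the peer.
	 */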
-
-/* Preallocate all receive buffer on transport establishment */
-static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
-{
- struct smbdirect_recv_io *response;
- int i;
-
- for (i = 0; i < num_buf; i++) {
- response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
- if (!response)
- goto allocate_failed;
-
- response->socket = sc;
- response->sge.length = 0;
- list_add_tail(&response->list, &sc->recv_io.free.list);
- }
-
- return 0;
-
-allocate_failed:
- while (!list_empty(&sc->recv_io.free.list)) {
- response = list_first_entry(
- &sc->recv_io.free.list,
- struct smbdirect_recv_io, list);
- list_del(&response->list);
-
- mempool_free(response, sc->recv_io.mem.pool);
- }
- return -ENOMEM;
-}
-
-static void destroy_receive_buffers(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *response;
-
- while ((response = get_receive_buffer(sc)))
- mempool_free(response, sc->recv_io.mem.pool);
-}
-
-static void send_immediate_empty_message(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, idle.immediate_work);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return;
-
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(sc);
-}
+ int rc;
-/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
-static void idle_connection_timer(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, idle.timer_work.work);
- struct smbdirect_socket_parameters *sp = &sc->parameters;
-
- if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
- log_keep_alive(ERR,
- "error status sc->idle.keepalive=%d\n",
- sc->idle.keepalive);
- smbd_disconnect_rdma_connection(sc);
- return;
+ rc = smbdirect_connection_send_single_iter(sc,
+ batch,
+ iter,
+ 0, /* flags */
+ remaining_data_length);
+ if (rc < 0)
+ return rc;
+ remaining_data_length -= rc;
+ bytes += rc;
}
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return;
-
- /*
- * Now use the keepalive timeout (instead of keepalive interval)
- * in order to wait for a response
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_timeout_msec));
- log_keep_alive(INFO, "schedule send of empty idle message\n");
- queue_work(sc->workqueue, &sc->idle.immediate_work);
+ return bytes;
}
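Note the changed contract: smbd_post_send_full_iter() now returns the number of bytes posted (or a negative errno) and takes remaining_data_length by value, so the caller does the accounting. A caller-side sketch of the new contract (total_len is hypothetical; illustrative):

	u32 remaining = total_len;
	int sent;

	sent = smbd_post_send_full_iter(sc, batch, &iter, remaining);
	if (sent < 0)
		return sent;	/* negative errno on failure */
	remaining -= sent;	/* account for what was posted */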
/*
@@ -1892,88 +203,14 @@ static void idle_connection_timer(struct work_struct *work)
void smbd_destroy(struct TCP_Server_Info *server)
{
struct smbd_connection *info = server->smbd_conn;
- struct smbdirect_socket *sc;
- struct smbdirect_recv_io *response;
- unsigned long flags;
if (!info) {
log_rdma_event(INFO, "rdma session already destroyed\n");
return;
}
- sc = &info->socket;
-
- log_rdma_event(INFO, "cancelling and disable disconnect_work\n");
- disable_work_sync(&sc->disconnect_work);
-
- log_rdma_event(INFO, "destroying rdma session\n");
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
- smbd_disconnect_rdma_work(&sc->disconnect_work);
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
- log_rdma_event(INFO, "wait for transport being disconnected\n");
- wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
- log_rdma_event(INFO, "waited for transport being disconnected\n");
- }
-
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- *
- * Most likely this was already called via
- * smbd_disconnect_rdma_work(), but call it again...
- */
- smbd_disconnect_wake_up_all(sc);
-
- log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
- disable_work_sync(&sc->recv_io.posted.refill_work);
-
- log_rdma_event(INFO, "destroying qp\n");
- ib_drain_qp(sc->ib.qp);
- rdma_destroy_qp(sc->rdma.cm_id);
- sc->ib.qp = NULL;
- log_rdma_event(INFO, "cancelling idle timer\n");
- disable_delayed_work_sync(&sc->idle.timer_work);
- log_rdma_event(INFO, "cancelling send immediate work\n");
- disable_work_sync(&sc->idle.immediate_work);
+ smbdirect_socket_release(info->socket);
- /* It's not possible for upper layer to get to reassembly */
- log_rdma_event(INFO, "drain the reassembly queue\n");
- do {
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- response = _get_first_reassembly(sc);
- if (response) {
- list_del(&response->list);
- spin_unlock_irqrestore(
- &sc->recv_io.reassembly.lock, flags);
- put_receive_buffer(sc, response);
- } else
- spin_unlock_irqrestore(
- &sc->recv_io.reassembly.lock, flags);
- } while (response);
- sc->recv_io.reassembly.data_length = 0;
-
- log_rdma_event(INFO, "free receive buffers\n");
- destroy_receive_buffers(sc);
-
- log_rdma_event(INFO, "freeing mr list\n");
- destroy_mr_list(sc);
-
- ib_free_cq(sc->ib.send_cq);
- ib_free_cq(sc->ib.recv_cq);
- ib_dealloc_pd(sc->ib.pd);
- rdma_destroy_id(sc->rdma.cm_id);
-
- /* free mempools */
- mempool_destroy(sc->send_io.mem.pool);
- kmem_cache_destroy(sc->send_io.mem.cache);
-
- mempool_destroy(sc->recv_io.mem.pool);
- kmem_cache_destroy(sc->recv_io.mem.cache);
-
- sc->status = SMBDIRECT_SOCKET_DESTROYED;
-
- destroy_workqueue(sc->workqueue);
- log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
server->smbd_conn = NULL;
}
@@ -1995,10 +232,8 @@ int smbd_reconnect(struct TCP_Server_Info *server)
 * This is possible if the transport is disconnected and we haven't received
 * a notification from RDMA, but the upper layer has detected a timeout
*/
- if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
- log_rdma_event(INFO, "disconnecting transport\n");
- smbd_destroy(server);
- }
+ log_rdma_event(INFO, "disconnecting transport\n");
+ smbd_destroy(server);
create_conn:
log_rdma_event(INFO, "creating rdma session\n");
@@ -2014,112 +249,43 @@ create_conn:
return -ENOENT;
}
-static void destroy_caches(struct smbdirect_socket *sc)
-{
- destroy_receive_buffers(sc);
- mempool_destroy(sc->recv_io.mem.pool);
- kmem_cache_destroy(sc->recv_io.mem.cache);
- mempool_destroy(sc->send_io.mem.pool);
- kmem_cache_destroy(sc->send_io.mem.cache);
-}
-
-#define MAX_NAME_LEN 80
-static int allocate_caches(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- char name[MAX_NAME_LEN];
- int rc;
-
- if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
- return -ENOMEM;
-
- scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
- sc->send_io.mem.cache =
- kmem_cache_create(
- name,
- sizeof(struct smbdirect_send_io) +
- sizeof(struct smbdirect_data_transfer),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!sc->send_io.mem.cache)
- return -ENOMEM;
-
- sc->send_io.mem.pool =
- mempool_create(sp->send_credit_target, mempool_alloc_slab,
- mempool_free_slab, sc->send_io.mem.cache);
- if (!sc->send_io.mem.pool)
- goto out1;
-
- scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
-
- struct kmem_cache_args response_args = {
- .align = __alignof__(struct smbdirect_recv_io),
- .useroffset = (offsetof(struct smbdirect_recv_io, packet) +
- sizeof(struct smbdirect_data_transfer)),
- .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
- };
- sc->recv_io.mem.cache =
- kmem_cache_create(name,
- sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
- &response_args, SLAB_HWCACHE_ALIGN);
- if (!sc->recv_io.mem.cache)
- goto out2;
-
- sc->recv_io.mem.pool =
- mempool_create(sp->recv_credit_max, mempool_alloc_slab,
- mempool_free_slab, sc->recv_io.mem.cache);
- if (!sc->recv_io.mem.pool)
- goto out3;
-
- rc = allocate_receive_buffers(sc, sp->recv_credit_max);
- if (rc) {
- log_rdma_event(ERR, "failed to allocate receive buffers\n");
- goto out4;
- }
-
- return 0;
-
-out4:
- mempool_destroy(sc->recv_io.mem.pool);
-out3:
- kmem_cache_destroy(sc->recv_io.mem.cache);
-out2:
- mempool_destroy(sc->send_io.mem.pool);
-out1:
- kmem_cache_destroy(sc->send_io.mem.cache);
- return -ENOMEM;
-}
-
/* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
- int rc;
+ struct net *net = cifs_net_ns(server);
struct smbd_connection *info;
struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters init_params = {};
struct smbdirect_socket_parameters *sp;
- struct rdma_conn_param conn_param;
- struct ib_qp_cap qp_cap;
- struct ib_qp_init_attr qp_attr;
- struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
- struct ib_port_immutable port_immutable;
- __be32 ird_ord_hdr[2];
- char wq_name[80];
- struct workqueue_struct *workqueue;
-
- info = kzalloc_obj(struct smbd_connection);
- if (!info)
- return NULL;
- sc = &info->socket;
- scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
- workqueue = create_workqueue(wq_name);
- if (!workqueue)
- goto create_wq_failed;
- smbdirect_socket_init(sc);
- sc->workqueue = workqueue;
- sp = &sc->parameters;
+ __be16 *sport;
+ u64 port_flags = 0;
+ int ret;
- INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
+ switch (port) {
+ case SMBD_PORT:
+ /*
+ * only allow iWarp devices
+ * for port 5445.
+ */
+ port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
+ break;
+ case SMB_PORT:
+ /*
+ * only allow InfiniBand, RoCEv1 or RoCEv2
+ * devices for port 445.
+ *
+ * (Basically don't allow iWarp devices)
+ */
+ port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
+ break;
+ }
+ /*
+ * Create the initial parameters
+ */
+ sp = &init_params;
+ sp->flags = port_flags;
sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
@@ -2135,213 +301,55 @@ static struct smbd_connection *_smbd_get_connection(
sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
- rc = smbd_ia_open(sc, dstaddr, port);
- if (rc) {
- log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
- goto create_id_failed;
- }
-
- if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
- sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- sp->send_credit_target,
- sc->ib.dev->attrs.max_cqe,
- sc->ib.dev->attrs.max_qp_wr);
- goto config_failed;
- }
-
- if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
- sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- sp->recv_credit_max,
- sc->ib.dev->attrs.max_cqe,
- sc->ib.dev->attrs.max_qp_wr);
- goto config_failed;
- }
-
- if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
- sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
- log_rdma_event(ERR,
- "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
- IB_DEVICE_NAME_MAX,
- sc->ib.dev->name,
- sc->ib.dev->attrs.max_send_sge,
- sc->ib.dev->attrs.max_recv_sge);
- goto config_failed;
- }
-
- sp->responder_resources =
- min_t(u8, sp->responder_resources,
- sc->ib.dev->attrs.max_qp_rd_atom);
- log_rdma_mr(INFO, "responder_resources=%d\n",
- sp->responder_resources);
-
- /*
- * We allocate sp->responder_resources * 2 MRs
- * and each MR needs WRs for REG and INV, so
- * we use '* 4'.
- *
- * +1 for ib_drain_qp()
- */
- memset(&qp_cap, 0, sizeof(qp_cap));
- qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
- qp_cap.max_recv_wr = sp->recv_credit_max + 1;
- qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
-
- sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
- if (IS_ERR(sc->ib.pd)) {
- rc = PTR_ERR(sc->ib.pd);
- sc->ib.pd = NULL;
- log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
- goto alloc_pd_failed;
- }
-
- sc->ib.send_cq =
- ib_alloc_cq_any(sc->ib.dev, sc,
- qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
- if (IS_ERR(sc->ib.send_cq)) {
- sc->ib.send_cq = NULL;
- goto alloc_cq_failed;
- }
-
- sc->ib.recv_cq =
- ib_alloc_cq_any(sc->ib.dev, sc,
- qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
- if (IS_ERR(sc->ib.recv_cq)) {
- sc->ib.recv_cq = NULL;
- goto alloc_cq_failed;
- }
-
- memset(&qp_attr, 0, sizeof(qp_attr));
- qp_attr.event_handler = smbd_qp_async_error_upcall;
- qp_attr.qp_context = sc;
- qp_attr.cap = qp_cap;
- qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = sc->ib.send_cq;
- qp_attr.recv_cq = sc->ib.recv_cq;
- qp_attr.port_num = ~0;
-
- rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
- if (rc) {
- log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
- goto create_qp_failed;
- }
- sc->ib.qp = sc->rdma.cm_id->qp;
-
- memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = sp->initiator_depth;
- conn_param.responder_resources = sp->responder_resources;
-
- /* Need to send IRD/ORD in private data for iWARP */
- sc->ib.dev->ops.get_port_immutable(
- sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
- if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
- ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
- ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
- conn_param.private_data = ird_ord_hdr;
- conn_param.private_data_len = sizeof(ird_ord_hdr);
- } else {
- conn_param.private_data = NULL;
- conn_param.private_data_len = 0;
- }
-
- conn_param.retry_count = SMBD_CM_RETRY;
- conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
- conn_param.flow_control = 0;
-
- log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
- &addr_in->sin_addr, port);
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
- rc = rdma_connect(sc->rdma.cm_id, &conn_param);
- if (rc) {
- log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
- goto rdma_connect_failed;
- }
-
- wait_event_interruptible_timeout(
- sc->status_wait,
- sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
- msecs_to_jiffies(sp->rdma_connect_timeout_msec));
+ info = kzalloc_obj(*info);
+ if (!info)
+ return NULL;
+ ret = smbdirect_socket_create_kern(net, &sc);
+ if (ret)
+ goto socket_init_failed;
+ smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf);
+ ret = smbdirect_socket_set_initial_parameters(sc, sp);
+ if (ret)
+ goto set_params_failed;
+ ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
+ if (ret)
+ goto set_settings_failed;
- if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
- log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
- goto rdma_connect_failed;
- }
+ if (dstaddr->sa_family == AF_INET6)
+ sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
+ else
+ sport = &((struct sockaddr_in *)dstaddr)->sin_port;
- log_rdma_event(INFO, "rdma_connect connected\n");
+ *sport = htons(port);
- rc = allocate_caches(sc);
- if (rc) {
- log_rdma_event(ERR, "cache allocation failed\n");
- goto allocate_cache_failed;
+ ret = smbdirect_connect_sync(sc, dstaddr);
+ if (ret) {
+ log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n",
+ dstaddr, ERR_PTR(ret));
+ goto connect_failed;
}
- INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
- INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
- /*
- * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
- * so that the timer will cause a disconnect.
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->negotiate_timeout_msec));
+ info->socket = sc;
+ return info;
- INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
+connect_failed:
+set_settings_failed:
+set_params_failed:
+ smbdirect_socket_release(sc);
+socket_init_failed:
+ kfree(info);
+ return NULL;
+}
- rc = smbd_negotiate(sc);
- if (rc) {
- log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
- goto negotiation_failed;
- }
+const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
+{
+ if (unlikely(!conn->socket)) {
+ static const struct smbdirect_socket_parameters zero_params;
- rc = allocate_mr_list(sc);
- if (rc) {
- log_rdma_mr(ERR, "memory registration allocation failed\n");
- goto allocate_mr_failed;
+ return &zero_params;
}
- return info;
-
-allocate_mr_failed:
- /* At this point, we need a full transport shutdown */
- server->smbd_conn = info;
- smbd_destroy(server);
- return NULL;
-
-negotiation_failed:
- disable_delayed_work_sync(&sc->idle.timer_work);
- destroy_caches(sc);
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
- rdma_disconnect(sc->rdma.cm_id);
- wait_event(sc->status_wait,
- sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
-
-allocate_cache_failed:
-rdma_connect_failed:
- rdma_destroy_qp(sc->rdma.cm_id);
-
-create_qp_failed:
-alloc_cq_failed:
- if (sc->ib.send_cq)
- ib_free_cq(sc->ib.send_cq);
- if (sc->ib.recv_cq)
- ib_free_cq(sc->ib.recv_cq);
-
- ib_dealloc_pd(sc->ib.pd);
-
-alloc_pd_failed:
-config_failed:
- rdma_destroy_id(sc->rdma.cm_id);
-
-create_id_failed:
- destroy_workqueue(sc->workqueue);
-create_wq_failed:
- kfree(info);
- return NULL;
+ return smbdirect_socket_get_current_parameters(conn->socket);
}
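The zero_params fallback lets callers dereference the result unconditionally, even before the socket exists; every limit simply reads as 0. A sketch of the intended usage (payload_len is hypothetical; illustrative):

	const struct smbdirect_socket_parameters *sp =
		smbd_get_parameters(server->smbd_conn);

	/* safe even if conn->socket is NULL: all limits read as 0 */
	if (payload_len > sp->max_fragmented_send_size)
		return -EMSGSIZE;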
struct smbd_connection *smbd_get_connection(
@@ -2362,7 +370,7 @@ try_again:
if (!ret)
return NULL;
- sp = &ret->socket.parameters;
+ sp = smbd_get_parameters(ret);
server->rdma_readwrite_threshold =
rdma_readwrite_threshold > sp->max_fragmented_send_size ?
@@ -2388,138 +396,12 @@ try_again:
*/
int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbdirect_recv_io *response;
- struct smbdirect_data_transfer *data_transfer;
- size_t size = iov_iter_count(&msg->msg_iter);
- int to_copy, to_read, data_read, offset;
- u32 data_length, remaining_data_length, data_offset;
- int rc;
-
- if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
- return -EINVAL; /* It's a bug in upper layer to get there */
-
-again:
- /*
- * No need to hold the reassembly queue lock all the time as we are
- * the only one reading from the front of the queue. The transport
- * may add more entries to the back of the queue at the same time
- */
- log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
- sc->recv_io.reassembly.data_length);
- if (sc->recv_io.reassembly.data_length >= size) {
- int queue_length;
- int queue_removed = 0;
- unsigned long flags;
-
- /*
- * Need to make sure reassembly_data_length is read before
- * reading reassembly_queue_length and calling
- * _get_first_reassembly. This call is lock free
- * as we never read the end of the queue, which is being
- * updated in SOFTIRQ context as more data is received
- */
- virt_rmb();
- queue_length = sc->recv_io.reassembly.queue_length;
- data_read = 0;
- to_read = size;
- offset = sc->recv_io.reassembly.first_entry_offset;
- while (data_read < size) {
- response = _get_first_reassembly(sc);
- data_transfer = smbdirect_recv_io_payload(response);
- data_length = le32_to_cpu(data_transfer->data_length);
- remaining_data_length =
- le32_to_cpu(
- data_transfer->remaining_data_length);
- data_offset = le32_to_cpu(data_transfer->data_offset);
-
- /*
- * The upper layer expects RFC1002 length at the
- * beginning of the payload. Return it to indicate
- * the total length of the packet. This minimizes the
- * changes to the upper layer packet processing logic. This
- * will eventually be removed when an intermediate
- * transport layer is added
- */
- if (response->first_segment && size == 4) {
- unsigned int rfc1002_len =
- data_length + remaining_data_length;
- __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
- if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
- &msg->msg_iter) != sizeof(rfc1002_hdr))
- return -EFAULT;
- data_read = 4;
- response->first_segment = false;
- log_read(INFO, "returning rfc1002 length %d\n",
- rfc1002_len);
- goto read_rfc1002_done;
- }
-
- to_copy = min_t(int, data_length - offset, to_read);
- if (copy_to_iter((char *)data_transfer + data_offset + offset,
- to_copy, &msg->msg_iter) != to_copy)
- return -EFAULT;
-
- /* move on to the next buffer? */
- if (to_copy == data_length - offset) {
- queue_length--;
- /*
- * No need to lock if we are not at the
- * end of the queue
- */
- if (queue_length)
- list_del(&response->list);
- else {
- spin_lock_irqsave(
- &sc->recv_io.reassembly.lock, flags);
- list_del(&response->list);
- spin_unlock_irqrestore(
- &sc->recv_io.reassembly.lock, flags);
- }
- queue_removed++;
- sc->statistics.dequeue_reassembly_queue++;
- put_receive_buffer(sc, response);
- offset = 0;
- log_read(INFO, "put_receive_buffer offset=0\n");
- } else
- offset += to_copy;
-
- to_read -= to_copy;
- data_read += to_copy;
-
- log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
- to_copy, data_length - offset,
- to_read, data_read, offset);
- }
-
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- sc->recv_io.reassembly.data_length -= data_read;
- sc->recv_io.reassembly.queue_length -= queue_removed;
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
-
- sc->recv_io.reassembly.first_entry_offset = offset;
- log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
- data_read, sc->recv_io.reassembly.data_length,
- sc->recv_io.reassembly.first_entry_offset);
-read_rfc1002_done:
- return data_read;
- }
+ struct smbdirect_socket *sc = info->socket;
- log_read(INFO, "wait_event on more data\n");
- rc = wait_event_interruptible(
- sc->recv_io.reassembly.wait_queue,
- sc->recv_io.reassembly.data_length >= size ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- /* Don't return any data if interrupted */
- if (rc)
- return rc;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- log_read(ERR, "disconnected\n");
- return -ECONNABORTED;
- }
+ if (!smbdirect_connection_is_connected(sc))
+ return -ENOTCONN;
- goto again;
+ return smbdirect_connection_recvmsg(sc, msg, 0);
}
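The removed open-coded reassembly reader, including its RFC1002-length special case for 4-byte reads, now lives behind smbdirect_connection_recvmsg(), so the upper-layer call pattern is unchanged. A sketch, assuming the same semantics (illustrative):

	__be32 hdr;
	struct kvec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
	struct msghdr msg = {};
	int rc;

	/* a 4-byte read on a fresh PDU yields the rfc1002-style length */
	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, sizeof(hdr));
	rc = smbd_recv(info, &msg);	/* returns 4 on success */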
/*
@@ -2532,16 +414,17 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
- struct smbdirect_socket *sc = &info->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_socket *sc = info->socket;
+ const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info);
struct smb_rqst *rqst;
struct iov_iter iter;
- struct smbdirect_send_batch batch;
+ struct smbdirect_send_batch_storage bstorage;
+ struct smbdirect_send_batch *batch;
unsigned int remaining_data_length, klen;
int rc, i, rqst_idx;
int error = 0;
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ if (!smbdirect_connection_is_connected(sc))
return -EAGAIN;
/*
@@ -2564,7 +447,7 @@ int smbd_send(struct TCP_Server_Info *server,
num_rqst, remaining_data_length);
rqst_idx = 0;
- smbd_send_batch_init(&batch, false, 0);
+ batch = smbdirect_init_send_batch_storage(&bstorage, false, 0);
do {
rqst = &rqst_array[rqst_idx];
@@ -2583,25 +466,27 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_full_iter(sc, &batch, &iter, &remaining_data_length);
+ rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length);
if (rc < 0) {
error = rc;
break;
}
+ remaining_data_length -= rc;
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_full_iter(sc, &batch, &rqst->rq_iter,
- &remaining_data_length);
+ rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter,
+ remaining_data_length);
if (rc < 0) {
error = rc;
break;
}
+ remaining_data_length -= rc;
}
} while (++rqst_idx < num_rqst);
- rc = smbd_send_batch_flush(sc, &batch, true);
+ rc = smbdirect_connection_send_batch_flush(sc, batch, true);
if (unlikely(!rc && error))
rc = error;
@@ -2612,298 +497,15 @@ int smbd_send(struct TCP_Server_Info *server,
 * that means all the I/Os have been sent out and we are good to return
*/
- wait_event(sc->send_io.pending.zero_wait_queue,
- atomic_read(&sc->send_io.pending.count) == 0 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
- rc = -EAGAIN;
-
- return rc;
-}
-
-static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_mr_io *mr =
- container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
- struct smbdirect_socket *sc = mr->socket;
-
- if (wc->status) {
- log_rdma_mr(ERR, "status=%d\n", wc->status);
- smbd_disconnect_rdma_connection(sc);
- }
-}
-
-/*
- * The work queue function that recovers MRs
- * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
- * again. Both calls are slow, so finish them in a workqueue. This will not
- * block the I/O path.
- * There is one workqueue that recovers MRs, so there is no need to lock as the
- * I/O requests calling smbd_register_mr will never update the links in the
- * mr_list.
- */
-static void smbd_mr_recovery_work(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, mr_io.recovery_work);
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_mr_io *smbdirect_mr;
- int rc;
-
- list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
- if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
-
- /* recover this MR entry */
- rc = ib_dereg_mr(smbdirect_mr->mr);
- if (rc) {
- log_rdma_mr(ERR,
- "ib_dereg_mr failed rc=%x\n",
- rc);
- smbd_disconnect_rdma_connection(sc);
- continue;
- }
-
- smbdirect_mr->mr = ib_alloc_mr(
- sc->ib.pd, sc->mr_io.type,
- sp->max_frmr_depth);
- if (IS_ERR(smbdirect_mr->mr)) {
- log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- sc->mr_io.type,
- sp->max_frmr_depth);
- smbd_disconnect_rdma_connection(sc);
- continue;
- }
- } else
- /* This MR is being used, don't recover it */
- continue;
-
- smbdirect_mr->state = SMBDIRECT_MR_READY;
-
- /* smbdirect_mr->state is updated by this function
- * and is read and updated by I/O issuing CPUs trying
- * to get an MR; the call to atomic_inc_return
- * implies a memory barrier and guarantees this
- * value is updated before waking up any calls to
- * get_mr() from the I/O issuing CPUs
- */
- if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
- wake_up(&sc->mr_io.ready.wait_queue);
- }
-}
-
-static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr)
-{
- struct smbdirect_socket *sc = mr->socket;
-
- lockdep_assert_held(&mr->mutex);
-
- if (mr->state == SMBDIRECT_MR_DISABLED)
- return;
-
- if (mr->mr)
- ib_dereg_mr(mr->mr);
- if (mr->sgt.nents)
- ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
- kfree(mr->sgt.sgl);
-
- mr->mr = NULL;
- mr->sgt.sgl = NULL;
- mr->sgt.nents = 0;
-
- mr->state = SMBDIRECT_MR_DISABLED;
-}
-
-static void smbd_mr_free_locked(struct kref *kref)
-{
- struct smbdirect_mr_io *mr =
- container_of(kref, struct smbdirect_mr_io, kref);
-
- lockdep_assert_held(&mr->mutex);
-
- /*
- * smbd_mr_disable_locked() should already be called!
- */
- if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
- smbd_mr_disable_locked(mr);
-
- mutex_unlock(&mr->mutex);
- mutex_destroy(&mr->mutex);
- kfree(mr);
-}
-
-static void destroy_mr_list(struct smbdirect_socket *sc)
-{
- struct smbdirect_mr_io *mr, *tmp;
- LIST_HEAD(all_list);
- unsigned long flags;
-
- disable_work_sync(&sc->mr_io.recovery_work);
-
- spin_lock_irqsave(&sc->mr_io.all.lock, flags);
- list_splice_tail_init(&sc->mr_io.all.list, &all_list);
- spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
-
- list_for_each_entry_safe(mr, tmp, &all_list, list) {
- mutex_lock(&mr->mutex);
-
- smbd_mr_disable_locked(mr);
- list_del(&mr->list);
- mr->socket = NULL;
-
- /*
- * No kref_put_mutex() as it's already locked.
- *
- * If smbd_mr_free_locked() is called
- * and the mutex is unlocked and mr is gone,
- * in that case kref_put() returned 1.
- *
- * If kref_put() returned 0 we know that
- * smbd_mr_free_locked() didn't
- * run. Not by us nor by anyone else, as we
- * still hold the mutex, so we need to unlock.
- *
- * If the mr is still registered it will
- * be dangling (detached from the connection),
- * waiting for smbd_deregister_mr() to be
- * called in order to free the memory.
- */
- if (!kref_put(&mr->kref, smbd_mr_free_locked))
- mutex_unlock(&mr->mutex);
- }
-}
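The kref_put()-while-locked pattern used here (and again in the removed smbd_register_mr() and smbd_deregister_mr() below) is subtle enough to spell out in its generic form; a sketch under the same assumptions as the comment above (illustrative):

	mutex_lock(&mr->mutex);
	/* ... final teardown under the lock ... */
	if (!kref_put(&mr->kref, smbd_mr_free_locked)) {
		/*
		 * The release function did not run, so the mr still
		 * exists and we still hold the mutex: unlock it.
		 * If kref_put() had returned 1, smbd_mr_free_locked()
		 * would have unlocked, destroyed and freed the mr.
		 */
		mutex_unlock(&mr->mutex);
	}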
-
-/*
- * Allocate MRs used for RDMA read/write
- * The number of MRs will not exceed the hardware capability in responder_resources.
- * All MRs are kept in mr_list. An MR can be recovered after it's used.
- * Recovery is done in smbd_mr_recovery_work. The content of a list entry changes
- * as MRs are used and recovered for I/O, but the list links will not change
- */
-static int allocate_mr_list(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_mr_io *mr;
- int ret;
- u32 i;
-
- if (sp->responder_resources == 0) {
- log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
- return -EINVAL;
- }
-
- /* Allocate more MRs (2x) than hardware responder_resources */
- for (i = 0; i < sp->responder_resources * 2; i++) {
- mr = kzalloc_obj(*mr);
- if (!mr) {
- ret = -ENOMEM;
- goto kzalloc_mr_failed;
- }
-
- kref_init(&mr->kref);
- mutex_init(&mr->mutex);
-
- mr->mr = ib_alloc_mr(sc->ib.pd,
- sc->mr_io.type,
- sp->max_frmr_depth);
- if (IS_ERR(mr->mr)) {
- ret = PTR_ERR(mr->mr);
- log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- sc->mr_io.type, sp->max_frmr_depth);
- goto ib_alloc_mr_failed;
- }
-
- mr->sgt.sgl = kzalloc_objs(struct scatterlist,
- sp->max_frmr_depth);
- if (!mr->sgt.sgl) {
- ret = -ENOMEM;
- log_rdma_mr(ERR, "failed to allocate sgl\n");
- goto kcalloc_sgl_failed;
- }
- mr->state = SMBDIRECT_MR_READY;
- mr->socket = sc;
+ error = rc;
+ rc = smbdirect_connection_send_wait_zero_pending(sc);
+ if (unlikely(rc && !error))
+ error = -EAGAIN;
- list_add_tail(&mr->list, &sc->mr_io.all.list);
- atomic_inc(&sc->mr_io.ready.count);
- }
-
- INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
+ if (unlikely(error))
+ return error;
return 0;
-
-kcalloc_sgl_failed:
- ib_dereg_mr(mr->mr);
-ib_alloc_mr_failed:
- mutex_destroy(&mr->mutex);
- kfree(mr);
-kzalloc_mr_failed:
- destroy_mr_list(sc);
- return ret;
-}
-
-/*
- * Get an MR from mr_list. This function waits until there is at least one
- * MR available in the list. It may access the list while
- * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
- * as they never modify the same places. However, there may be several CPUs
- * issuing I/O trying to get an MR at the same time; mr_list_lock is used to
- * protect this situation.
- */
-static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
-{
- struct smbdirect_mr_io *ret;
- unsigned long flags;
- int rc;
-again:
- rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
- atomic_read(&sc->mr_io.ready.count) ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (rc) {
- log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
- return NULL;
- }
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
- return NULL;
- }
-
- spin_lock_irqsave(&sc->mr_io.all.lock, flags);
- list_for_each_entry(ret, &sc->mr_io.all.list, list) {
- if (ret->state == SMBDIRECT_MR_READY) {
- ret->state = SMBDIRECT_MR_REGISTERED;
- kref_get(&ret->kref);
- spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
- atomic_dec(&sc->mr_io.ready.count);
- atomic_inc(&sc->mr_io.used.count);
- return ret;
- }
- }
-
- spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
- /*
- * It is possible that we could fail to get an MR because other processes may
- * try to acquire an MR at the same time. If this is the case, retry.
- */
- goto again;
-}
-
-/*
- * Transcribe the pages from an iterator into an MR scatterlist.
- */
-static int smbd_iter_to_mr(struct iov_iter *iter,
- struct sg_table *sgt,
- unsigned int max_sg)
-{
- int ret;
-
- memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
-
- ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
- WARN_ON(ret < 0);
- if (sgt->nents > 0)
- sg_mark_end(&sgt->sgl[sgt->nents - 1]);
- return ret;
}
/*
@@ -2917,132 +519,18 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
struct iov_iter *iter,
bool writing, bool need_invalidate)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_mr_io *mr;
- int rc, num_pages;
- struct ib_reg_wr *reg_wr;
-
- num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
- if (num_pages > sp->max_frmr_depth) {
- log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
- num_pages, sp->max_frmr_depth);
- WARN_ON_ONCE(1);
- return NULL;
- }
+ struct smbdirect_socket *sc = info->socket;
- mr = get_mr(sc);
- if (!mr) {
- log_rdma_mr(ERR, "get_mr returning NULL\n");
+ if (!smbdirect_connection_is_connected(sc))
return NULL;
- }
-
- mutex_lock(&mr->mutex);
-
- mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
- mr->need_invalidate = need_invalidate;
- mr->sgt.nents = 0;
- mr->sgt.orig_nents = 0;
-
- log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
- num_pages, iov_iter_count(iter), sp->max_frmr_depth);
- smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);
- rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
- if (!rc) {
- log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
- num_pages, mr->dir, rc);
- goto dma_map_error;
- }
-
- rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
- if (rc != mr->sgt.nents) {
- log_rdma_mr(ERR,
- "ib_map_mr_sg failed rc = %d nents = %x\n",
- rc, mr->sgt.nents);
- goto map_mr_error;
- }
-
- ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
- reg_wr = &mr->wr;
- reg_wr->wr.opcode = IB_WR_REG_MR;
- mr->cqe.done = register_mr_done;
- reg_wr->wr.wr_cqe = &mr->cqe;
- reg_wr->wr.num_sge = 0;
- reg_wr->wr.send_flags = IB_SEND_SIGNALED;
- reg_wr->mr = mr->mr;
- reg_wr->key = mr->mr->rkey;
- reg_wr->access = writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ;
-
- /*
- * There is no need to wait for completion of ib_post_send
- * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
- * on the next ib_post_send when we actually send I/O to remote peer
- */
- rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
- if (!rc) {
- /*
- * get_mr() gave us a reference
- * via kref_get(&mr->kref), we keep that and let
- * the caller use smbd_deregister_mr()
- * to remove it again.
- */
- mutex_unlock(&mr->mutex);
- return mr;
- }
-
- log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
- rc, reg_wr->key);
-
- /* If everything failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
-map_mr_error:
- ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
-
-dma_map_error:
- mr->sgt.nents = 0;
- mr->state = SMBDIRECT_MR_ERROR;
- if (atomic_dec_and_test(&sc->mr_io.used.count))
- wake_up(&sc->mr_io.cleanup.wait_queue);
-
- smbd_disconnect_rdma_connection(sc);
-
- /*
- * get_mr() gave us a reference
- * via kref_get(&mr->kref), we need to remove it again
- * on error.
- *
- * No kref_put_mutex() as it's already locked.
- *
- * If smbd_mr_free_locked() is called
- * and the mutex is unlocked and mr is gone,
- * in that case kref_put() returned 1.
- *
- * If kref_put() returned 0 we know that
- * smbd_mr_free_locked() didn't
- * run. Not by us nor by anyone else, as we
- * still hold the mutex, so we need to unlock.
- */
- if (!kref_put(&mr->kref, smbd_mr_free_locked))
- mutex_unlock(&mr->mutex);
-
- return NULL;
+ return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate);
}
-static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
+void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+ struct smbdirect_buffer_descriptor_v1 *v1)
{
- struct smbdirect_mr_io *smbdirect_mr;
- struct ib_cqe *cqe;
-
- cqe = wc->wr_cqe;
- smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
- smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
- if (wc->status != IB_WC_SUCCESS) {
- log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
- smbdirect_mr->state = SMBDIRECT_MR_ERROR;
- }
- complete(&smbdirect_mr->invalidate_done);
+ smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}
/*
@@ -3053,300 +541,20 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
*/
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
- struct smbdirect_socket *sc = mr->socket;
-
- mutex_lock(&mr->mutex);
- if (mr->state == SMBDIRECT_MR_DISABLED)
- goto put_kref;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- smbd_mr_disable_locked(mr);
- goto put_kref;
- }
-
- if (mr->need_invalidate) {
- struct ib_send_wr *wr = &mr->inv_wr;
- int rc;
-
- /* Need to finish local invalidation before returning */
- wr->opcode = IB_WR_LOCAL_INV;
- mr->cqe.done = local_inv_done;
- wr->wr_cqe = &mr->cqe;
- wr->num_sge = 0;
- wr->ex.invalidate_rkey = mr->mr->rkey;
- wr->send_flags = IB_SEND_SIGNALED;
-
- init_completion(&mr->invalidate_done);
- rc = ib_post_send(sc->ib.qp, wr, NULL);
- if (rc) {
- log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
- smbd_mr_disable_locked(mr);
- smbd_disconnect_rdma_connection(sc);
- goto done;
- }
- wait_for_completion(&mr->invalidate_done);
- mr->need_invalidate = false;
- } else
- /*
- * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
- * and defer to mr_recovery_work to recover the MR for next use
- */
- mr->state = SMBDIRECT_MR_INVALIDATED;
-
- if (mr->sgt.nents) {
- ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
- mr->sgt.nents = 0;
- }
-
- if (mr->state == SMBDIRECT_MR_INVALIDATED) {
- mr->state = SMBDIRECT_MR_READY;
- if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
- wake_up(&sc->mr_io.ready.wait_queue);
- } else
- /*
- * Schedule the work to do MR recovery for future I/Os. MR
- * recovery is slow and we don't want it to block the current I/O
- */
- queue_work(sc->workqueue, &sc->mr_io.recovery_work);
-
-done:
- if (atomic_dec_and_test(&sc->mr_io.used.count))
- wake_up(&sc->mr_io.cleanup.wait_queue);
-
-put_kref:
- /*
- * No kref_put_mutex() as it's already locked.
- *
- * If smbd_mr_free_locked() is called
- * and the mutex is unlocked and mr is gone,
- * in that case kref_put() returned 1.
- *
- * If kref_put() returned 0 we know that
- * smbd_mr_free_locked() didn't
- * run. Not by us nor by anyone else, as we
- * still hold the mutex, so we need to unlock
- * and keep the mr in SMBDIRECT_MR_READY or
- * SMBDIRECT_MR_ERROR state.
- */
- if (!kref_put(&mr->kref, smbd_mr_free_locked))
- mutex_unlock(&mr->mutex);
+ smbdirect_connection_deregister_mr_io(mr);
}
-static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
- struct page *lowest_page, size_t off, size_t len)
+void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
{
- struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
- u64 addr;
-
- addr = ib_dma_map_page(rdma->device, lowest_page,
- off, len, rdma->direction);
- if (ib_dma_mapping_error(rdma->device, addr))
- return false;
-
- sge->addr = addr;
- sge->length = len;
- sge->lkey = rdma->local_dma_lkey;
- rdma->nr_sge++;
- return true;
-}
-
-/*
- * Extract page fragments from a BVEC-class iterator and add them to an RDMA
- * element list. The pages are not pinned.
- */
-static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
- struct smb_extract_to_rdma *rdma,
- ssize_t maxsize)
-{
- const struct bio_vec *bv = iter->bvec;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- ssize_t ret = 0;
-
- for (i = 0; i < iter->nr_segs; i++) {
- size_t off, len;
-
- len = bv[i].bv_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- len = min_t(size_t, maxsize, len - start);
- off = bv[i].bv_offset + start;
-
- if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
- return -EIO;
-
- ret += len;
- maxsize -= len;
- if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
- break;
- start = 0;
- }
-
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
-}
-
-/*
- * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
- * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
- * The pages are not pinned.
- */
-static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
- struct smb_extract_to_rdma *rdma,
- ssize_t maxsize)
-{
- const struct kvec *kv = iter->kvec;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- ssize_t ret = 0;
-
- for (i = 0; i < iter->nr_segs; i++) {
- struct page *page;
- unsigned long kaddr;
- size_t off, len, seg;
-
- len = kv[i].iov_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- kaddr = (unsigned long)kv[i].iov_base + start;
- off = kaddr & ~PAGE_MASK;
- len = min_t(size_t, maxsize, len - start);
- kaddr &= PAGE_MASK;
-
- maxsize -= len;
- do {
- seg = min_t(size_t, len, PAGE_SIZE - off);
-
- if (is_vmalloc_or_module_addr((void *)kaddr))
- page = vmalloc_to_page((void *)kaddr);
- else
- page = virt_to_page((void *)kaddr);
-
- if (!smb_set_sge(rdma, page, off, seg))
- return -EIO;
-
- ret += seg;
- len -= seg;
- kaddr += PAGE_SIZE;
- off = 0;
- } while (len > 0 && rdma->nr_sge < rdma->max_sge);
-
- if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
- break;
- start = 0;
- }
-
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
-}
-
-/*
- * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
- * list. The folios are not pinned.
- */
-static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
- struct smb_extract_to_rdma *rdma,
- ssize_t maxsize)
-{
- const struct folio_queue *folioq = iter->folioq;
- unsigned int slot = iter->folioq_slot;
- ssize_t ret = 0;
- size_t offset = iter->iov_offset;
-
- BUG_ON(!folioq);
-
- if (slot >= folioq_nr_slots(folioq)) {
- folioq = folioq->next;
- if (WARN_ON_ONCE(!folioq))
- return -EIO;
- slot = 0;
- }
-
- do {
- struct folio *folio = folioq_folio(folioq, slot);
- size_t fsize = folioq_folio_size(folioq, slot);
-
- if (offset < fsize) {
- size_t part = umin(maxsize, fsize - offset);
-
- if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
- return -EIO;
-
- offset += part;
- ret += part;
- maxsize -= part;
- }
-
- if (offset >= fsize) {
- offset = 0;
- slot++;
- if (slot >= folioq_nr_slots(folioq)) {
- if (!folioq->next) {
- WARN_ON_ONCE(ret < iter->count);
- break;
- }
- folioq = folioq->next;
- slot = 0;
- }
- }
- } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
-
- iter->folioq = folioq;
- iter->folioq_slot = slot;
- iter->iov_offset = offset;
- iter->count -= ret;
- return ret;
-}
-
-/*
- * Extract page fragments from up to the given amount of the source iterator
- * and build up an RDMA list that refers to all of those bits. The RDMA list
- * is appended to, up to the maximum number of elements set in the parameter
- * block.
- *
- * The extracted page fragments are not pinned or ref'd in any way; if an
- * IOVEC/UBUF-type iterator is to be used, it should be converted to a
- * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
- * way.
- */
-static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
- struct smb_extract_to_rdma *rdma)
-{
- ssize_t ret;
- int before = rdma->nr_sge;
-
- switch (iov_iter_type(iter)) {
- case ITER_BVEC:
- ret = smb_extract_bvec_to_rdma(iter, rdma, len);
- break;
- case ITER_KVEC:
- ret = smb_extract_kvec_to_rdma(iter, rdma, len);
- break;
- case ITER_FOLIOQ:
- ret = smb_extract_folioq_to_rdma(iter, rdma, len);
- break;
- default:
- WARN_ON_ONCE(1);
- return -EIO;
- }
-
- if (ret < 0) {
- while (rdma->nr_sge > before) {
- struct ib_sge *sge = &rdma->sge[rdma->nr_sge--];
+ if (!server->rdma)
+ return;
- ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
- rdma->direction);
- sge->addr = 0;
- }
+ if (!server->smbd_conn) {
+ seq_puts(m, "\nSMBDirect transport not available");
+ return;
}
- return ret;
+ smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket,
+ server->rdma_readwrite_threshold,
+ m);
}
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index 577d37dbeb8a..0017d5b2de44 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -11,12 +11,8 @@
#define cifs_rdma_enabled(server) ((server)->rdma)
#include "cifsglob.h"
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_cm.h>
-#include <linux/mempool.h>
#include "../common/smbdirect/smbdirect.h"
-#include "../common/smbdirect/smbdirect_socket.h"
extern int rdma_readwrite_threshold;
extern int smbd_max_frmr_depth;
@@ -27,17 +23,8 @@ extern int smbd_max_send_size;
extern int smbd_send_credit_target;
extern int smbd_receive_credit_max;
-/*
- * The context for the SMBDirect transport
- * Everything related to the transport is here. It has several logical parts
- * 1. RDMA related structures
- * 2. SMBDirect connection parameters
- * 3. Memory registrations
- * 4. Receive and reassembly queues for data receive path
- * 5. mempools for allocating packets
- */
struct smbd_connection {
- struct smbdirect_socket socket;
+ struct smbdirect_socket *socket;
};
/* Create a SMBDirect session */
@@ -60,8 +47,12 @@ int smbd_send(struct TCP_Server_Info *server,
struct smbdirect_mr_io *smbd_register_mr(
struct smbd_connection *info, struct iov_iter *iter,
bool writing, bool need_invalidate);
+void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+ struct smbdirect_buffer_descriptor_v1 *v1);
void smbd_deregister_mr(struct smbdirect_mr_io *mr);
+void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m);
+
#else
#define cifs_rdma_enabled(server) 0
struct smbd_connection {};
diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile
index 9e0730a385fb..e6ee65c31b5d 100644
--- a/fs/smb/common/Makefile
+++ b/fs/smb/common/Makefile
@@ -4,3 +4,4 @@
#
obj-$(CONFIG_SMBFS) += cifs_md4.o
+obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect/
diff --git a/fs/smb/common/smbdirect/Kconfig b/fs/smb/common/smbdirect/Kconfig
new file mode 100644
index 000000000000..a46a2e6ec87a
--- /dev/null
+++ b/fs/smb/common/smbdirect/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# smbdirect configuration
+
+config SMB_COMMON_SMBDIRECT
+ def_tristate n
+ depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+ depends on m || INFINIBAND=y
+ select SG_POOL
diff --git a/fs/smb/common/smbdirect/Makefile b/fs/smb/common/smbdirect/Makefile
new file mode 100644
index 000000000000..423f533e1002
--- /dev/null
+++ b/fs/smb/common/smbdirect/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Makefile for smbdirect support
+#
+
+obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect.o
+
+smbdirect-y := \
+ smbdirect_socket.o \
+ smbdirect_connection.o \
+ smbdirect_mr.o \
+ smbdirect_rw.o \
+ smbdirect_debug.o \
+ smbdirect_connect.o \
+ smbdirect_listen.o \
+ smbdirect_accept.o \
+ smbdirect_devices.o \
+ smbdirect_main.o
diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h
index 821a34c4cc47..bbab5f7f7cc9 100644
--- a/fs/smb/common/smbdirect/smbdirect.h
+++ b/fs/smb/common/smbdirect/smbdirect.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * Copyright (C) 2017, Microsoft Corporation.
- * Copyright (C) 2018, LG Electronics.
+ * Copyright (C) 2025 Stefan Metzmacher
*/
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
@@ -25,12 +24,15 @@ struct smbdirect_buffer_descriptor_v1 {
* Some values are important for the upper layer.
*/
struct smbdirect_socket_parameters {
+ __u64 flags;
+#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1)
+#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2)
__u32 resolve_addr_timeout_msec;
__u32 resolve_route_timeout_msec;
__u32 rdma_connect_timeout_msec;
__u32 negotiate_timeout_msec;
- __u8 initiator_depth;
- __u8 responder_resources;
+ __u16 initiator_depth; /* limited to U8_MAX */
+ __u16 responder_resources; /* limited to U8_MAX */
__u16 recv_credit_max;
__u16 send_credit_target;
__u32 max_send_size;
@@ -43,4 +45,8 @@ struct smbdirect_socket_parameters {
__u32 keepalive_timeout_msec;
} __packed;
+#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \
+ SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \
+ SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
+
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_accept.c b/fs/smb/common/smbdirect/smbdirect_accept.c
new file mode 100644
index 000000000000..d6d5e6a3f5de
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_accept.c
@@ -0,0 +1,857 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include <net/sock.h>
+#include "../../common/smb2status.h"
+
+static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+static int smbdirect_accept_init_params(struct smbdirect_socket *sc);
+static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc);
+
+int smbdirect_accept_connect_request(struct smbdirect_socket *sc,
+ const struct rdma_conn_param *param)
+{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recv_io;
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
+ struct rdma_conn_param conn_param;
+ __be32 ird_ord_hdr[2];
+ int ret;
+
+ if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED))
+ return -EINVAL;
+
+ /*
+ * First set what we as the server are able to support
+ */
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ sc->ib.dev->attrs.max_qp_rd_atom);
+
+ peer_initiator_depth = param->initiator_depth;
+ peer_responder_resources = param->responder_resources;
+ smbdirect_connection_negotiate_rdma_resources(sc,
+ peer_initiator_depth,
+ peer_responder_resources,
+ param);
+
+ ret = smbdirect_accept_init_params(sc);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_accept_init_params() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto init_params_failed;
+ }
+
+ ret = smbdirect_connection_create_qp(sc);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_create_qp() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto create_qp_failed;
+ }
+
+ ret = smbdirect_connection_create_mem_pools(sc);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_create_mem_pools() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto create_mem_failed;
+ }
+
+ recv_io = smbdirect_connection_get_recv_io(sc);
+ if (WARN_ON_ONCE(!recv_io)) {
+ ret = -EINVAL;
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_get_recv_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto get_recv_io_failed;
+ }
+ recv_io->cqe.done = smbdirect_accept_negotiate_recv_done;
+
+ /*
+ * Now post the recv_io buffer in order to get
+ * the negotiate request
+ */
+ sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
+ ret = smbdirect_connection_post_recv_io(recv_io);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_post_recv_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto post_recv_io_failed;
+ }
+ /*
+ * From here recv_io is known to the RDMA QP and needs ib_drain_qp() and
+ * smbdirect_accept_negotiate_recv_done() to clean up...
+ */
+ recv_io = NULL;
+
+ /* already checked with SMBDIRECT_CHECK_STATUS_WARN above */
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+
+ /*
+ * We already negotiated sp->initiator_depth
+ * and sp->responder_resources above.
+ */
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
+
+ if (sc->rdma.legacy_iwarp) {
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
+ conn_param.private_data = ird_ord_hdr;
+ conn_param.private_data_len = sizeof(ird_ord_hdr);
+ } else {
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ }
+ conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY;
+ conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY;
+ conn_param.flow_control = 0;
+
+ /* explicitly set above */
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
+ sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED;
+ sc->rdma.cm_id->event_handler = smbdirect_accept_rdma_event_handler;
+ ret = rdma_accept(sc->rdma.cm_id, &conn_param);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "rdma_accept() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto rdma_accept_failed;
+ }
+
+ /*
+ * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work);
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+
+ return 0;
+
+rdma_accept_failed:
+ /*
+ * smbdirect_connection_destroy_qp() calls ib_drain_qp(),
+ * so that smbdirect_accept_negotiate_recv_done() will
+ * call smbdirect_connection_put_recv_io()
+ */
+post_recv_io_failed:
+ if (recv_io)
+ smbdirect_connection_put_recv_io(recv_io);
+get_recv_io_failed:
+ smbdirect_connection_destroy_mem_pools(sc);
+create_mem_failed:
+ smbdirect_connection_destroy_qp(sc);
+create_qp_failed:
+init_params_failed:
+ return ret;
+}
+
+static int smbdirect_accept_init_params(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ int max_send_sges;
+ unsigned int maxpages;
+
+	/* need 3 more SGEs, because an SMB_DIRECT header, an SMB2 header,
+	 * and an SMB2 response could each be mapped.
+	 */
+ max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
+ if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
+ pr_err("max_send_size %d is too large\n", sp->max_send_size);
+ return -EINVAL;
+ }
+
+ /*
+ * There is only a single batch credit
+ */
+ atomic_set(&sc->send_io.bcredits.count, 1);
+
+ /*
+ * Initialize the local credits to post
+ * IB_WR_SEND[_WITH_INV].
+ */
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
+
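+	/*
+	 * rdma_rw_mr_factor() returns the number of rdma_rw contexts
+	 * (MRs) needed to map a transfer of max_read_write_size,
+	 * num_pages is then the number of pages covered per context.
+	 */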
+ if (sp->max_read_write_size) {
+ maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
+ sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
+ sc->rdma.cm_id->port_num,
+ maxpages);
+ sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
+ /* add one extra in order to handle unaligned pages */
+ sc->rw_io.credits.max += 1;
+ }
+
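+	/*
+	 * Start with a single receive credit, the peer's negotiate
+	 * request will provide the real target during negotiation.
+	 */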
+ sc->recv_io.credits.target = 1;
+
+ atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
+
+ return 0;
+}
+
+static void smbdirect_accept_negotiate_recv_work(struct work_struct *work);
+
+static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_recv_io *recv_io =
+ container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ struct smbdirect_socket *sc = recv_io->socket;
+ unsigned long flags;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ goto error;
+ }
+
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+ /*
+ * This is an internal error!
+ */
+ if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ))
+ goto error;
+
+	/*
+	 * Don't reset the timer to the keepalive interval here,
+	 * this will be done in smbdirect_accept_negotiate_recv_work().
+	 */
+
+ ib_dma_sync_single_for_cpu(sc->ib.dev,
+ recv_io->sge.addr,
+ recv_io->sge.length,
+ DMA_FROM_DEVICE);
+
+	/*
+	 * Only remember recv_io if it has enough bytes;
+	 * this gives smbdirect_accept_negotiate_recv_work()
+	 * enough information to disconnect if the request
+	 * was not valid.
+	 */
+ sc->recv_io.reassembly.full_packet_received = true;
+ if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req))
+ smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0);
+ else
+ smbdirect_connection_put_recv_io(recv_io);
+
+ /*
+ * Some drivers (at least mlx5_ib and irdma) might post a
+ * recv completion before RDMA_CM_EVENT_ESTABLISHED,
+ * we need to adjust our expectation in that case.
+ *
+ * So we defer further processing of the negotiation
+ * to smbdirect_accept_negotiate_recv_work().
+ *
+ * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
+ * we queue the work directly otherwise
+ * smbdirect_accept_rdma_event_handler() will do it, when
+ * RDMA_CM_EVENT_ESTABLISHED arrived.
+ */
+ spin_lock_irqsave(&sc->connect.lock, flags);
+ if (!sc->first_error) {
+ INIT_WORK(&sc->connect.work, smbdirect_accept_negotiate_recv_work);
+ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)
+ queue_work(sc->workqueues.accept, &sc->connect.work);
+ }
+ spin_unlock_irqrestore(&sc->connect.lock, flags);
+
+ return;
+
+error:
+ /*
+ * recv_io.posted.refill_work is still disabled,
+ * so smbdirect_connection_put_recv_io() won't
+ * start it.
+ */
+ smbdirect_connection_put_recv_io(recv_io);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+}
+
+static void smbdirect_accept_negotiate_recv_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, connect.work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recv_io;
+ struct smbdirect_negotiate_req *nreq;
+ unsigned long flags;
+ u16 min_version;
+ u16 max_version;
+ u16 credits_requested;
+ u32 preferred_send_size;
+ u32 max_receive_size;
+ u32 max_fragmented_size;
+ u32 ntstatus;
+
+ if (sc->first_error)
+ return;
+
+ /*
+ * make sure we won't start again...
+ */
+ disable_work(work);
+
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
+ /*
+ * If smbdirect_accept_negotiate_recv_done() detected an
+ * invalid request we want to disconnect.
+ */
+ recv_io = smbdirect_connection_reassembly_first_recv_io(sc);
+ if (!recv_io) {
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.queue_length--;
+ list_del(&recv_io->list);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ smbdirect_connection_put_recv_io(recv_io);
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED))
+ return;
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
+
+	/*
+	 * Note recv_io is already part of the free list,
+	 * as we just called smbdirect_connection_put_recv_io(),
+	 * but it won't be reused before
+	 * smbdirect_connection_recv_io_refill() is called from
+	 * smbdirect_accept_negotiate_finish().
+	 */
+
+ nreq = (struct smbdirect_negotiate_req *)recv_io->packet;
+ min_version = le16_to_cpu(nreq->min_version);
+ max_version = le16_to_cpu(nreq->max_version);
+ credits_requested = le16_to_cpu(nreq->credits_requested);
+ preferred_send_size = le32_to_cpu(nreq->preferred_send_size);
+ max_receive_size = le32_to_cpu(nreq->max_receive_size);
+ max_fragmented_size = le32_to_cpu(nreq->max_fragmented_size);
+
+ smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
+ "ReqIn: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n",
+ "MinVersion=0x",
+ le16_to_cpu(nreq->min_version),
+ "MaxVersion=0x",
+ le16_to_cpu(nreq->max_version),
+ "CreditsRequested=",
+ le16_to_cpu(nreq->credits_requested),
+ "PreferredSendSize=",
+ le32_to_cpu(nreq->preferred_send_size),
+ "MaxRecvSize=",
+ le32_to_cpu(nreq->max_receive_size),
+ "MaxFragmentedSize=",
+ le32_to_cpu(nreq->max_fragmented_size));
+
+ if (!(min_version <= SMBDIRECT_V1 && max_version >= SMBDIRECT_V1)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: min_version=0x%x max_version=0x%x\n",
+ min_version, max_version);
+ ntstatus = le32_to_cpu(STATUS_NOT_SUPPORTED);
+ goto not_supported;
+ }
+
+ if (credits_requested == 0) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: credits_requested == 0\n");
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: max_receive_size=%u < %u\n",
+ max_receive_size,
+ SMBDIRECT_MIN_RECEIVE_SIZE);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: max_fragmented_size=%u < %u\n",
+ max_fragmented_size,
+ SMBDIRECT_MIN_FRAGMENTED_SIZE);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used.
+ */
+ sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size);
+ sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE);
+
+ /*
+ * The maximum fragmented upper-layer payload receive size supported
+ *
+	 * Assume max_payload_per_credit is
+	 * max_recv_size - 24 = 1364 - 24 = 1340
+ *
+ * The maximum number would be
+ * smb_direct_receive_credit_max * max_payload_per_credit
+ *
+ * 1340 * 255 = 341700 (0x536C4)
+ *
+ * The minimum value from the spec is 131072 (0x20000)
+ *
+ * For now we use the logic we used in ksmbd before:
+ * (1364 * 255) / 2 = 173910 (0x2A756)
+ *
+ * We need to adjust this here in case the peer
+ * lowered sp->max_recv_size.
+ *
+ * TODO: instead of adjusting max_fragmented_recv_size
+ * we should adjust the number of available buffers,
+ * but for now we keep the logic as it was used
+ * in ksmbd before.
+ */
+ sp->max_fragmented_recv_size = (sp->recv_credit_max * sp->max_recv_size) / 2;
+
+ /*
+ * We take the value from the peer, which is checked to be higher than 0,
+ * but we limit it to the max value we support in order to have
+ * the main logic simpler.
+ */
+ sc->recv_io.credits.target = credits_requested;
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
+ sp->recv_credit_max);
+
+ /*
+ * Note nreq->max_receive_size was already checked against
+ * SMBDIRECT_MIN_RECEIVE_SIZE above.
+ */
+ sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size);
+
+ /*
+ * Note nreq->max_fragmented_size was already checked against
+ * SMBDIRECT_MIN_FRAGMENTED_SIZE above.
+ */
+ sp->max_fragmented_send_size = max_fragmented_size;
+
+ if (sc->accept.listener) {
+ struct smbdirect_socket *lsc = sc->accept.listener;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lsc->listen.lock, flags);
+ list_del(&sc->accept.list);
+ list_add_tail(&sc->accept.list, &lsc->listen.ready);
+ wake_up(&lsc->listen.wait_queue);
+ spin_unlock_irqrestore(&lsc->listen.lock, flags);
+
+ /*
+ * smbdirect_socket_accept() will call
+ * smbdirect_accept_negotiate_finish(nsc, 0);
+ *
+ * So that we don't send the negotiation
+ * response that grants credits to the peer
+ * before the socket is accepted by the
+ * application.
+ */
+ return;
+ }
+
+ ntstatus = le32_to_cpu(STATUS_SUCCESS);
+
+not_supported:
+ smbdirect_accept_negotiate_finish(sc, ntstatus);
+}
+
+void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recv_io;
+ struct smbdirect_send_io *send_io;
+ struct smbdirect_negotiate_resp *nrep;
+ int posted;
+ u16 new_credits;
+ int ret;
+
+ if (ntstatus)
+ goto not_supported;
+
+ /*
+ * Prepare for receiving data_transfer messages
+ */
+ sc->recv_io.reassembly.full_packet_received = true;
+ sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
+ list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
+ recv_io->cqe.done = smbdirect_connection_recv_io_done;
+ recv_io = NULL;
+
+ /*
+ * We should at least post 1 smbdirect_recv_io!
+ */
+ posted = smbdirect_connection_recv_io_refill(sc);
+ if (posted < 1) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_recv_io_refill() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(posted));
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * The response will grant credits for all posted
+ * smbdirect_recv_io messages.
+ */
+ new_credits = smbdirect_connection_grant_recv_credits(sc);
+
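+	/*
+	 * Note: the success path falls through here as well,
+	 * both cases send a negotiate response below.
+	 */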
+not_supported:
+ send_io = smbdirect_connection_alloc_send_io(sc);
+ if (IS_ERR(send_io)) {
+ ret = PTR_ERR(send_io);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_alloc_send_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return;
+ }
+ send_io->cqe.done = smbdirect_accept_negotiate_send_done;
+
+ nrep = (struct smbdirect_negotiate_resp *)send_io->packet;
+ nrep->min_version = cpu_to_le16(SMBDIRECT_V1);
+ nrep->max_version = cpu_to_le16(SMBDIRECT_V1);
+ if (ntstatus == 0) {
+ nrep->negotiated_version = cpu_to_le16(SMBDIRECT_V1);
+ nrep->reserved = 0;
+ nrep->credits_requested = cpu_to_le16(sp->send_credit_target);
+ nrep->credits_granted = cpu_to_le16(new_credits);
+ nrep->status = cpu_to_le32(ntstatus);
+ nrep->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
+ nrep->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ nrep->max_receive_size = cpu_to_le32(sp->max_recv_size);
+ nrep->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size);
+ } else {
+ nrep->negotiated_version = 0;
+ nrep->reserved = 0;
+ nrep->credits_requested = 0;
+ nrep->credits_granted = 0;
+ nrep->status = cpu_to_le32(ntstatus);
+ nrep->max_readwrite_size = 0;
+ nrep->preferred_send_size = 0;
+ nrep->max_receive_size = 0;
+ nrep->max_fragmented_size = 0;
+ }
+
+ smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
+ "RepOut: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n",
+ "MinVersion=0x",
+ le16_to_cpu(nrep->min_version),
+ "MaxVersion=0x",
+ le16_to_cpu(nrep->max_version),
+ "NegotiatedVersion=0x",
+ le16_to_cpu(nrep->negotiated_version),
+ "CreditsRequested=",
+ le16_to_cpu(nrep->credits_requested),
+ "CreditsGranted=",
+ le16_to_cpu(nrep->credits_granted),
+ "Status=0x",
+ le32_to_cpu(nrep->status),
+ "MaxReadWriteSize=",
+ le32_to_cpu(nrep->max_readwrite_size),
+ "PreferredSendSize=",
+ le32_to_cpu(nrep->preferred_send_size),
+ "MaxRecvSize=",
+ le32_to_cpu(nrep->max_receive_size),
+ "MaxFragmentedSize=",
+ le32_to_cpu(nrep->max_fragmented_size));
+
+ send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+ nrep,
+ sizeof(*nrep),
+ DMA_TO_DEVICE);
+ ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "ib_dma_mapping_error() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_connection_free_send_io(send_io);
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return;
+ }
+
+ send_io->sge[0].length = sizeof(*nrep);
+ send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+ send_io->num_sge = 1;
+
+ ib_dma_sync_single_for_device(sc->ib.dev,
+ send_io->sge[0].addr,
+ send_io->sge[0].length,
+ DMA_TO_DEVICE);
+
+ send_io->wr.next = NULL;
+ send_io->wr.wr_cqe = &send_io->cqe;
+ send_io->wr.sg_list = send_io->sge;
+ send_io->wr.num_sge = send_io->num_sge;
+ send_io->wr.opcode = IB_WR_SEND;
+ send_io->wr.send_flags = IB_SEND_SIGNALED;
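+	/*
+	 * IB_SEND_SIGNALED: we rely on the completion in
+	 * smbdirect_accept_negotiate_send_done() to free send_io.
+	 */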
+
+ ret = smbdirect_connection_post_send_wr(sc, &send_io->wr);
+ if (ret) {
+ /* if we reach here, post send failed */
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_post_send_wr() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ /*
+ * Note smbdirect_connection_free_send_io()
+ * does ib_dma_unmap_page()
+ */
+ smbdirect_connection_free_send_io(send_io);
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return;
+ }
+
+ /*
+ * smbdirect_accept_negotiate_send_done
+ * will do all remaining work...
+ */
+}
+
+static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_send_io *send_io =
+ container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ struct smbdirect_socket *sc = send_io->socket;
+ struct smbdirect_negotiate_resp *nrep;
+ u32 ntstatus;
+
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+ nrep = (struct smbdirect_negotiate_resp *)send_io->packet;
+ ntstatus = le32_to_cpu(nrep->status);
+
+ /* Note this frees wc->wr_cqe, but not wc */
+ smbdirect_connection_free_send_io(send_io);
+ atomic_dec(&sc->send_io.pending.count);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * If we send a smbdirect_negotiate_resp without NT_STATUS_OK (0)
+ * we need to disconnect now.
+ *
+ * Otherwise smbdirect_connection_negotiation_done()
+ * will setup all required things and wake up
+ * the waiter.
+ */
+ if (ntstatus)
+ smbdirect_socket_schedule_cleanup(sc, -EOPNOTSUPP);
+ else
+ smbdirect_connection_negotiation_done(sc);
+}
+
+static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct smbdirect_socket *sc = id->context;
+ unsigned long flags;
+
+	/*
+	 * cma_cm_event_handler() has
+	 * lockdep_assert_held(&id_priv->handler_mutex);
+	 *
+	 * Mutexes are not allowed in interrupts,
+	 * and we rely on not being in an interrupt here,
+	 * as we might sleep.
+	 */
+ WARN_ON_ONCE(in_interrupt());
+
+ if (event->status || event->event != sc->rdma.expected_event) {
+ int ret = -ECONNABORTED;
+
+ if (event->event == RDMA_CM_EVENT_REJECTED)
+ ret = -ECONNREFUSED;
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(sc->rdma.expected_event),
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return 0;
+ }
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "%s (first_error=%1pe) event=%s\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(event->event));
+
+ if (sc->first_error)
+ return 0;
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ESTABLISHED:
+ smbdirect_connection_rdma_established(sc);
+
+ /*
+ * Some drivers (at least mlx5_ib and irdma) might post a
+ * recv completion before RDMA_CM_EVENT_ESTABLISHED,
+ * we need to adjust our expectation in that case.
+ *
+		 * If smbdirect_accept_negotiate_recv_done() was called first,
+		 * it initialized sc->connect.work but left it for us to
+		 * start, so that we move to
+		 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
+		 * smbdirect_accept_negotiate_recv_work() runs.
+		 *
+		 * If smbdirect_accept_negotiate_recv_done() didn't happen
+		 * yet, sc->connect.work is still disabled and
+		 * queue_work() is a no-op.
+ */
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
+ return 0;
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+ spin_lock_irqsave(&sc->connect.lock, flags);
+ if (!sc->first_error)
+ queue_work(sc->workqueues.accept, &sc->connect.work);
+ spin_unlock_irqrestore(&sc->connect.lock, flags);
+
+ /*
+ * wait for smbdirect_accept_negotiate_recv_done()
+ * to get the negotiate request.
+ */
+ return 0;
+
+ default:
+ break;
+ }
+
+ /*
+ * This is an internal error
+ */
+ WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return 0;
+}
+
+static long smbdirect_socket_wait_for_accept(struct smbdirect_socket *lsc, long timeo)
+{
+ long ret;
+
+ ret = wait_event_interruptible_timeout(lsc->listen.wait_queue,
+ !list_empty_careful(&lsc->listen.ready) ||
+ lsc->status != SMBDIRECT_SOCKET_LISTENING ||
+ lsc->first_error,
+ timeo);
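+	/*
+	 * wait_event_interruptible_timeout() returns 0 on timeout,
+	 * -ERESTARTSYS if interrupted and the remaining jiffies
+	 * otherwise.
+	 */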
+ if (lsc->status != SMBDIRECT_SOCKET_LISTENING)
+ return -EINVAL;
+ if (lsc->first_error)
+ return lsc->first_error;
+ if (!ret)
+ ret = -ETIMEDOUT;
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
+ long timeo,
+ struct proto_accept_arg *arg)
+{
+ struct smbdirect_socket *nsc;
+ unsigned long flags;
+
+ if (lsc->status != SMBDIRECT_SOCKET_LISTENING) {
+ arg->err = -EINVAL;
+ return NULL;
+ }
+
+ if (lsc->first_error) {
+ arg->err = lsc->first_error;
+ return NULL;
+ }
+
+ if (list_empty_careful(&lsc->listen.ready)) {
+ int ret;
+
+ if (timeo == 0) {
+ arg->err = -EAGAIN;
+ return NULL;
+ }
+
+ ret = smbdirect_socket_wait_for_accept(lsc, timeo);
+ if (ret) {
+ arg->err = ret;
+ return NULL;
+ }
+ }
+
+ spin_lock_irqsave(&lsc->listen.lock, flags);
+ nsc = list_first_entry_or_null(&lsc->listen.ready,
+ struct smbdirect_socket,
+ accept.list);
+ if (nsc) {
+ nsc->accept.listener = NULL;
+ list_del_init_careful(&nsc->accept.list);
+ arg->is_empty = list_empty_careful(&lsc->listen.ready);
+ }
+ spin_unlock_irqrestore(&lsc->listen.lock, flags);
+ if (!nsc) {
+ arg->err = -EAGAIN;
+ return NULL;
+ }
+
+	/*
+	 * We did not send the negotiation response yet,
+	 * so we did not grant any credits to the client,
+	 * and it didn't grant any credits to us.
+	 *
+	 * The caller expects a connected socket now,
+	 * as there are no credits anyway.
+	 *
+	 * We then send the negotiation response in
+	 * order to grant credits to the peer.
+	 */
+ nsc->status = SMBDIRECT_SOCKET_CONNECTED;
+ smbdirect_accept_negotiate_finish(nsc, 0);
+
+ return nsc;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept);
diff --git a/fs/smb/common/smbdirect/smbdirect_connect.c b/fs/smb/common/smbdirect/smbdirect_connect.c
new file mode 100644
index 000000000000..2b54f79dba43
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_connect.c
@@ -0,0 +1,925 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2012,2016,2017,2025 Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include "../../common/smb2status.h"
+
+static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc);
+static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc,
+ const struct sockaddr *src,
+ const struct sockaddr *dst);
+static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc);
+static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc);
+static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+
+int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst)
+{
+ const struct sockaddr *src = NULL;
+ union {
+ struct sockaddr sa;
+ struct sockaddr_storage ss;
+ } src_addr = {
+ .sa = {
+ .sa_family = AF_UNSPEC,
+ },
+ };
+ int ret;
+
+ if (sc->first_error)
+ return -ENOTCONN;
+
+ if (sc->status != SMBDIRECT_SOCKET_CREATED)
+ return -EALREADY;
+
+ if (WARN_ON_ONCE(!sc->rdma.cm_id))
+ return -EINVAL;
+
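+	/*
+	 * If the caller already bound the cm_id to a local address,
+	 * keep using it as the source address for the connect.
+	 */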
+ src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+ if (src_addr.sa.sa_family != AF_UNSPEC)
+ src = &src_addr.sa;
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "connect: src: %pISpsfc dst: %pISpsfc\n",
+ src, dst);
+
+ ret = smbdirect_connect_setup_connection(sc);
+ if (ret)
+ return ret;
+
+ ret = smbdirect_connect_resolve_addr(sc, src, dst);
+ if (ret)
+ return ret;
+
+ /*
+ * The rest happens async via smbdirect_connect_rdma_event_handler()
+ * the caller will decide to wait or not.
+ */
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect);
+
+static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc)
+{
+ rdma_lock_handler(sc->rdma.cm_id);
+ sc->rdma.cm_id->event_handler = smbdirect_connect_rdma_event_handler;
+ rdma_unlock_handler(sc->rdma.cm_id);
+
+ if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED))
+ return -EINVAL;
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
+
+ return 0;
+}
+
+static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc,
+ const struct sockaddr *src,
+ const struct sockaddr *dst)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct sockaddr *src_addr = NULL;
+ struct sockaddr *dst_addr = NULL;
+ int ret;
+
+ src_addr = (struct sockaddr *)src;
+ if (src_addr && src_addr->sa_family == AF_UNSPEC)
+ src_addr = NULL;
+ dst_addr = (struct sockaddr *)dst;
+
+ if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED))
+ return -EINVAL;
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
+ sc->rdma.expected_event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ ret = rdma_resolve_addr(sc->rdma.cm_id, src_addr, dst_addr,
+ sp->resolve_addr_timeout_msec);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "rdma_resolve_addr() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+static int smbdirect_connect_resolve_route(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ int ret;
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED))
+ return sc->first_error;
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
+ sc->rdma.expected_event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ ret = rdma_resolve_route(sc->rdma.cm_id, sp->resolve_route_timeout_msec);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "rdma_resolve_route() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+static int smbdirect_connect_rdma_connect(struct smbdirect_socket *sc)
+{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct rdma_conn_param conn_param;
+ __be32 ird_ord_hdr[2];
+ int ret;
+
+ sc->ib.dev = sc->rdma.cm_id->device;
+
+ if (!smbdirect_frwr_is_supported(&sc->ib.dev->attrs)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "Fast Registration Work Requests (FRWR) is not supported device %.*s\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
+ sc->ib.dev->attrs.device_cap_flags,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
+ return -EPROTONOSUPPORT;
+ }
+
+ if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
+ !rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num),
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+ return -EPROTONOSUPPORT;
+ }
+ if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW &&
+ !rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num),
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+ return -EPROTONOSUPPORT;
+ }
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "rdma connect: device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+
+ sp->max_frmr_depth = min_t(u32, sp->max_frmr_depth,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
+ sc->mr_io.type = IB_MR_TYPE_MEM_REG;
+ if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
+ sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
+
+ sp->responder_resources = min_t(u8, sp->responder_resources,
+ sc->ib.dev->attrs.max_qp_rd_atom);
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
+ "responder_resources=%d\n",
+ sp->responder_resources);
+
+ ret = smbdirect_connection_create_qp(sc);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_create_qp() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
+
+ /* Need to send IRD/ORD in private data for iWARP */
+ if (rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) {
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
+ conn_param.private_data = ird_ord_hdr;
+ conn_param.private_data_len = sizeof(ird_ord_hdr);
+ } else {
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ }
+
+ conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY;
+ conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY;
+ conn_param.flow_control = 0;
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED))
+ return sc->first_error;
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
+ sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED;
+ ret = rdma_connect_locked(sc->rdma.cm_id, &conn_param);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "rdma_connect_locked() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ /*
+ * start with the rdma connect timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work);
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->rdma_connect_timeout_msec));
+
+ return 0;
+}
+
+static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct smbdirect_socket *sc = id->context;
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
+ int ret;
+
+ /*
+ * cma_cm_event_handler() has
+ * lockdep_assert_held(&id_priv->handler_mutex);
+ *
+ * Mutexes are not allowed in interrupts,
+ * and we rely on not being in an interrupt here,
+ * as we might sleep.
+ *
+ * We didn't timeout so we cancel our idle timer,
+ * it will be scheduled again if needed.
+ */
+ WARN_ON_ONCE(in_interrupt());
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ cancel_delayed_work_sync(&sc->idle.timer_work);
+
+ if (event->status || event->event != sc->rdma.expected_event) {
+ int lvl = SMBDIRECT_LOG_ERR;
+
+ ret = -ECONNABORTED;
+
+ if (event->event == RDMA_CM_EVENT_REJECTED)
+ ret = -ECONNREFUSED;
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+
+ if (ret == -ENODEV)
+ lvl = SMBDIRECT_LOG_INFO;
+
+ smbdirect_log_rdma_event(sc, lvl,
+ "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(sc->rdma.expected_event),
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+
+ smbdirect_socket_schedule_cleanup_lvl(sc,
+ lvl,
+ ret);
+ return 0;
+ }
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "%s (first_error=%1pe) event=%s\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(event->event));
+
+ if (sc->first_error)
+ return 0;
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING))
+ return 0;
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
+
+ ret = smbdirect_connect_resolve_route(sc);
+ if (ret)
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return 0;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING))
+ return 0;
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+
+ ret = smbdirect_connect_rdma_connect(sc);
+ if (ret)
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return 0;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ smbdirect_connection_rdma_established(sc);
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
+ return 0;
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+
+ /*
+ * Here we work around an inconsistency between
+ * iWarp and other devices (at least rxe and irdma using RoCEv2)
+ */
+ if (rdma_protocol_iwarp(id->device, id->port_num)) {
+ /*
+			 * iWarp devices report the peer's values
+			 * from the perspective of the peer here.
+			 * Tested with siw and irdma (in iWarp mode).
+			 * We need to change them to our perspective,
+			 * so we switch the two values.
+ */
+ peer_initiator_depth = event->param.conn.responder_resources;
+ peer_responder_resources = event->param.conn.initiator_depth;
+ } else {
+ /*
+ * Non iWarp devices report the peer's values
+ * already changed to our perspective here.
+ * Tested with rxe and irdma (in roce mode).
+ */
+ peer_initiator_depth = event->param.conn.initiator_depth;
+ peer_responder_resources = event->param.conn.responder_resources;
+ }
+ smbdirect_connection_negotiate_rdma_resources(sc,
+ peer_initiator_depth,
+ peer_responder_resources,
+ &event->param.conn);
+
+ ret = smbdirect_connect_negotiate_start(sc);
+ if (ret)
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return 0;
+
+ default:
+ break;
+ }
+
+ /*
+ * This is an internal error
+ */
+ WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return 0;
+}
+
+static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recv_io = NULL;
+ struct smbdirect_send_io *send_io = NULL;
+ struct smbdirect_negotiate_req *nreq = NULL;
+ int ret;
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED))
+ return sc->first_error;
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
+
+ ret = smbdirect_connection_create_mem_pools(sc);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_create_mem_pools() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto create_mem_pools_failed;
+ }
+
+ /*
+ * There is only a single batch credit
+ */
+ atomic_set(&sc->send_io.bcredits.count, 1);
+
+ /*
+ * Initialize the local credits to post
+ * IB_WR_SEND[_WITH_INV].
+ */
+ atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
+
+ recv_io = smbdirect_connection_get_recv_io(sc);
+ if (WARN_ON_ONCE(!recv_io)) {
+ ret = -EINVAL;
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_get_recv_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto get_recv_io_failed;
+ }
+ recv_io->cqe.done = smbdirect_connect_negotiate_recv_done;
+
+ send_io = smbdirect_connection_alloc_send_io(sc);
+ if (IS_ERR(send_io)) {
+ ret = PTR_ERR(send_io);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_alloc_send_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto alloc_send_io_failed;
+ }
+ send_io->cqe.done = smbdirect_connect_negotiate_send_done;
+
+ nreq = (struct smbdirect_negotiate_req *)send_io->packet;
+ nreq->min_version = cpu_to_le16(SMBDIRECT_V1);
+ nreq->max_version = cpu_to_le16(SMBDIRECT_V1);
+ nreq->reserved = 0;
+ nreq->credits_requested = cpu_to_le16(sp->send_credit_target);
+ nreq->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ nreq->max_receive_size = cpu_to_le32(sp->max_recv_size);
+ nreq->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size);
+
+ smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
+ "ReqOut: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n",
+ "MinVersion=0x",
+ le16_to_cpu(nreq->min_version),
+ "MaxVersion=0x",
+ le16_to_cpu(nreq->max_version),
+ "CreditsRequested=",
+ le16_to_cpu(nreq->credits_requested),
+ "PreferredSendSize=",
+ le32_to_cpu(nreq->preferred_send_size),
+ "MaxRecvSize=",
+ le32_to_cpu(nreq->max_receive_size),
+ "MaxFragmentedSize=",
+ le32_to_cpu(nreq->max_fragmented_size));
+
+ send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+ nreq,
+ sizeof(*nreq),
+ DMA_TO_DEVICE);
+ ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "ib_dma_mapping_error() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto dma_mapping_failed;
+ }
+
+ send_io->sge[0].length = sizeof(*nreq);
+ send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+ send_io->num_sge = 1;
+
+ ib_dma_sync_single_for_device(sc->ib.dev,
+ send_io->sge[0].addr,
+ send_io->sge[0].length,
+ DMA_TO_DEVICE);
+
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
+ "sge addr=0x%llx length=%u lkey=0x%x\n",
+ send_io->sge[0].addr,
+ send_io->sge[0].length,
+ send_io->sge[0].lkey);
+
+ /*
+ * Now post the recv_io buffer in order to get
+ * the negotiate response
+ */
+ sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
+ ret = smbdirect_connection_post_recv_io(recv_io);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_post_recv_io() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto post_recv_io_failed;
+ }
+
+ send_io->wr.next = NULL;
+ send_io->wr.wr_cqe = &send_io->cqe;
+ send_io->wr.sg_list = send_io->sge;
+ send_io->wr.num_sge = send_io->num_sge;
+ send_io->wr.opcode = IB_WR_SEND;
+ send_io->wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = smbdirect_connection_post_send_wr(sc, &send_io->wr);
+ if (ret) {
+ /* if we reach here, post send failed */
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_post_send_wr() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto post_send_wr_failed;
+ }
+
+ /*
+ * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+
+ return 0;
+
+post_send_wr_failed:
+ /*
+ * ib_dma_unmap_single is called in
+ * smbdirect_connection_free_send_io()
+ */
+ smbdirect_connection_free_send_io(send_io);
+ /*
+ * recv_io is given to the rdma layer,
+ * we should not put it even on error
+ * nor call smbdirect_connection_destroy_mem_pools()
+ * it will be cleaned up during disconnect.
+ */
+ return ret;
+
+post_recv_io_failed:
+ /*
+ * ib_dma_unmap_single is called in
+ * smbdirect_connection_free_send_io()
+ */
+dma_mapping_failed:
+ smbdirect_connection_free_send_io(send_io);
+
+alloc_send_io_failed:
+ smbdirect_connection_put_recv_io(recv_io);
+
+get_recv_io_failed:
+ smbdirect_connection_destroy_mem_pools(sc);
+
+create_mem_pools_failed:
+ return ret;
+}
+
+static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_send_io *send_io =
+ container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ struct smbdirect_socket *sc = send_io->socket;
+
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+ /* Note this frees wc->wr_cqe, but not wc */
+ smbdirect_connection_free_send_io(send_io);
+ atomic_dec(&sc->send_io.pending.count);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
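+
+	/*
+	 * On success there is nothing more to do here, the
+	 * negotiate response is handled by
+	 * smbdirect_connect_negotiate_recv_done().
+	 */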
+}
+
+static void smbdirect_connect_negotiate_recv_work(struct work_struct *work);
+
+static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_recv_io *recv_io =
+ container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ struct smbdirect_socket *sc = recv_io->socket;
+ unsigned long flags;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ goto error;
+ }
+
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+ /*
+ * This is an internal error!
+ */
+ if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REP))
+ goto error;
+
+	/*
+	 * Don't reset the timer to the keepalive interval here,
+	 * this will be done in smbdirect_connect_negotiate_recv_work().
+	 */
+
+ ib_dma_sync_single_for_cpu(sc->ib.dev,
+ recv_io->sge.addr,
+ recv_io->sge.length,
+ DMA_FROM_DEVICE);
+
+	/*
+	 * Only remember recv_io if it has enough bytes;
+	 * this gives smbdirect_connect_negotiate_recv_work()
+	 * enough information to disconnect if the response
+	 * was not valid.
+	 */
+ sc->recv_io.reassembly.full_packet_received = true;
+ if (wc->byte_len >= sizeof(struct smbdirect_negotiate_resp))
+ smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0);
+ else
+ smbdirect_connection_put_recv_io(recv_io);
+
+ /*
+ * We continue via the workqueue as we may have
+ * complex work that might sleep.
+ *
+ * So we defer further processing of the negotiation
+ * to smbdirect_connect_negotiate_recv_work().
+ */
+ spin_lock_irqsave(&sc->connect.lock, flags);
+ if (!sc->first_error) {
+ INIT_WORK(&sc->connect.work, smbdirect_connect_negotiate_recv_work);
+ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
+ queue_work(sc->workqueues.connect, &sc->connect.work);
+ }
+ spin_unlock_irqrestore(&sc->connect.lock, flags);
+
+ return;
+
+error:
+ /*
+ * recv_io.posted.refill_work is still disabled,
+ * so smbdirect_connection_put_recv_io() won't
+ * start it.
+ */
+ smbdirect_connection_put_recv_io(recv_io);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+}
+
+static void smbdirect_connect_negotiate_recv_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, connect.work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recv_io;
+ struct smbdirect_negotiate_resp *nrep;
+ unsigned long flags;
+ u16 negotiated_version;
+ u16 credits_requested;
+ u16 credits_granted;
+ u32 status;
+ u32 max_readwrite_size;
+ u32 preferred_send_size;
+ u32 max_receive_size;
+ u32 max_fragmented_size;
+ int posted;
+ int ret;
+
+ if (sc->first_error)
+ return;
+
+ /*
+ * make sure we won't start again...
+ */
+ disable_work(work);
+
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
+ /*
+ * If smbdirect_connect_negotiate_recv_done() detected an
+ * invalid request we want to disconnect.
+ */
+ recv_io = smbdirect_connection_reassembly_first_recv_io(sc);
+ if (!recv_io) {
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.queue_length--;
+ list_del(&recv_io->list);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ smbdirect_connection_put_recv_io(recv_io);
+
+ if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_RUNNING))
+ return;
+
+ /*
+ * Note recv_io is already part of the free list,
+ * as we just called smbdirect_connection_put_recv_io(),
+ * but it won't be reused before we call
+ * smbdirect_connection_recv_io_refill() below.
+ */
+
+ nrep = (struct smbdirect_negotiate_resp *)recv_io->packet;
+ negotiated_version = le16_to_cpu(nrep->negotiated_version);
+ credits_requested = le16_to_cpu(nrep->credits_requested);
+ credits_granted = le16_to_cpu(nrep->credits_granted);
+ status = le32_to_cpu(nrep->status);
+ max_readwrite_size = le32_to_cpu(nrep->max_readwrite_size);
+ preferred_send_size = le32_to_cpu(nrep->preferred_send_size);
+ max_receive_size = le32_to_cpu(nrep->max_receive_size);
+ max_fragmented_size = le32_to_cpu(nrep->max_fragmented_size);
+
+ smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
+ "RepIn: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n",
+ "MinVersion=0x",
+ le16_to_cpu(nrep->min_version),
+ "MaxVersion=0x",
+ le16_to_cpu(nrep->max_version),
+ "NegotiatedVersion=0x",
+ le16_to_cpu(nrep->negotiated_version),
+ "CreditsRequested=",
+ le16_to_cpu(nrep->credits_requested),
+ "CreditsGranted=",
+ le16_to_cpu(nrep->credits_granted),
+ "Status=0x",
+ le32_to_cpu(nrep->status),
+ "MaxReadWriteSize=",
+ le32_to_cpu(nrep->max_readwrite_size),
+ "PreferredSendSize=",
+ le32_to_cpu(nrep->preferred_send_size),
+ "MaxRecvSize=",
+ le32_to_cpu(nrep->max_receive_size),
+ "MaxFragmentedSize=",
+ le32_to_cpu(nrep->max_fragmented_size));
+
+ if (negotiated_version != SMBDIRECT_V1) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: negotiated_version=0x%x\n",
+ negotiated_version);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED);
+ return;
+ }
+
+ if (status != le32_to_cpu(STATUS_SUCCESS)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: status=0x%x != 0x0\n",
+ status);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED);
+ return;
+ }
+
+ if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: max_receive_size=%u < %u\n",
+ max_receive_size,
+ SMBDIRECT_MIN_RECEIVE_SIZE);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: max_fragmented_size=%u < %u\n",
+ max_fragmented_size,
+ SMBDIRECT_MIN_FRAGMENTED_SIZE);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (credits_granted == 0) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: credits_granted == 0\n");
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (credits_requested == 0) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: credits_requested == 0\n");
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ if (preferred_send_size > sp->max_recv_size) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: preferred_send_size=%u > max_recv_size=%u\n",
+ preferred_send_size,
+ sp->max_recv_size);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * We take the value from the peer, which is checked to be higher than 0,
+ * but we limit it to the max value we support in order to have
+ * the main logic simpler.
+ */
+ sc->recv_io.credits.target = credits_requested;
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
+ sp->recv_credit_max);
+
+ /*
+ * At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used.
+ */
+ sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size);
+ sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE);
+
+ /*
+ * We already sent our sp->max_fragmented_recv_size
+ * to the peer, so we can't lower it here any more.
+ *
+ * TODO: but if the peer lowered sp->max_recv_size
+ * we will have to adjust our number of buffers.
+ *
+	 * But for now we keep the logic as it was
+	 * used in cifs.ko before.
+ */
+
+ /*
+ * Note nrep->max_receive_size was already checked against
+ * SMBDIRECT_MIN_RECEIVE_SIZE above.
+ */
+ sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size);
+
+ /*
+ * Make sure the resulting max_frmr_depth is at least 1,
+ * which means max_read_write_size needs to be at least PAGE_SIZE.
+ */
+ sp->max_read_write_size = min_t(u32, sp->max_frmr_depth * PAGE_SIZE,
+ max_readwrite_size);
+ if (sp->max_read_write_size < PAGE_SIZE) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: max_readwrite_size=%u < PAGE_SIZE(%lu)\n",
+ max_readwrite_size,
+ PAGE_SIZE);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+ sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
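+	/*
+	 * Example (assuming 4 KiB pages): with max_frmr_depth = 256
+	 * the cap is 256 * 4096 = 1 MiB, so max_read_write_size ends
+	 * up as min(1 MiB, max_readwrite_size offered by the peer).
+	 */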
+
+ /*
+ * Note nrep->credits_granted was already checked against 0 above.
+ */
+ atomic_set(&sc->send_io.credits.count, credits_granted);
+
+ /*
+ * Note nrep->max_fragmented_size was already checked against
+ * SMBDIRECT_MIN_FRAGMENTED_SIZE above.
+ */
+ sp->max_fragmented_send_size = max_fragmented_size;
+
+ ret = smbdirect_connection_create_mr_list(sc);
+ if (ret) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_create_mr_list() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ return;
+ }
+
+ /*
+ * Prepare for receiving data_transfer messages
+ */
+ sc->recv_io.reassembly.full_packet_received = true;
+ sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
+ list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
+ recv_io->cqe.done = smbdirect_connection_recv_io_done;
+ recv_io = NULL;
+
+ /*
+ * We should at least post 1 smbdirect_recv_io!
+ */
+ posted = smbdirect_connection_recv_io_refill(sc);
+ if (posted < 1) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_recv_io_refill() failed %1pe\n",
+					 SMBDIRECT_DEBUG_ERR_PTR(posted));
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * smbdirect_connection_negotiation_done()
+ * will setup all required things and wake up
+ * the waiter.
+ */
+ smbdirect_connection_negotiation_done(sc);
+}
+
+int smbdirect_connect_sync(struct smbdirect_socket *sc,
+ const struct sockaddr *dst)
+{
+ int ret;
+
+ ret = smbdirect_connect(sc, dst);
+ if (ret) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connect(%pISpsfc) failed %1pe\n",
+ dst, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ ret = smbdirect_connection_wait_for_connected(sc);
+ if (ret) {
+ int lvl = SMBDIRECT_LOG_ERR;
+
+ if (ret == -ENODEV)
+ lvl = SMBDIRECT_LOG_INFO;
+
+ smbdirect_log_rdma_event(sc, lvl,
+ "wait for smbdirect_connect(%pISpsfc) failed %1pe\n",
+ dst, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync);
diff --git a/fs/smb/common/smbdirect/smbdirect_connection.c b/fs/smb/common/smbdirect/smbdirect_connection.c
new file mode 100644
index 000000000000..7e4921b9538c
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_connection.c
@@ -0,0 +1,2181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include <linux/folio_queue.h>
+
+struct smbdirect_map_sges {
+ struct ib_sge *sge;
+ size_t num_sge;
+ size_t max_sge;
+ struct ib_device *device;
+ u32 local_dma_lkey;
+ enum dma_data_direction direction;
+};
+
+static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len,
+ struct smbdirect_map_sges *state);
+
+static void smbdirect_connection_recv_io_refill_work(struct work_struct *work);
+static void smbdirect_connection_send_immediate_work(struct work_struct *work);
+
+static void smbdirect_connection_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct smbdirect_socket *sc = context;
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "%s on device %.*s socket %p (cm_id=%p) status %s first_error %1pe\n",
+ ib_event_msg(event->event),
+ IB_DEVICE_NAME_MAX,
+ event->device->name,
+ sc, sc->rdma.cm_id,
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
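+	/*
+	 * Only fatal CQ/QP events tear down the connection,
+	 * everything else was just logged above.
+	 */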
+ switch (event->event) {
+ case IB_EVENT_CQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int smbdirect_connection_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct smbdirect_socket *sc = id->context;
+ int ret = -ECONNRESET;
+
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+
+ /*
+ * cma_cm_event_handler() has
+ * lockdep_assert_held(&id_priv->handler_mutex);
+ *
+ * Mutexes are not allowed in interrupts,
+ * and we rely on not being in an interrupt here.
+ */
+ WARN_ON_ONCE(in_interrupt());
+
+ if (event->event != sc->rdma.expected_event) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(sc->rdma.expected_event),
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+
+ /*
+ * If we get RDMA_CM_EVENT_DEVICE_REMOVAL,
+ * we should change to SMBDIRECT_SOCKET_DISCONNECTED,
+ * so that rdma_disconnect() is avoided later via
+ * smbdirect_socket_schedule_cleanup[_status]() =>
+ * smbdirect_socket_cleanup_work().
+ *
+ * As otherwise we'd set SMBDIRECT_SOCKET_DISCONNECTING,
+ * but never ever get RDMA_CM_EVENT_DISCONNECTED and
+ * never reach SMBDIRECT_SOCKET_DISCONNECTED.
+ */
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ smbdirect_socket_schedule_cleanup_status(sc,
+ SMBDIRECT_LOG_ERR,
+ ret,
+ SMBDIRECT_SOCKET_DISCONNECTED);
+ else
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ if (sc->ib.qp)
+ ib_drain_qp(sc->ib.qp);
+ return 0;
+ }
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "%s (first_error=%1pe) event=%s\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(event->event));
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_DISCONNECTED:
+ /*
+ * We need to change to SMBDIRECT_SOCKET_DISCONNECTED,
+ * so that rdma_disconnect() is avoided later via
+ * smbdirect_socket_schedule_cleanup_status() =>
+ * smbdirect_socket_cleanup_work().
+ *
+ * As otherwise we'd set SMBDIRECT_SOCKET_DISCONNECTING,
+ * but never ever get RDMA_CM_EVENT_DISCONNECTED and
+ * never reach SMBDIRECT_SOCKET_DISCONNECTED.
+ *
+ * This is also a normal disconnect so
+ * SMBDIRECT_LOG_INFO should be good enough
+ * and avoids spamming the default logs.
+ */
+ smbdirect_socket_schedule_cleanup_status(sc,
+ SMBDIRECT_LOG_INFO,
+ ret,
+ SMBDIRECT_SOCKET_DISCONNECTED);
+ if (sc->ib.qp)
+ ib_drain_qp(sc->ib.qp);
+ return 0;
+
+ default:
+ break;
+ }
+
+ /*
+ * This is an internal error, should be handled above via
+ * event->event != sc->rdma.expected_event already.
+ */
+ WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_DISCONNECTED);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return 0;
+}
+
+void smbdirect_connection_rdma_established(struct smbdirect_socket *sc)
+{
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "rdma established: device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+
+ sc->rdma.cm_id->event_handler = smbdirect_connection_rdma_event_handler;
+ sc->rdma.expected_event = RDMA_CM_EVENT_DISCONNECTED;
+}
+
+void smbdirect_connection_negotiation_done(struct smbdirect_socket *sc)
+{
+ if (unlikely(sc->first_error))
+ return;
+
+ if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
+ /*
+ * This is the accept case where
+ * smbdirect_socket_accept() already sets
+ * SMBDIRECT_SOCKET_CONNECTED
+ */
+ goto done;
+
+ if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) {
+ /*
+ * Something went wrong...
+ */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "status=%s first_error=%1pe local: %pISpsfc remote: %pISpsfc\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+ return;
+ }
+
+ /*
+ * We are done, so we can wake up the waiter.
+ */
+ WARN_ONCE(sc->status == SMBDIRECT_SOCKET_CONNECTED,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
+
+ /*
+ * We need to setup the refill and send immediate work
+ * in order to get a working connection.
+ */
+done:
+ INIT_WORK(&sc->recv_io.posted.refill_work, smbdirect_connection_recv_io_refill_work);
+ INIT_WORK(&sc->idle.immediate_work, smbdirect_connection_send_immediate_work);
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "negotiated: local: %pISpsfc remote: %pISpsfc\n",
+ &sc->rdma.cm_id->route.addr.src_addr,
+ &sc->rdma.cm_id->route.addr.dst_addr);
+
+ wake_up(&sc->status_wait);
+}
+
+static u32 smbdirect_rdma_rw_send_wrs(struct ib_device *dev,
+ const struct ib_qp_init_attr *attr)
+{
+ /*
+ * This could be split out of rdma_rw_init_qp()
+ * and be a helper function next to rdma_rw_mr_factor()
+ *
+ * We can't check unlikely(rdma_rw_force_mr) here,
+ * but that is most likely 0 anyway.
+ */
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the device needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
+ factor += 2; /* inv + reg */
+
+ return factor * attr->cap.max_rdma_ctxs;
+}
+
+int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct ib_qp_init_attr qp_attr;
+ struct ib_qp_cap qp_cap;
+ u32 rdma_send_wr;
+ u32 max_send_wr;
+ int ret;
+
+ /*
+ * Note that {rdma,ib}_create_qp() will call
+ * rdma_rw_init_qp() if max_rdma_ctxs is not 0.
+ * It will adjust max_send_wr to the required
+ * number of additional WRs for the RDMA RW operations.
+ * It will cap max_send_wr to the device limit.
+ *
+	 * We allocate sp->responder_resources * 2 MRs
+	 * and each MR needs WRs for REG and INV, so
+	 * we use '* 4'.
+ *
+ * +1 for ib_drain_qp()
+ */
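+	/*
+	 * Illustrative example (hypothetical values): with
+	 * send_credit_target = 255 and responder_resources = 32 we
+	 * request 255 + 32 * 4 + 1 = 384 send WRs here, before
+	 * rdma_rw_init_qp() adds the WRs for max_rdma_ctxs.
+	 */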
+ memset(&qp_cap, 0, sizeof(qp_cap));
+ qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
+ qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+ qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+ qp_cap.max_inline_data = 0;
+ qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
+
+ /*
+ * Find out the number of max_send_wr
+ * after rdma_rw_init_qp() adjusted it.
+ *
+ * We only do it on a temporary variable,
+ * as rdma_create_qp() will trigger
+ * rdma_rw_init_qp() again.
+ */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.cap = qp_cap;
+ qp_attr.port_num = sc->rdma.cm_id->port_num;
+ rdma_send_wr = smbdirect_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
+ max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
+
+ if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_send_wr %d\n",
+ qp_cap.max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d\n",
+ sp->send_credit_target);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_rdma_ctxs &&
+ (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
+ max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
+ pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
+ rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
+ sp->send_credit_target, qp_cap.max_rdma_ctxs);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
+ qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
+ pr_err("Possible CQE overrun: max_recv_wr %d\n",
+ qp_cap.max_recv_wr);
+ pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
+ pr_err("consider lowering receive_credit_max = %d\n",
+ sp->recv_credit_max);
+ return -EINVAL;
+ }
+
+ if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
+ qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
+ pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+ IB_DEVICE_NAME_MAX,
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_send_sge,
+ sc->ib.dev->attrs.max_recv_sge);
+ return -EINVAL;
+ }
+
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
+ pr_err("Can't create RDMA PD: %1pe\n", sc->ib.pd);
+ ret = PTR_ERR(sc->ib.pd);
+ sc->ib.pd = NULL;
+ return ret;
+ }
+
+ sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ max_send_wr,
+ sc->ib.poll_ctx);
+ if (IS_ERR(sc->ib.send_cq)) {
+ pr_err("Can't create RDMA send CQ: %1pe\n", sc->ib.send_cq);
+ ret = PTR_ERR(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
+ goto err;
+ }
+
+ sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ qp_cap.max_recv_wr,
+ sc->ib.poll_ctx);
+ if (IS_ERR(sc->ib.recv_cq)) {
+ pr_err("Can't create RDMA recv CQ: %1pe\n", sc->ib.recv_cq);
+ ret = PTR_ERR(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
+ goto err;
+ }
+
+ /*
+	 * We reset qp_attr completely here,
+	 * as the above use was just temporary
+	 * to calculate max_send_wr and rdma_send_wr.
+ *
+ * rdma_create_qp() will trigger rdma_rw_init_qp()
+ * again if max_rdma_ctxs is not 0.
+ */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.event_handler = smbdirect_connection_qp_event_handler;
+ qp_attr.qp_context = sc;
+ qp_attr.cap = qp_cap;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = sc->ib.send_cq;
+ qp_attr.recv_cq = sc->ib.recv_cq;
+ qp_attr.port_num = ~0;
+
+ ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
+ if (ret) {
+ pr_err("Can't create RDMA QP: %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto err;
+ }
+ sc->ib.qp = sc->rdma.cm_id->qp;
+
+ return 0;
+err:
+ smbdirect_connection_destroy_qp(sc);
+ return ret;
+}
+
+void smbdirect_connection_destroy_qp(struct smbdirect_socket *sc)
+{
+ if (sc->ib.qp) {
+ ib_drain_qp(sc->ib.qp);
+ sc->ib.qp = NULL;
+ rdma_destroy_qp(sc->rdma.cm_id);
+ }
+ if (sc->ib.recv_cq) {
+ ib_destroy_cq(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
+ }
+ if (sc->ib.send_cq) {
+ ib_destroy_cq(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
+ }
+ if (sc->ib.pd) {
+ ib_dealloc_pd(sc->ib.pd);
+ sc->ib.pd = NULL;
+ }
+}
+
+int smbdirect_connection_create_mem_pools(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ char name[80];
+ size_t i;
+
+ /*
+	 * We use sizeof(struct smbdirect_negotiate_resp) for the
+	 * payload size as it is larger than
+	 * sizeof(struct smbdirect_data_transfer).
+ *
+ * This will fit client and server usage for now.
+ */
+ snprintf(name, sizeof(name), "smbdirect_send_io_cache_%p", sc);
+ struct kmem_cache_args send_io_args = {
+ .align = __alignof__(struct smbdirect_send_io),
+ };
+ sc->send_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_send_io) +
+ sizeof(struct smbdirect_negotiate_resp),
+ &send_io_args,
+ SLAB_HWCACHE_ALIGN);
+ if (!sc->send_io.mem.cache)
+ goto err;
+
+ sc->send_io.mem.pool = mempool_create_slab_pool(sp->send_credit_target,
+ sc->send_io.mem.cache);
+ if (!sc->send_io.mem.pool)
+ goto err;
+
+ /*
+ * A payload size of sp->max_recv_size should fit
+ * any message.
+ *
+ * For smbdirect_data_transfer messages the whole
+ * buffer might be exposed to userspace
+ * (currently on the client side...)
+ * The documentation says data_offset = 0 would be
+ * strange but valid.
+ */
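+	/*
+	 * The useroffset/usersize pair below whitelists only the
+	 * payload part of each object for copies into user buffers
+	 * (hardened usercopy); the leading struct smbdirect_recv_io
+	 * itself stays off limits.
+	 */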
+ snprintf(name, sizeof(name), "smbdirect_recv_io_cache_%p", sc);
+ struct kmem_cache_args recv_io_args = {
+ .align = __alignof__(struct smbdirect_recv_io),
+ .useroffset = sizeof(struct smbdirect_recv_io),
+ .usersize = sp->max_recv_size,
+ };
+ sc->recv_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_recv_io) +
+ sp->max_recv_size,
+ &recv_io_args,
+ SLAB_HWCACHE_ALIGN);
+ if (!sc->recv_io.mem.cache)
+ goto err;
+
+ sc->recv_io.mem.pool = mempool_create_slab_pool(sp->recv_credit_max,
+ sc->recv_io.mem.cache);
+ if (!sc->recv_io.mem.pool)
+ goto err;
+
+ for (i = 0; i < sp->recv_credit_max; i++) {
+ struct smbdirect_recv_io *recv_io;
+
+ recv_io = mempool_alloc(sc->recv_io.mem.pool,
+ sc->recv_io.mem.gfp_mask);
+ if (!recv_io)
+ goto err;
+ recv_io->socket = sc;
+ recv_io->sge.length = 0;
+ list_add_tail(&recv_io->list, &sc->recv_io.free.list);
+ }
+
+ return 0;
+err:
+ smbdirect_connection_destroy_mem_pools(sc);
+ return -ENOMEM;
+}
+
+void smbdirect_connection_destroy_mem_pools(struct smbdirect_socket *sc)
+{
+ struct smbdirect_recv_io *recv_io, *next_io;
+
+ list_for_each_entry_safe(recv_io, next_io, &sc->recv_io.free.list, list) {
+ list_del(&recv_io->list);
+ mempool_free(recv_io, sc->recv_io.mem.pool);
+ }
+
+ /*
+ * Note mempool_destroy() and kmem_cache_destroy()
+ * work fine with a NULL pointer
+ */
+
+ mempool_destroy(sc->recv_io.mem.pool);
+ sc->recv_io.mem.pool = NULL;
+
+ kmem_cache_destroy(sc->recv_io.mem.cache);
+ sc->recv_io.mem.cache = NULL;
+
+ mempool_destroy(sc->send_io.mem.pool);
+ sc->send_io.mem.pool = NULL;
+
+ kmem_cache_destroy(sc->send_io.mem.cache);
+ sc->send_io.mem.cache = NULL;
+}
+
+struct smbdirect_send_io *smbdirect_connection_alloc_send_io(struct smbdirect_socket *sc)
+{
+ struct smbdirect_send_io *msg;
+
+ msg = mempool_alloc(sc->send_io.mem.pool, sc->send_io.mem.gfp_mask);
+ if (!msg)
+ return ERR_PTR(-ENOMEM);
+ msg->socket = sc;
+ INIT_LIST_HEAD(&msg->sibling_list);
+ msg->num_sge = 0;
+
+ return msg;
+}
+
+void smbdirect_connection_free_send_io(struct smbdirect_send_io *msg)
+{
+ struct smbdirect_socket *sc = msg->socket;
+ size_t i;
+
+ /*
+ * The list needs to be empty!
+ * The caller should take care of it.
+ */
+ WARN_ON_ONCE(!list_empty(&msg->sibling_list));
+
+ /*
+ * Note we call ib_dma_unmap_page(), even if some sges are mapped using
+ * ib_dma_map_single().
+ *
+ * The difference between _single() and _page() only matters for the
+ * ib_dma_map_*() case.
+ *
+ * For the ib_dma_unmap_*() case it does not matter as both take the
+ * dma_addr_t and dma_unmap_single_attrs() is just an alias to
+ * dma_unmap_page_attrs().
+ */
+ for (i = 0; i < msg->num_sge; i++)
+ ib_dma_unmap_page(sc->ib.dev,
+ msg->sge[i].addr,
+ msg->sge[i].length,
+ DMA_TO_DEVICE);
+
+ mempool_free(msg, sc->send_io.mem.pool);
+}
+
+struct smbdirect_recv_io *smbdirect_connection_get_recv_io(struct smbdirect_socket *sc)
+{
+ struct smbdirect_recv_io *msg = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ if (likely(!sc->first_error))
+ msg = list_first_entry_or_null(&sc->recv_io.free.list,
+ struct smbdirect_recv_io,
+ list);
+ if (likely(msg)) {
+ list_del(&msg->list);
+ sc->statistics.get_receive_buffer++;
+ }
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
+
+ return msg;
+}
+
+void smbdirect_connection_put_recv_io(struct smbdirect_recv_io *msg)
+{
+ struct smbdirect_socket *sc = msg->socket;
+ unsigned long flags;
+
+ if (likely(msg->sge.length != 0)) {
+ ib_dma_unmap_single(sc->ib.dev,
+ msg->sge.addr,
+ msg->sge.length,
+ DMA_FROM_DEVICE);
+ msg->sge.length = 0;
+ }
+
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ list_add_tail(&msg->list, &sc->recv_io.free.list);
+ sc->statistics.put_receive_buffer++;
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
+
+ queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work);
+}
+
+void smbdirect_connection_reassembly_append_recv_io(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *msg,
+ u32 data_length)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ list_add_tail(&msg->list, &sc->recv_io.reassembly.list);
+ sc->recv_io.reassembly.queue_length++;
+ /*
+ * Make sure reassembly_data_length is updated after list and
+ * reassembly_queue_length are updated. On the dequeue side
+ * reassembly_data_length is checked without a lock to determine
+	 * if reassembly_queue_length and the list are up to date
+ */
+ virt_wmb();
+ sc->recv_io.reassembly.data_length += data_length;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ sc->statistics.enqueue_reassembly_queue++;
+}
+
+struct smbdirect_recv_io *
+smbdirect_connection_reassembly_first_recv_io(struct smbdirect_socket *sc)
+{
+ struct smbdirect_recv_io *msg;
+
+ msg = list_first_entry_or_null(&sc->recv_io.reassembly.list,
+ struct smbdirect_recv_io,
+ list);
+
+ return msg;
+}
+
+void smbdirect_connection_negotiate_rdma_resources(struct smbdirect_socket *sc,
+ u8 peer_initiator_depth,
+ u8 peer_responder_resources,
+ const struct rdma_conn_param *param)
+{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num) &&
+ param->private_data_len == 8) {
+ /*
+ * Legacy clients with only iWarp MPA v1 support
+ * need a private blob in order to negotiate
+ * the IRD/ORD values.
+ */
+ const __be32 *ird_ord_hdr = param->private_data;
+ u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+ u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
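+		/*
+		 * Example: a private blob of
+		 * 00 00 00 10 00 00 00 20 encodes IRD = 16 and
+		 * ORD = 32.
+		 */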
+
+ /*
+ * cifs.ko sends the legacy IRD/ORD negotiation
+		 * blob even if iWarp MPA v2 was used.
+ *
+ * Here we check that the values match and only
+ * mark the client as legacy if they don't match.
+ */
+ if ((u32)param->initiator_depth != ird32 ||
+ (u32)param->responder_resources != ord32) {
+ /*
+			 * There are broken clients (old cifs.ko)
+			 * sending the values in little endian, and
+			 * struct rdma_conn_param only uses u8
+			 * for initiator_depth and responder_resources,
+			 * so we truncate the values to U8_MAX.
+ *
+ * smb_direct_accept_client() will then
+ * do the real negotiation in order to
+ * select the minimum between client and
+ * server.
+ */
+ ird32 = min_t(u32, ird32, U8_MAX);
+ ord32 = min_t(u32, ord32, U8_MAX);
+
+ sc->rdma.legacy_iwarp = true;
+ peer_initiator_depth = (u8)ird32;
+ peer_responder_resources = (u8)ord32;
+ }
+ }
+
+ /*
+	 * Negotiate the values by using the minimum
+	 * between client and server if the peer provided
+	 * non-zero values.
+ */
+ if (peer_initiator_depth != 0)
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ peer_initiator_depth);
+ if (peer_responder_resources != 0)
+ sp->responder_resources = min_t(u8, sp->responder_resources,
+ peer_responder_resources);
+}
+
+bool smbdirect_connection_is_connected(struct smbdirect_socket *sc)
+{
+ if (unlikely(!sc || sc->first_error || sc->status != SMBDIRECT_SOCKET_CONNECTED))
+ return false;
+ return true;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_is_connected);
+
+int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ union {
+ struct sockaddr sa;
+ struct sockaddr_storage ss;
+ } src_addr, dst_addr;
+ const struct sockaddr *src = NULL;
+ const struct sockaddr *dst = NULL;
+ char _devname[IB_DEVICE_NAME_MAX] = { 0, };
+ const char *devname = NULL;
+ int ret;
+
+ if (sc->rdma.cm_id) {
+ src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+ if (src_addr.sa.sa_family != AF_UNSPEC)
+ src = &src_addr.sa;
+ dst_addr.ss = sc->rdma.cm_id->route.addr.dst_addr;
+ if (dst_addr.sa.sa_family != AF_UNSPEC)
+ dst = &dst_addr.sa;
+
+ if (sc->ib.dev) {
+ memcpy(_devname, sc->ib.dev->name, IB_DEVICE_NAME_MAX);
+ devname = _devname;
+ }
+ }
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "waiting for connection: device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX, devname, src, dst);
+
+ ret = wait_event_interruptible_timeout(sc->status_wait,
+ sc->status == SMBDIRECT_SOCKET_CONNECTED ||
+ sc->first_error,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
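+	/*
+	 * wait_event_interruptible_timeout() returns 0 on timeout,
+	 * -ERESTARTSYS if interrupted and the remaining jiffies
+	 * otherwise; ret == 0 is mapped to -ETIMEDOUT below.
+	 */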
+ if (sc->rdma.cm_id) {
+ /*
+ * Maybe src and dev are updated in the meantime.
+ */
+ src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+ if (src_addr.sa.sa_family != AF_UNSPEC)
+ src = &src_addr.sa;
+ dst_addr.ss = sc->rdma.cm_id->route.addr.dst_addr;
+ if (dst_addr.sa.sa_family != AF_UNSPEC)
+ dst = &dst_addr.sa;
+
+ if (sc->ib.dev) {
+ memcpy(_devname, sc->ib.dev->name, IB_DEVICE_NAME_MAX);
+ devname = _devname;
+ }
+ }
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ if (ret < 0)
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ if (sc->first_error) {
+ int lvl = SMBDIRECT_LOG_ERR;
+
+ ret = sc->first_error;
+ if (ret == -ENODEV)
+ lvl = SMBDIRECT_LOG_INFO;
+
+ smbdirect_log_rdma_event(sc, lvl,
+ "connection failed %1pe device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret),
+ IB_DEVICE_NAME_MAX, devname, src, dst);
+ return ret;
+ }
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_wait_for_connected);
+
+void smbdirect_connection_idle_timer_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.timer_work.work);
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+
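+	/*
+	 * Keepalive state machine, as implemented here and in
+	 * smbdirect_connection_request_keep_alive() and
+	 * smbdirect_connection_recv_io_done():
+	 *   NONE -> PENDING     below, an empty message is scheduled
+	 *   PENDING -> SENT     once a message with
+	 *                       SMBDIRECT_FLAG_RESPONSE_REQUESTED is built
+	 *   any -> NONE         when any message is received
+	 *   PENDING/SENT here   means no response in time => -ETIMEDOUT
+	 */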
+ if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
+ smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_ERR,
+ "%s => timeout sc->idle.keepalive=%s\n",
+ smbdirect_socket_status_string(sc->status),
+ sc->idle.keepalive == SMBDIRECT_KEEPALIVE_SENT ?
+ "SENT" : "PENDING");
+ smbdirect_socket_schedule_cleanup(sc, -ETIMEDOUT);
+ return;
+ }
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+ "schedule send of empty idle message\n");
+ queue_work(sc->workqueues.immediate, &sc->idle.immediate_work);
+}
+
+u16 smbdirect_connection_grant_recv_credits(struct smbdirect_socket *sc)
+{
+ int missing;
+ int available;
+ int new_credits;
+
+ if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
+ return 0;
+
+ missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
+ available = atomic_xchg(&sc->recv_io.credits.available, 0);
+ new_credits = min3((int)U16_MAX, missing, available);
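+	/*
+	 * Illustrative example (hypothetical numbers): with
+	 * target = 255, count = 200 and available = 100 we grant
+	 * min3(U16_MAX, 55, 100) = 55 credits and re-add the
+	 * remaining 45 to 'available' below.
+	 */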
+ if (new_credits <= 0) {
+ /*
+ * If credits are available, but not granted
+		 * we need to re-add them.
+ */
+ if (available)
+ atomic_add(available, &sc->recv_io.credits.available);
+ return 0;
+ }
+
+ if (new_credits < available) {
+ /*
+		 * Re-add the remaining available credits.
+ */
+ available -= new_credits;
+ atomic_add(available, &sc->recv_io.credits.available);
+ }
+
+ /*
+ * Remember we granted the credits
+ */
+ atomic_add(new_credits, &sc->recv_io.credits.count);
+ return new_credits;
+}
+
+static bool smbdirect_connection_request_keep_alive(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ return true;
+ }
+
+ return false;
+}
+
+int smbdirect_connection_post_send_wr(struct smbdirect_socket *sc,
+ struct ib_send_wr *wr)
+{
+ int ret;
+
+ if (unlikely(sc->first_error))
+ return sc->first_error;
+
+ atomic_inc(&sc->send_io.pending.count);
+ ret = ib_post_send(sc->ib.qp, wr, NULL);
+ if (ret) {
+ atomic_dec(&sc->send_io.pending.count);
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "ib_post_send() failed %1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ }
+
+ return ret;
+}
+
+static void smbdirect_connection_send_batch_init(struct smbdirect_send_batch *batch,
+ bool need_invalidate_rkey,
+ unsigned int remote_key)
+{
+ INIT_LIST_HEAD(&batch->msg_list);
+ batch->wr_cnt = 0;
+ batch->need_invalidate_rkey = need_invalidate_rkey;
+ batch->remote_key = remote_key;
+ batch->credit = 0;
+}
+
+int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ bool is_last)
+{
+ struct smbdirect_send_io *first, *last;
+ int ret = 0;
+
+ if (list_empty(&batch->msg_list))
+ goto release_credit;
+
+ first = list_first_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+ last = list_last_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+
+ if (batch->need_invalidate_rkey) {
+ first->wr.opcode = IB_WR_SEND_WITH_INV;
+ first->wr.ex.invalidate_rkey = batch->remote_key;
+ batch->need_invalidate_rkey = false;
+ batch->remote_key = 0;
+ }
+
+ last->wr.send_flags = IB_SEND_SIGNALED;
+ last->wr.wr_cqe = &last->cqe;
+
+ /*
+	 * Remove last from batch->msg_list
+	 * and splice the rest of batch->msg_list
+	 * to last->sibling_list.
+	 *
+	 * batch->msg_list is a valid empty list
+ * at the end.
+ */
+ list_del_init(&last->sibling_list);
+ list_splice_tail_init(&batch->msg_list, &last->sibling_list);
+ batch->wr_cnt = 0;
+
+ ret = smbdirect_connection_post_send_wr(sc, &first->wr);
+ if (ret) {
+ struct smbdirect_send_io *sibling, *next;
+
+ list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smbdirect_connection_free_send_io(sibling);
+ }
+ smbdirect_connection_free_send_io(last);
+ }
+
+release_credit:
+ if (is_last && !ret && batch->credit) {
+ atomic_add(batch->credit, &sc->send_io.bcredits.count);
+ batch->credit = 0;
+ wake_up(&sc->send_io.bcredits.wait_queue);
+ }
+
+ return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_batch_flush);
+
+struct smbdirect_send_batch *
+smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
+ bool need_invalidate_rkey,
+ unsigned int remote_key)
+{
+ struct smbdirect_send_batch *batch = (struct smbdirect_send_batch *)storage;
+
+ memset(storage, 0, sizeof(*storage));
+ BUILD_BUG_ON(sizeof(*batch) > sizeof(*storage));
+
+ smbdirect_connection_send_batch_init(batch,
+ need_invalidate_rkey,
+ remote_key);
+
+ return batch;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_init_send_batch_storage);
+
+static int smbdirect_connection_wait_for_send_bcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
+ int ret;
+
+ if (batch->credit)
+ return 0;
+
+ ret = smbdirect_socket_wait_for_credits(sc,
+ SMBDIRECT_SOCKET_CONNECTED,
+ -ENOTCONN,
+ &sc->send_io.bcredits.wait_queue,
+ &sc->send_io.bcredits.count,
+ 1);
+ if (ret)
+ return ret;
+
+ batch->credit = 1;
+ return 0;
+}
+
+static int smbdirect_connection_wait_for_send_lcredit(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
+ if (batch && atomic_read(&sc->send_io.lcredits.count) <= 1) {
+ int ret;
+
+ ret = smbdirect_connection_send_batch_flush(sc, batch, false);
+ if (ret)
+ return ret;
+ }
+
+ return smbdirect_socket_wait_for_credits(sc,
+ SMBDIRECT_SOCKET_CONNECTED,
+ -ENOTCONN,
+ &sc->send_io.lcredits.wait_queue,
+ &sc->send_io.lcredits.count,
+ 1);
+}
+
+static int smbdirect_connection_wait_for_send_credits(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch)
+{
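+	/*
+	 * Flush the batch once 16 WRs are chained (an arbitrary batch
+	 * limit) or when only one send credit is left, so the chain
+	 * is posted before we block waiting for more credits below.
+	 */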
+ if (batch && (batch->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
+ int ret;
+
+ ret = smbdirect_connection_send_batch_flush(sc, batch, false);
+ if (ret)
+ return ret;
+ }
+
+ return smbdirect_socket_wait_for_credits(sc,
+ SMBDIRECT_SOCKET_CONNECTED,
+ -ENOTCONN,
+ &sc->send_io.credits.wait_queue,
+ &sc->send_io.credits.count,
+ 1);
+}
+
+static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc);
+
+static int smbdirect_connection_post_send_io(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ struct smbdirect_send_io *msg)
+{
+ int i;
+
+ for (i = 0; i < msg->num_sge; i++)
+ ib_dma_sync_single_for_device(sc->ib.dev,
+ msg->sge[i].addr, msg->sge[i].length,
+ DMA_TO_DEVICE);
+
+ msg->cqe.done = smbdirect_connection_send_io_done;
+ msg->wr.wr_cqe = &msg->cqe;
+ msg->wr.opcode = IB_WR_SEND;
+ msg->wr.sg_list = &msg->sge[0];
+ msg->wr.num_sge = msg->num_sge;
+ msg->wr.next = NULL;
+
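+	/*
+	 * With a batch the WR is only chained to the previous one via
+	 * wr.next and posted later in
+	 * smbdirect_connection_send_batch_flush(); only the final WR
+	 * of the chain is signaled.
+	 */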
+ if (batch) {
+ msg->wr.send_flags = 0;
+ if (!list_empty(&batch->msg_list)) {
+ struct smbdirect_send_io *last;
+
+ last = list_last_entry(&batch->msg_list,
+ struct smbdirect_send_io,
+ sibling_list);
+ last->wr.next = &msg->wr;
+ }
+ list_add_tail(&msg->sibling_list, &batch->msg_list);
+ batch->wr_cnt++;
+ return 0;
+ }
+
+ msg->wr.send_flags = IB_SEND_SIGNALED;
+ return smbdirect_connection_post_send_wr(sc, &msg->wr);
+}
+
+int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ struct iov_iter *iter,
+ unsigned int flags,
+ u32 remaining_data_length)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_batch _batch;
+ struct smbdirect_send_io *msg;
+ struct smbdirect_data_transfer *packet;
+ size_t header_length;
+ u16 new_credits = 0;
+ u32 data_length = 0;
+ int ret;
+
+ if (WARN_ON_ONCE(flags))
+ return -EINVAL; /* no flags support for now */
+
+ if (iter) {
+ if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE))
+ return -EINVAL; /* It's a bug in upper layer to get there */
+
+ header_length = sizeof(struct smbdirect_data_transfer);
+ if (WARN_ON_ONCE(remaining_data_length == 0 ||
+ iov_iter_count(iter) > remaining_data_length))
+ return -EINVAL;
+ } else {
+ /* If this is a packet without payload, don't send padding */
+ header_length = offsetof(struct smbdirect_data_transfer, padding);
+ if (WARN_ON_ONCE(remaining_data_length))
+ return -EINVAL;
+ }
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_log_write(sc, SMBDIRECT_LOG_ERR,
+ "status=%s first_error=%1pe => %1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+ return -ENOTCONN;
+ }
+
+ if (!batch) {
+ smbdirect_connection_send_batch_init(&_batch, false, 0);
+ batch = &_batch;
+ }
+
+ ret = smbdirect_connection_wait_for_send_bcredit(sc, batch);
+ if (ret)
+ goto bcredit_failed;
+
+ ret = smbdirect_connection_wait_for_send_lcredit(sc, batch);
+ if (ret)
+ goto lcredit_failed;
+
+ ret = smbdirect_connection_wait_for_send_credits(sc, batch);
+ if (ret)
+ goto credit_failed;
+
+ new_credits = smbdirect_connection_grant_recv_credits(sc);
+ if (new_credits == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0 &&
+ atomic_read(&sc->recv_io.credits.count) == 0) {
+ /*
+ * queue the refill work in order to
+ * get some new recv credits we can grant to
+ * the peer.
+ */
+ queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work);
+
+ /*
+		 * wait until either the refill work provided receive
+		 * credits we can grant or the peer granted us new
+		 * send credits
+ */
+ ret = wait_event_interruptible(sc->send_io.credits.wait_queue,
+ atomic_read(&sc->send_io.credits.count) >= 1 ||
+ atomic_read(&sc->recv_io.credits.available) >= 1 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ ret = -ENOTCONN;
+ if (ret < 0)
+ goto credit_failed;
+
+ new_credits = smbdirect_connection_grant_recv_credits(sc);
+ }
+
+ msg = smbdirect_connection_alloc_send_io(sc);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto alloc_failed;
+ }
+
+ /* Map the packet to DMA */
+ msg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+ msg->packet,
+ header_length,
+ DMA_TO_DEVICE);
+ ret = ib_dma_mapping_error(sc->ib.dev, msg->sge[0].addr);
+ if (ret)
+ goto err;
+
+ msg->sge[0].length = header_length;
+ msg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+ msg->num_sge = 1;
+
+ if (iter) {
+ struct smbdirect_map_sges extract = {
+ .num_sge = msg->num_sge,
+ .max_sge = ARRAY_SIZE(msg->sge),
+ .sge = msg->sge,
+ .device = sc->ib.dev,
+ .local_dma_lkey = sc->ib.pd->local_dma_lkey,
+ .direction = DMA_TO_DEVICE,
+ };
+ size_t payload_len = umin(iov_iter_count(iter),
+ sp->max_send_size - sizeof(*packet));
+
+ ret = smbdirect_map_sges_from_iter(iter, payload_len, &extract);
+ if (ret < 0)
+ goto err;
+ data_length = ret;
+ remaining_data_length -= data_length;
+ msg->num_sge = extract.num_sge;
+ }
+
+ /* Fill in the packet header */
+ packet = (struct smbdirect_data_transfer *)msg->packet;
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+ packet->credits_granted = cpu_to_le16(new_credits);
+
+ packet->flags = 0;
+ if (smbdirect_connection_request_keep_alive(sc))
+ packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
+
+ packet->reserved = 0;
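+	/*
+	 * A data_offset of 24 equals the size of the data transfer
+	 * header (see MS-SMBD 2.2.3), i.e. the payload follows the
+	 * header directly, without extra padding.
+	 */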
+ if (!data_length)
+ packet->data_offset = 0;
+ else
+ packet->data_offset = cpu_to_le32(24);
+ packet->data_length = cpu_to_le32(data_length);
+ packet->remaining_data_length = cpu_to_le32(remaining_data_length);
+ packet->padding = 0;
+
+ smbdirect_log_outgoing(sc, SMBDIRECT_LOG_INFO,
+ "DataOut: %s=%u, %s=%u, %s=0x%x, %s=%u, %s=%u, %s=%u\n",
+ "CreditsRequested",
+ le16_to_cpu(packet->credits_requested),
+ "CreditsGranted",
+ le16_to_cpu(packet->credits_granted),
+ "Flags",
+ le16_to_cpu(packet->flags),
+ "RemainingDataLength",
+ le32_to_cpu(packet->remaining_data_length),
+ "DataOffset",
+ le32_to_cpu(packet->data_offset),
+ "DataLength",
+ le32_to_cpu(packet->data_length));
+
+ ret = smbdirect_connection_post_send_io(sc, batch, msg);
+ if (ret)
+ goto err;
+
+ /*
+	 * From here msg is owned by the batch (or already posted)
+	 * and we must not free it explicitly.
+ */
+
+ if (batch == &_batch) {
+ ret = smbdirect_connection_send_batch_flush(sc, batch, true);
+ if (ret)
+ goto flush_failed;
+ }
+
+ return data_length;
+err:
+ smbdirect_connection_free_send_io(msg);
+flush_failed:
+alloc_failed:
+ atomic_inc(&sc->send_io.credits.count);
+credit_failed:
+ atomic_inc(&sc->send_io.lcredits.count);
+lcredit_failed:
+ atomic_add(batch->credit, &sc->send_io.bcredits.count);
+ batch->credit = 0;
+bcredit_failed:
+ return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_single_iter);
+
+int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc)
+{
+ /*
+ * As an optimization, we don't wait for individual I/O to finish
+ * before sending the next one.
+	 * Send them all and wait for the pending send count to reach 0,
+	 * which means all the I/Os have gone out and we are good to return.
+ */
+
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_log_write(sc, SMBDIRECT_LOG_ERR,
+ "status=%s first_error=%1pe => %1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+ return -ENOTCONN;
+ }
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_wait_zero_pending);
+
+int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
+ struct iov_iter *iter,
+ unsigned int flags,
+ bool need_invalidate,
+ unsigned int remote_key)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_batch batch;
+ int total_count = iov_iter_count(iter);
+ int ret;
+ int error = 0;
+ __be32 hdr;
+
+ if (WARN_ONCE(flags, "unexpected flags=0x%x\n", flags))
+ return -EINVAL; /* no flags support for now */
+
+ if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE))
+ return -EINVAL; /* It's a bug in upper layer to get there */
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_log_write(sc, SMBDIRECT_LOG_INFO,
+ "status=%s first_error=%1pe => %1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+ return -ENOTCONN;
+ }
+
+ /*
+ * For now we expect the iter to have the full
+ * message, including a 4 byte length header.
+ */
+ if (iov_iter_count(iter) <= 4)
+ return -EINVAL;
+ if (!copy_from_iter_full(&hdr, sizeof(hdr), iter))
+ return -EFAULT;
+ if (iov_iter_count(iter) != be32_to_cpu(hdr))
+ return -EINVAL;
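+	/*
+	 * Example: an iter carrying 00 00 01 00 followed by 256 bytes
+	 * of SMB2 message passes the checks above; the 4 byte header
+	 * was consumed by copy_from_iter_full() and only the payload
+	 * is transmitted below.
+	 */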
+
+ /*
+ * The size must fit into the negotiated
+ * fragmented send size.
+ */
+ if (iov_iter_count(iter) > sp->max_fragmented_send_size)
+ return -EMSGSIZE;
+
+ smbdirect_log_write(sc, SMBDIRECT_LOG_INFO,
+ "Sending (RDMA): length=%zu\n",
+ iov_iter_count(iter));
+
+ smbdirect_connection_send_batch_init(&batch, need_invalidate, remote_key);
+ while (iov_iter_count(iter)) {
+ ret = smbdirect_connection_send_single_iter(sc,
+ &batch,
+ iter,
+ flags,
+ iov_iter_count(iter));
+ if (unlikely(ret < 0)) {
+ error = ret;
+ break;
+ }
+ }
+
+ ret = smbdirect_connection_send_batch_flush(sc, &batch, true);
+ if (unlikely(ret && !error))
+ error = ret;
+
+ /*
+ * As an optimization, we don't wait for individual I/O to finish
+ * before sending the next one.
+	 * Send them all and wait for the pending send count to reach 0,
+	 * which means all the I/Os have gone out and we are good to return.
+ */
+
+ ret = smbdirect_connection_send_wait_zero_pending(sc);
+ if (unlikely(ret && !error))
+ error = ret;
+
+ if (unlikely(error))
+ return error;
+
+ return total_count;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_iter);
+
+static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_send_io *msg =
+ container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ struct smbdirect_socket *sc = msg->socket;
+ struct smbdirect_send_io *sibling, *next;
+ int lcredits = 0;
+
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+ if (unlikely(!(msg->wr.send_flags & IB_SEND_SIGNALED))) {
+ /*
+		 * This happens when the smbdirect_send_io is a sibling
+		 * before the final message; it is only signaled on
+		 * error, so we need to skip
+		 * smbdirect_connection_free_send_io() here,
+		 * otherwise it would free the memory
+		 * of the siblings too, which would cause
+		 * use-after-free problems for the other completions
+		 * triggered from ib_drain_qp().
+ */
+ if (wc->status != IB_WC_SUCCESS)
+ goto skip_free;
+
+ /*
+ * This should not happen!
+ * But we better just close the
+ * connection...
+ */
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ /*
+ * Free possible siblings and then the main send_io
+ */
+ list_for_each_entry_safe(sibling, next, &msg->sibling_list, sibling_list) {
+ list_del_init(&sibling->sibling_list);
+ smbdirect_connection_free_send_io(sibling);
+ lcredits += 1;
+ }
+ /* Note this frees wc->wr_cqe, but not wc */
+ smbdirect_connection_free_send_io(msg);
+ lcredits += 1;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
+skip_free:
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ return;
+ }
+
+ atomic_add(lcredits, &sc->send_io.lcredits.count);
+ wake_up(&sc->send_io.lcredits.wait_queue);
+
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
+}
+
+static void smbdirect_connection_send_immediate_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.immediate_work);
+ int ret;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+ "send an empty message\n");
+ sc->statistics.send_empty++;
+ ret = smbdirect_connection_send_single_iter(sc, NULL, NULL, 0, 0);
+ if (ret < 0) {
+ smbdirect_log_write(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_send_single_iter ret=%1pe\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ }
+}
+
+int smbdirect_connection_post_recv_io(struct smbdirect_recv_io *msg)
+{
+ struct smbdirect_socket *sc = msg->socket;
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct ib_recv_wr recv_wr = {
+ .wr_cqe = &msg->cqe,
+ .sg_list = &msg->sge,
+ .num_sge = 1,
+ };
+ int ret;
+
+ if (unlikely(sc->first_error))
+ return sc->first_error;
+
+ msg->sge.addr = ib_dma_map_single(sc->ib.dev,
+ msg->packet,
+ sp->max_recv_size,
+ DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(sc->ib.dev, msg->sge.addr);
+ if (ret)
+ return ret;
+
+ msg->sge.length = sp->max_recv_size;
+ msg->sge.lkey = sc->ib.pd->local_dma_lkey;
+
+ ret = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
+ if (ret) {
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+ "ib_post_recv failed ret=%d (%1pe)\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ ib_dma_unmap_single(sc->ib.dev,
+ msg->sge.addr,
+ msg->sge.length,
+ DMA_FROM_DEVICE);
+ msg->sge.length = 0;
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ }
+
+ return ret;
+}
+
+void smbdirect_connection_recv_io_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_recv_io *recv_io =
+ container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ struct smbdirect_socket *sc = recv_io->socket;
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_data_transfer *data_transfer;
+ int current_recv_credits;
+ u16 old_recv_credit_target;
+ u16 credits_requested;
+ u16 credits_granted;
+ u16 flags;
+ u32 data_offset;
+ u32 data_length;
+ u32 remaining_data_length;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s (%d) wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+ goto error;
+ }
+
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
+ "recv_io=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
+ recv_io, sc->recv_io.expected,
+ ib_wc_status_msg(wc->status), wc->opcode,
+ wc->byte_len, wc->pkey_index);
+
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
+ ib_dma_sync_single_for_cpu(sc->ib.dev,
+ recv_io->sge.addr,
+ recv_io->sge.length,
+ DMA_FROM_DEVICE);
+
+ if (unlikely(wc->byte_len <
+ offsetof(struct smbdirect_data_transfer, padding))) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "wc->byte_len=%u < %zu\n",
+ wc->byte_len,
+ offsetof(struct smbdirect_data_transfer, padding));
+ goto error;
+ }
+
+ data_transfer = (struct smbdirect_data_transfer *)recv_io->packet;
+ credits_requested = le16_to_cpu(data_transfer->credits_requested);
+ credits_granted = le16_to_cpu(data_transfer->credits_granted);
+ flags = le16_to_cpu(data_transfer->flags);
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ data_length = le32_to_cpu(data_transfer->data_length);
+
+ smbdirect_log_incoming(sc, SMBDIRECT_LOG_INFO,
+ "DataIn: %s=%u, %s=%u, %s=0x%x, %s=%u, %s=%u, %s=%u\n",
+ "CreditsRequested",
+ credits_requested,
+ "CreditsGranted",
+ credits_granted,
+ "Flags",
+ flags,
+ "RemainingDataLength",
+ remaining_data_length,
+ "DataOffset",
+ data_offset,
+ "DataLength",
+ data_length);
+
+ if (unlikely(credits_requested == 0)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: credits_requested == 0\n");
+ goto error;
+ }
+
+ if (unlikely(data_offset % 8 != 0)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "invalid: data_offset=%u (0x%x) not aligned to 8\n",
+ data_offset, data_offset);
+ goto error;
+ }
+
+ if (unlikely(wc->byte_len < data_offset ||
+ (u64)wc->byte_len < (u64)data_offset + data_length)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "wc->byte_len=%u < data_offset=%u + data_length=%u\n",
+ wc->byte_len, data_offset, data_length);
+ goto error;
+ }
+
+ if (unlikely(remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "remaining_data_length=%u + data_length=%u > max_fragmented=%u\n",
+ remaining_data_length, data_length, sp->max_fragmented_recv_size);
+ goto error;
+ }
+
+ if (data_length) {
+ if (sc->recv_io.reassembly.full_packet_received)
+ recv_io->first_segment = true;
+
+ if (remaining_data_length)
+ sc->recv_io.reassembly.full_packet_received = false;
+ else
+ sc->recv_io.reassembly.full_packet_received = true;
+ }
+
+ atomic_dec(&sc->recv_io.posted.count);
+ current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
+
+ /*
+ * We take the value from the peer, which is checked to be higher than 0,
+	 * but we limit it to the max value we support in order to keep
+	 * the main logic simple.
+ */
+ old_recv_credit_target = sc->recv_io.credits.target;
+ sc->recv_io.credits.target = credits_requested;
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
+ sp->recv_credit_max);
+ if (credits_granted) {
+ atomic_add(credits_granted, &sc->send_io.credits.count);
+ /*
+ * We have new send credits granted from remote peer
+ * If any sender is waiting for credits, unblock it
+ */
+ wake_up(&sc->send_io.credits.wait_queue);
+ }
+
+ /* Send an immediate response right away if requested */
+ if (flags & SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
+ smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+ "schedule send of immediate response\n");
+ queue_work(sc->workqueues.immediate, &sc->idle.immediate_work);
+ }
+
+ /*
+	 * If this is a packet with a data payload, place the data in
+	 * the reassembly queue and wake up the reading thread.
+ */
+ if (data_length) {
+ if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
+ sc->recv_io.credits.target > old_recv_credit_target)
+ queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work);
+
+ smbdirect_connection_reassembly_append_recv_io(sc, recv_io, data_length);
+ wake_up(&sc->recv_io.reassembly.wait_queue);
+ } else
+ smbdirect_connection_put_recv_io(recv_io);
+
+ return;
+
+error:
+ /*
+ * Make sure smbdirect_connection_put_recv_io() does not
+ * start recv_io.posted.refill_work.
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ smbdirect_connection_put_recv_io(recv_io);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+}
+
+int smbdirect_connection_recv_io_refill(struct smbdirect_socket *sc)
+{
+ int missing;
+ int posted = 0;
+
+ if (unlikely(sc->first_error))
+ return sc->first_error;
+
+ /*
+	 * Find out how many smbdirect_recv_io buffers we should post.
+	 *
+	 * Note that sc->recv_io.credits.target is the value
+	 * from the peer and it can in theory change over time,
+	 * but it is forced to be at least 1 and at max
+	 * sp->recv_credit_max.
+	 *
+	 * So it can happen that missing will be lower than 0,
+	 * which means the peer has recently lowered its desired
+	 * target, while we already granted a higher number of credits.
+	 *
+	 * Note 'posted' is the number of smbdirect_recv_io buffers
+	 * posted within this function, while sc->recv_io.posted.count
+	 * is the overall value of posted smbdirect_recv_io buffers.
+	 *
+	 * We try to post as many buffers as are missing, but
+	 * this is limited if a lot of smbdirect_recv_io buffers
+	 * are still in the sc->recv_io.reassembly.list instead of
+	 * the sc->recv_io.free.list.
+	 */
+ missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.posted.count);
+ while (posted < missing) {
+ struct smbdirect_recv_io *recv_io;
+ int ret;
+
+ /*
+ * It's ok if smbdirect_connection_get_recv_io()
+		 * returns NULL; it means the smbdirect_recv_io structures
+		 * are still in the reassembly list.
+ */
+ recv_io = smbdirect_connection_get_recv_io(sc);
+ if (!recv_io)
+ break;
+
+ recv_io->first_segment = false;
+
+ ret = smbdirect_connection_post_recv_io(recv_io);
+ if (ret) {
+ smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_post_recv_io failed rc=%d (%1pe)\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_connection_put_recv_io(recv_io);
+ return ret;
+ }
+
+ atomic_inc(&sc->recv_io.posted.count);
+ posted += 1;
+ }
+
+ /* If nothing was posted we're done */
+ if (posted == 0)
+ return 0;
+
+ atomic_add(posted, &sc->recv_io.credits.available);
+
+ /*
+	 * If a sender holding the last send credit is blocked
+	 * waiting for receive credits it can grant, we need to
+	 * wake it up.
+ */
+ if (atomic_read(&sc->send_io.bcredits.count) == 0 &&
+ atomic_read(&sc->send_io.credits.count) == 0)
+ wake_up(&sc->send_io.credits.wait_queue);
+
+ /*
+ * If we posted at least one smbdirect_recv_io buffer,
+ * we need to inform the peer about it and grant
+ * additional credits.
+ *
+ * However there is one case where we don't want to
+ * do that.
+ *
+ * If only a single credit was missing before
+ * reaching the requested target, we should not
+ * post an immediate send, as that would cause
+ * endless ping pong once a keep alive exchange
+ * is started.
+ *
+ * However if sc->recv_io.credits.target is only 1,
+ * the peer has no credit left and we need to
+ * grant the credit anyway.
+ */
+ if (missing == 1 && sc->recv_io.credits.target != 1)
+ return 0;
+
+ return posted;
+}
+
+static void smbdirect_connection_recv_io_refill_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
+ int posted;
+
+ posted = smbdirect_connection_recv_io_refill(sc);
+ if (unlikely(posted < 0)) {
+ smbdirect_socket_schedule_cleanup(sc, posted);
+ return;
+ }
+ if (posted > 0) {
+ smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+ "schedule send of an empty message\n");
+ queue_work(sc->workqueues.immediate, &sc->idle.immediate_work);
+ }
+}
+
+int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
+ struct msghdr *msg,
+ unsigned int flags)
+{
+ struct smbdirect_recv_io *response;
+ struct smbdirect_data_transfer *data_transfer;
+ size_t size = iov_iter_count(&msg->msg_iter);
+ int to_copy, to_read, data_read, offset;
+ u32 data_length, remaining_data_length, data_offset;
+ int ret;
+
+ if (WARN_ONCE(flags, "unexpected flags=0x%x\n", flags))
+ return -EINVAL; /* no flags support for now */
+
+ if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) != ITER_DEST))
+ return -EINVAL; /* It's a bug in upper layer to get there */
+
+again:
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "status=%s first_error=%1pe => %1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+ return -ENOTCONN;
+ }
+
+ /*
+ * No need to hold the reassembly queue lock all the time as we are
+ * the only one reading from the front of the queue. The transport
+ * may add more entries to the back of the queue at the same time
+ */
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "size=%zd sc->recv_io.reassembly.data_length=%d\n",
+ size, sc->recv_io.reassembly.data_length);
+ if (sc->recv_io.reassembly.data_length >= size) {
+ int queue_length;
+ int queue_removed = 0;
+ unsigned long flags;
+
+ /*
+ * Need to make sure reassembly_data_length is read before
+ * reading reassembly_queue_length and calling
+		 * smbdirect_connection_reassembly_first_recv_io(). This call is lock
+		 * free as we never read the entries at the end of the queue, which
+		 * are updated in SOFTIRQ context as more data is received.
+ */
+ virt_rmb();
+ queue_length = sc->recv_io.reassembly.queue_length;
+ data_read = 0;
+ to_read = size;
+ offset = sc->recv_io.reassembly.first_entry_offset;
+ while (data_read < size) {
+ response = smbdirect_connection_reassembly_first_recv_io(sc);
+ data_transfer = (void *)response->packet;
+ data_length = le32_to_cpu(data_transfer->data_length);
+ remaining_data_length =
+ le32_to_cpu(
+ data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+
+ /*
+ * The upper layer expects RFC1002 length at the
+ * beginning of the payload. Return it to indicate
+			 * the total length of the packet. This minimizes the
+			 * change to the upper layer packet processing logic. This
+			 * will eventually be removed when an intermediate
+			 * transport layer is added.
+ */
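+			/*
+			 * Example: the caller first issues a 4 byte read
+			 * and gets be32(data_length + remaining_data_length)
+			 * back, then reads the actual payload with
+			 * follow-up calls.
+			 */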
+ if (response->first_segment && size == 4) {
+ unsigned int rfc1002_len =
+ data_length + remaining_data_length;
+ __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
+
+ if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
+ &msg->msg_iter) != sizeof(rfc1002_hdr))
+ return -EFAULT;
+ data_read = 4;
+ response->first_segment = false;
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "returning rfc1002 length %d\n",
+ rfc1002_len);
+ goto read_rfc1002_done;
+ }
+
+ to_copy = min_t(int, data_length - offset, to_read);
+ if (copy_to_iter((u8 *)data_transfer + data_offset + offset,
+ to_copy, &msg->msg_iter) != to_copy)
+ return -EFAULT;
+
+ /* move on to the next buffer? */
+ if (to_copy == data_length - offset) {
+ queue_length--;
+ /*
+ * No need to lock if we are not at the
+ * end of the queue
+ */
+ if (queue_length)
+ list_del(&response->list);
+ else {
+ spin_lock_irqsave(
+ &sc->recv_io.reassembly.lock, flags);
+ list_del(&response->list);
+ spin_unlock_irqrestore(
+ &sc->recv_io.reassembly.lock, flags);
+ }
+ queue_removed++;
+ sc->statistics.dequeue_reassembly_queue++;
+ smbdirect_connection_put_recv_io(response);
+ offset = 0;
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "smbdirect_connection_put_recv_io offset=0\n");
+ } else
+ offset += to_copy;
+
+ to_read -= to_copy;
+ data_read += to_copy;
+
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "memcpy %d bytes len-ofs=%u => todo=%u done=%u ofs=%u\n",
+ to_copy, data_length - offset,
+ to_read, data_read, offset);
+ }
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.data_length -= data_read;
+ sc->recv_io.reassembly.queue_length -= queue_removed;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+
+ sc->recv_io.reassembly.first_entry_offset = offset;
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "returning data_read=%d reassembly_length=%d first_ofs=%u\n",
+ data_read, sc->recv_io.reassembly.data_length,
+ sc->recv_io.reassembly.first_entry_offset);
+read_rfc1002_done:
+ return data_read;
+ }
+
+ smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+ "wait_event on more data\n");
+ ret = wait_event_interruptible(sc->recv_io.reassembly.wait_queue,
+ sc->recv_io.reassembly.data_length >= size ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ /* Don't return any data if interrupted */
+ if (ret)
+ return ret;
+
+ goto again;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_recvmsg);
+
+static bool smbdirect_map_sges_single_page(struct smbdirect_map_sges *state,
+ struct page *page, size_t off, size_t len)
+{
+ struct ib_sge *sge;
+ u64 addr;
+
+ if (state->num_sge >= state->max_sge)
+ return false;
+
+ addr = ib_dma_map_page(state->device, page,
+ off, len, state->direction);
+ if (ib_dma_mapping_error(state->device, addr))
+ return false;
+
+ sge = &state->sge[state->num_sge++];
+ sge->addr = addr;
+ sge->length = len;
+ sge->lkey = state->local_dma_lkey;
+
+ return true;
+}
+
+/*
+ * Extract page fragments from a BVEC-class iterator and add them to an ib_sge
+ * list. The pages are not pinned.
+ */
+static ssize_t smbdirect_map_sges_from_bvec(struct iov_iter *iter,
+ struct smbdirect_map_sges *state,
+ ssize_t maxsize)
+{
+ const struct bio_vec *bv = iter->bvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ size_t off, len;
+ bool ok;
+
+ len = bv[i].bv_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ len = min_t(size_t, maxsize, len - start);
+ off = bv[i].bv_offset + start;
+
+ ok = smbdirect_map_sges_single_page(state,
+ bv[i].bv_page,
+ off,
+ len);
+ if (!ok)
+ return -EIO;
+
+ ret += len;
+ maxsize -= len;
+ if (state->num_sge >= state->max_sge || maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
+/*
+ * Extract fragments from a KVEC-class iterator and add them to an ib_sge list.
+ * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
+ * The pages are not pinned.
+ */
+static ssize_t smbdirect_map_sges_from_kvec(struct iov_iter *iter,
+ struct smbdirect_map_sges *state,
+ ssize_t maxsize)
+{
+ const struct kvec *kv = iter->kvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ struct page *page;
+ unsigned long kaddr;
+ size_t off, len, seg;
+
+ len = kv[i].iov_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ kaddr = (unsigned long)kv[i].iov_base + start;
+ off = kaddr & ~PAGE_MASK;
+ len = min_t(size_t, maxsize, len - start);
+ kaddr &= PAGE_MASK;
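+		/*
+		 * Example (4K pages): iov_base = 0x...1ff0 gives
+		 * off = 0xff0, so the first fragment covers at most
+		 * PAGE_SIZE - 0xff0 = 16 bytes and the remainder
+		 * continues at the next page boundary.
+		 */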
+
+ maxsize -= len;
+ do {
+ bool ok;
+
+ seg = min_t(size_t, len, PAGE_SIZE - off);
+
+ if (is_vmalloc_or_module_addr((void *)kaddr))
+ page = vmalloc_to_page((void *)kaddr);
+ else
+ page = virt_to_page((void *)kaddr);
+
+ ok = smbdirect_map_sges_single_page(state, page, off, seg);
+ if (!ok)
+ return -EIO;
+
+ ret += seg;
+ len -= seg;
+ kaddr += PAGE_SIZE;
+ off = 0;
+ } while (len > 0 && state->num_sge < state->max_sge);
+
+ if (state->num_sge >= state->max_sge || maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
+/*
+ * Extract folio fragments from a FOLIOQ-class iterator and add them to an
+ * ib_sge list. The folios are not pinned.
+ */
+static ssize_t smbdirect_map_sges_from_folioq(struct iov_iter *iter,
+ struct smbdirect_map_sges *state,
+ ssize_t maxsize)
+{
+ const struct folio_queue *folioq = iter->folioq;
+ unsigned int slot = iter->folioq_slot;
+ ssize_t ret = 0;
+ size_t offset = iter->iov_offset;
+
+ if (WARN_ON_ONCE(!folioq))
+ return -EIO;
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ if (WARN_ON_ONCE(!folioq))
+ return -EIO;
+ slot = 0;
+ }
+
+ do {
+ struct folio *folio = folioq_folio(folioq, slot);
+ size_t fsize = folioq_folio_size(folioq, slot);
+
+ if (offset < fsize) {
+ size_t part = umin(maxsize, fsize - offset);
+ bool ok;
+
+ ok = smbdirect_map_sges_single_page(state,
+ folio_page(folio, 0),
+ offset,
+ part);
+ if (!ok)
+ return -EIO;
+
+ offset += part;
+ ret += part;
+ maxsize -= part;
+ }
+
+ if (offset >= fsize) {
+ offset = 0;
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ if (!folioq->next) {
+ WARN_ON_ONCE(ret < iter->count);
+ break;
+ }
+ folioq = folioq->next;
+ slot = 0;
+ }
+ }
+ } while (state->num_sge < state->max_sge && maxsize > 0);
+
+ iter->folioq = folioq;
+ iter->folioq_slot = slot;
+ iter->iov_offset = offset;
+ iter->count -= ret;
+ return ret;
+}
+
+/*
+ * Extract page fragments from up to the given amount of the source iterator
+ * and build up an ib_sge list that refers to all of those bits. The ib_sge list
+ * is appended to, up to the maximum number of elements set in the parameter
+ * block.
+ *
+ * The extracted page fragments are not pinned or ref'd in any way; if an
+ * IOVEC/UBUF-type iterator is to be used, it should be converted to a
+ * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
+ * way.
+ */
+static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len,
+ struct smbdirect_map_sges *state)
+{
+ ssize_t ret;
+ size_t before = state->num_sge;
+
+ if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE))
+ return -EIO;
+
+ switch (iov_iter_type(iter)) {
+ case ITER_BVEC:
+ ret = smbdirect_map_sges_from_bvec(iter, state, len);
+ break;
+ case ITER_KVEC:
+ ret = smbdirect_map_sges_from_kvec(iter, state, len);
+ break;
+ case ITER_FOLIOQ:
+ ret = smbdirect_map_sges_from_folioq(iter, state, len);
+ break;
+ default:
+ WARN_ONCE(1, "iov_iter_type[%u]\n", iov_iter_type(iter));
+ return -EIO;
+ }
+
+ if (ret < 0) {
+ while (state->num_sge > before) {
+			/* pre-decrement: the last mapped entry is at num_sge - 1 */
+			struct ib_sge *sge = &state->sge[--state->num_sge];
+
+ ib_dma_unmap_page(state->device,
+ sge->addr,
+ sge->length,
+ state->direction);
+ }
+ }
+
+ return ret;
+}
diff --git a/fs/smb/common/smbdirect/smbdirect_debug.c b/fs/smb/common/smbdirect/smbdirect_debug.c
new file mode 100644
index 000000000000..d8664fd7f71a
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_debug.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include <linux/seq_file.h>
+
+void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
+ unsigned int rdma_readwrite_threshold,
+ struct seq_file *m)
+{
+ const struct smbdirect_socket_parameters *sp;
+
+ if (!sc)
+ return;
+ sp = &sc->parameters;
+
+ seq_puts(m, "\n");
+ seq_printf(m, "SMBDirect protocol version: 0x%x ",
+ SMBDIRECT_V1);
+ seq_printf(m, "transport status: %s (%u)",
+ smbdirect_socket_status_string(sc->status),
+ sc->status);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Conn receive_credit_max: %u ",
+ sp->recv_credit_max);
+ seq_printf(m, "send_credit_target: %u max_send_size: %u",
+ sp->send_credit_target,
+ sp->max_send_size);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Conn max_fragmented_recv_size: %u ",
+ sp->max_fragmented_recv_size);
+ seq_printf(m, "max_fragmented_send_size: %u max_receive_size:%u",
+ sp->max_fragmented_send_size,
+ sp->max_recv_size);
+
+ seq_puts(m, "\n");
+	/* the legacy output is in seconds, the parameter in msecs */
+	seq_printf(m, "Conn keep_alive_interval: %u ",
+		   sp->keepalive_interval_msec / 1000);
+ seq_printf(m, "max_readwrite_size: %u rdma_readwrite_threshold: %u",
+ sp->max_read_write_size,
+ rdma_readwrite_threshold);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Debug count_get_receive_buffer: %llu ",
+ sc->statistics.get_receive_buffer);
+ seq_printf(m, "count_put_receive_buffer: %llu count_send_empty: %llu",
+ sc->statistics.put_receive_buffer,
+ sc->statistics.send_empty);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Read Queue count_enqueue_reassembly_queue: %llu ",
+ sc->statistics.enqueue_reassembly_queue);
+ seq_printf(m, "count_dequeue_reassembly_queue: %llu ",
+ sc->statistics.dequeue_reassembly_queue);
+ seq_printf(m, "reassembly_data_length: %u ",
+ sc->recv_io.reassembly.data_length);
+ seq_printf(m, "reassembly_queue_length: %u",
+ sc->recv_io.reassembly.queue_length);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Current Credits send_credits: %u ",
+ atomic_read(&sc->send_io.credits.count));
+ seq_printf(m, "receive_credits: %u receive_credit_target: %u",
+ atomic_read(&sc->recv_io.credits.count),
+ sc->recv_io.credits.target);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Pending send_pending: %u ",
+ atomic_read(&sc->send_io.pending.count));
+
+ seq_puts(m, "\n");
+ seq_printf(m, "MR responder_resources: %u ",
+ sp->responder_resources);
+ seq_printf(m, "max_frmr_depth: %u mr_type: 0x%x",
+ sp->max_frmr_depth,
+ sc->mr_io.type);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "MR mr_ready_count: %u mr_used_count: %u",
+ atomic_read(&sc->mr_io.ready.count),
+ atomic_read(&sc->mr_io.used.count));
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show);
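+
+/*
+ * Sketch of a caller (hypothetical names): the owning module passes its
+ * seq_file, the socket and its local read/write threshold from a /proc
+ * show callback, e.g.:
+ *
+ *	static int example_proc_show(struct seq_file *m, void *v)
+ *	{
+ *		smbdirect_connection_legacy_debug_proc_show(sc, threshold, m);
+ *		return 0;
+ *	}
+ */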
diff --git a/fs/smb/common/smbdirect/smbdirect_devices.c b/fs/smb/common/smbdirect/smbdirect_devices.c
new file mode 100644
index 000000000000..aaab99e9c045
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_devices.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ * Copyright (c) 2025 Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+static u8 smbdirect_ib_device_rdma_capable_node_type(struct ib_device *ib_dev)
+{
+ if (!smbdirect_frwr_is_supported(&ib_dev->attrs))
+ return RDMA_NODE_UNSPECIFIED;
+
+ switch (ib_dev->node_type) {
+ case RDMA_NODE_IB_CA: /* Infiniband, RoCE v1 and v2 */
+ case RDMA_NODE_RNIC: /* iWarp */
+ return ib_dev->node_type;
+ }
+
+ return RDMA_NODE_UNSPECIFIED;
+}
+
+static int smbdirect_ib_client_add(struct ib_device *ib_dev)
+{
+ u8 node_type = smbdirect_ib_device_rdma_capable_node_type(ib_dev);
+ struct smbdirect_device *sdev;
+ const char *node_str;
+ const char *action;
+ u32 pidx;
+
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ node_str = "IB_CA";
+ action = "added";
+ break;
+ case RDMA_NODE_RNIC:
+ node_str = "RNIC";
+ action = "added";
+ break;
+ case RDMA_NODE_UNSPECIFIED:
+ node_str = "UNSPECIFIED";
+ action = "ignored";
+ break;
+ default:
+ node_str = "UNKNOWN";
+ action = "ignored";
+ node_type = RDMA_NODE_UNSPECIFIED;
+ break;
+ }
+
+ pr_info("ib_dev[%.*s]: %s: %s %s=%u %s=0x%llx %s=0x%llx %s=0x%llx\n",
+ IB_DEVICE_NAME_MAX,
+ ib_dev->name,
+ action,
+ node_str,
+ "max_fast_reg_page_list_len",
+ ib_dev->attrs.max_fast_reg_page_list_len,
+ "device_cap_flags",
+ ib_dev->attrs.device_cap_flags,
+ "kernel_cap_flags",
+ ib_dev->attrs.kernel_cap_flags,
+ "page_size_cap",
+ ib_dev->attrs.page_size_cap);
+
+ if (node_type == RDMA_NODE_UNSPECIFIED)
+ return 0;
+
+ pr_info("ib_dev[%.*s]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u\n",
+ IB_DEVICE_NAME_MAX,
+ ib_dev->name,
+ "num_ports",
+ rdma_end_port(ib_dev),
+ "max_qp_rd_atom",
+ ib_dev->attrs.max_qp_rd_atom,
+ "max_qp_init_rd_atom",
+ ib_dev->attrs.max_qp_init_rd_atom,
+ "max_sgl_rd",
+ ib_dev->attrs.max_sgl_rd,
+ "max_sge_rd",
+ ib_dev->attrs.max_sge_rd,
+ "max_cqe",
+ ib_dev->attrs.max_cqe,
+ "max_qp_wr",
+ ib_dev->attrs.max_qp_wr,
+ "max_send_sge",
+ ib_dev->attrs.max_send_sge,
+ "max_recv_sge",
+ ib_dev->attrs.max_recv_sge);
+
+ rdma_for_each_port(ib_dev, pidx) {
+ const struct ib_port_immutable *ib_pi =
+ ib_port_immutable_read(ib_dev, pidx);
+ u32 core_cap_flags = ib_pi ? ib_pi->core_cap_flags : 0;
+
+ pr_info("ib_dev[%.*s]PORT[%u]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=0x%x\n",
+ IB_DEVICE_NAME_MAX,
+ ib_dev->name,
+ pidx,
+ "iwarp",
+ rdma_protocol_iwarp(ib_dev, pidx),
+ "ib",
+ rdma_protocol_ib(ib_dev, pidx),
+ "roce",
+ rdma_protocol_roce(ib_dev, pidx),
+ "v1",
+ rdma_protocol_roce_eth_encap(ib_dev, pidx),
+ "v2",
+ rdma_protocol_roce_udp_encap(ib_dev, pidx),
+ "core_cap_flags",
+ core_cap_flags);
+ }
+
+ sdev = kzalloc_obj(*sdev);
+ if (!sdev)
+ return -ENOMEM;
+ sdev->ib_dev = ib_dev;
+ snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s",
+ IB_DEVICE_NAME_MAX, ib_dev->name);
+
+ write_lock(&smbdirect_globals.devices.lock);
+ list_add(&sdev->list, &smbdirect_globals.devices.list);
+ write_unlock(&smbdirect_globals.devices.lock);
+
+ return 0;
+}
+
+static void smbdirect_ib_client_remove(struct ib_device *ib_dev, void *client_data)
+{
+ struct smbdirect_device *sdev, *tmp;
+
+ write_lock(&smbdirect_globals.devices.lock);
+ list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) {
+ if (sdev->ib_dev == ib_dev) {
+ list_del(&sdev->list);
+ pr_info("ib_dev[%.*s] removed\n",
+ IB_DEVICE_NAME_MAX, sdev->ib_name);
+ kfree(sdev);
+ break;
+ }
+ }
+ write_unlock(&smbdirect_globals.devices.lock);
+}
+
+static void smbdirect_ib_client_rename(struct ib_device *ib_dev, void *client_data)
+{
+ struct smbdirect_device *sdev;
+
+ write_lock(&smbdirect_globals.devices.lock);
+ list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) {
+ if (sdev->ib_dev == ib_dev) {
+ pr_info("ib_dev[%.*s] renamed to [%.*s]\n",
+ IB_DEVICE_NAME_MAX, sdev->ib_name,
+ IB_DEVICE_NAME_MAX, ib_dev->name);
+ snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s",
+ IB_DEVICE_NAME_MAX, ib_dev->name);
+ break;
+ }
+ }
+ write_unlock(&smbdirect_globals.devices.lock);
+}
+
+static struct ib_client smbdirect_ib_client = {
+ .name = "smbdirect_ib_client",
+ .add = smbdirect_ib_client_add,
+ .remove = smbdirect_ib_client_remove,
+ .rename = smbdirect_ib_client_rename,
+};
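+
+/*
+ * Note: ib_register_client() invokes ->add() for every RDMA device that
+ * already exists at registration time, so the device list gets populated
+ * for pre-existing devices as well as for later hot-plugged ones.
+ */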
+
+static u8 smbdirect_netdev_find_rdma_capable_node_type(struct net_device *netdev)
+{
+ struct smbdirect_device *sdev;
+ u8 node_type = RDMA_NODE_UNSPECIFIED;
+
+ read_lock(&smbdirect_globals.devices.lock);
+ list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) {
+ u32 pi;
+
+ rdma_for_each_port(sdev->ib_dev, pi) {
+ struct net_device *ndev;
+
+ ndev = ib_device_get_netdev(sdev->ib_dev, pi);
+ if (!ndev)
+ continue;
+
+ if (ndev == netdev) {
+ dev_put(ndev);
+ node_type = sdev->ib_dev->node_type;
+ goto out;
+ }
+ dev_put(ndev);
+ }
+ }
+out:
+ read_unlock(&smbdirect_globals.devices.lock);
+
+ if (node_type == RDMA_NODE_UNSPECIFIED) {
+ struct ib_device *ibdev;
+
+ ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
+ if (ibdev) {
+ node_type = smbdirect_ib_device_rdma_capable_node_type(ibdev);
+ ib_device_put(ibdev);
+ }
+ }
+
+ return node_type;
+}
+
+/*
+ * Returns RDMA_NODE_UNSPECIFIED when the netdev has
+ * no support for SMB Direct capable RDMA.
+ *
+ * Otherwise RDMA_NODE_RNIC is returned for iWarp devices
+ * and RDMA_NODE_IB_CA for Infiniband and RoCE (v1 and v2).
+ */
+u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ u8 node_type = RDMA_NODE_UNSPECIFIED;
+
+ node_type = smbdirect_netdev_find_rdma_capable_node_type(netdev);
+ if (node_type != RDMA_NODE_UNSPECIFIED)
+ return node_type;
+
+	/* check if netdev is a bridge or VLAN device */
+	if (netif_is_bridge_master(netdev) ||
+	    (netdev->priv_flags & IFF_802_1Q_VLAN)) {
+		netdev_for_each_lower_dev(netdev, lower_dev, iter) {
+			node_type = smbdirect_netdev_find_rdma_capable_node_type(lower_dev);
+			if (node_type != RDMA_NODE_UNSPECIFIED)
+				return node_type;
+		}
+	}
+
+ /* check if netdev is IPoIB safely without layer violation */
+ if (netdev->type == ARPHRD_INFINIBAND)
+ return RDMA_NODE_IB_CA;
+
+ return RDMA_NODE_UNSPECIFIED;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type);
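+
+/*
+ * Example (sketch): a transport can use this helper to decide whether to
+ * advertise an interface as RDMA capable:
+ *
+ *	u8 nt = smbdirect_netdev_rdma_capable_node_type(netdev);
+ *	bool rdma_capable = (nt != RDMA_NODE_UNSPECIFIED);
+ */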
+
+__init int smbdirect_devices_init(void)
+{
+ int ret;
+
+ rwlock_init(&smbdirect_globals.devices.lock);
+ INIT_LIST_HEAD(&smbdirect_globals.devices.list);
+
+ ret = ib_register_client(&smbdirect_ib_client);
+ if (ret) {
+ pr_crit("failed to ib_register_client: %d %1pe\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+__exit void smbdirect_devices_exit(void)
+{
+ struct smbdirect_device *sdev, *tmp;
+
+ /*
+	 * On exit we just clean up so that
+ * smbdirect_ib_client_remove() won't
+ * print removals of devices.
+ */
+ write_lock(&smbdirect_globals.devices.lock);
+ list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) {
+ list_del(&sdev->list);
+ kfree(sdev);
+ }
+ write_unlock(&smbdirect_globals.devices.lock);
+
+ ib_unregister_client(&smbdirect_ib_client);
+}
diff --git a/fs/smb/common/smbdirect/smbdirect_internal.h b/fs/smb/common/smbdirect/smbdirect_internal.h
new file mode 100644
index 000000000000..30a1b8643657
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_internal.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
+#define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "smbdirect.h"
+#include "smbdirect_pdu.h"
+#include "smbdirect_public.h"
+
+#include <linux/mutex.h>
+
+struct smbdirect_module_state {
+ struct mutex mutex;
+
+ struct {
+ struct workqueue_struct *accept;
+ struct workqueue_struct *connect;
+ struct workqueue_struct *idle;
+ struct workqueue_struct *refill;
+ struct workqueue_struct *immediate;
+ struct workqueue_struct *cleanup;
+ } workqueues;
+
+ struct {
+ rwlock_t lock;
+ struct list_head list;
+ } devices;
+};
+
+extern struct smbdirect_module_state smbdirect_globals;
+
+#include "smbdirect_socket.h"
+
+struct smbdirect_device {
+ struct list_head list;
+ struct ib_device *ib_dev;
+ /*
+ * copy of ib_dev->name,
+ * in order to print renames
+ */
+ char ib_name[IB_DEVICE_NAME_MAX];
+};
+
+int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc);
+
+int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc);
+
+void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
+ const char *macro_name,
+ unsigned int lvl,
+ const char *func,
+ unsigned int line,
+ int error,
+ enum smbdirect_socket_status *force_status);
+#define smbdirect_socket_schedule_cleanup(__sc, __error) \
+ __smbdirect_socket_schedule_cleanup(__sc, \
+ "smbdirect_socket_schedule_cleanup", SMBDIRECT_LOG_ERR, \
+ __func__, __LINE__, __error, NULL)
+#define smbdirect_socket_schedule_cleanup_lvl(__sc, __lvl, __error) \
+ __smbdirect_socket_schedule_cleanup(__sc, \
+ "smbdirect_socket_schedule_cleanup_lvl", __lvl, \
+ __func__, __LINE__, __error, NULL)
+#define smbdirect_socket_schedule_cleanup_status(__sc, __lvl, __error, __status) do { \
+ enum smbdirect_socket_status __force_status = __status; \
+ __smbdirect_socket_schedule_cleanup(__sc, \
+ "smbdirect_socket_schedule_cleanup_status", __lvl, \
+ __func__, __LINE__, __error, &__force_status); \
+} while (0)
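+
+/*
+ * The wrappers above record the call site, so a plain
+ *
+ *	smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ *
+ * logs the caller's __func__ and __LINE__ together with the error and
+ * (for the _status variant) a forced status, which keeps teardown paths
+ * traceable without passing extra arguments around.
+ */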
+
+void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc);
+
+int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc,
+ enum smbdirect_socket_status expected_status,
+ int unexpected_errno,
+ wait_queue_head_t *waitq,
+ atomic_t *total_credits,
+ int needed);
+
+void smbdirect_connection_rdma_established(struct smbdirect_socket *sc);
+
+void smbdirect_connection_negotiation_done(struct smbdirect_socket *sc);
+
+int smbdirect_connection_create_qp(struct smbdirect_socket *sc);
+
+void smbdirect_connection_destroy_qp(struct smbdirect_socket *sc);
+
+int smbdirect_connection_create_mem_pools(struct smbdirect_socket *sc);
+
+void smbdirect_connection_destroy_mem_pools(struct smbdirect_socket *sc);
+
+struct smbdirect_send_io *smbdirect_connection_alloc_send_io(struct smbdirect_socket *sc);
+
+void smbdirect_connection_free_send_io(struct smbdirect_send_io *msg);
+
+struct smbdirect_recv_io *smbdirect_connection_get_recv_io(struct smbdirect_socket *sc);
+
+void smbdirect_connection_put_recv_io(struct smbdirect_recv_io *msg);
+
+void smbdirect_connection_reassembly_append_recv_io(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *msg,
+ u32 data_length);
+
+struct smbdirect_recv_io *
+smbdirect_connection_reassembly_first_recv_io(struct smbdirect_socket *sc);
+
+void smbdirect_connection_negotiate_rdma_resources(struct smbdirect_socket *sc,
+ u8 peer_initiator_depth,
+ u8 peer_responder_resources,
+ const struct rdma_conn_param *param);
+
+void smbdirect_connection_idle_timer_work(struct work_struct *work);
+
+u16 smbdirect_connection_grant_recv_credits(struct smbdirect_socket *sc);
+
+int smbdirect_connection_post_send_wr(struct smbdirect_socket *sc,
+ struct ib_send_wr *wr);
+
+int smbdirect_connection_post_recv_io(struct smbdirect_recv_io *msg);
+
+void smbdirect_connection_recv_io_done(struct ib_cq *cq, struct ib_wc *wc);
+
+int smbdirect_connection_recv_io_refill(struct smbdirect_socket *sc);
+
+int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc);
+
+void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc);
+
+int smbdirect_accept_connect_request(struct smbdirect_socket *sc,
+ const struct rdma_conn_param *param);
+
+void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus);
+
+__init int smbdirect_devices_init(void);
+__exit void smbdirect_devices_exit(void);
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_listen.c b/fs/smb/common/smbdirect/smbdirect_listen.c
new file mode 100644
index 000000000000..05c7902e7020
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_listen.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+
+int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog)
+{
+ int ret;
+
+ if (backlog < 0)
+ return -EINVAL;
+ if (!backlog)
+ backlog = 1; /* use 1 as default for now */
+
+ if (sc->first_error)
+ return -EINVAL;
+
+ if (sc->status != SMBDIRECT_SOCKET_CREATED)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!sc->rdma.cm_id))
+ return -EINVAL;
+
+ if (sc->rdma.cm_id->device)
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "try to listen on addr: %pISpsfc dev: %.*s\n",
+ &sc->rdma.cm_id->route.addr.src_addr,
+ IB_DEVICE_NAME_MAX,
+ sc->rdma.cm_id->device->name);
+ else
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "try to listen on addr: %pISpsfc\n",
+ &sc->rdma.cm_id->route.addr.src_addr);
+
+ /* already checked above */
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_LISTENING;
+ sc->rdma.expected_event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ rdma_lock_handler(sc->rdma.cm_id);
+ sc->rdma.cm_id->event_handler = smbdirect_listen_rdma_event_handler;
+ rdma_unlock_handler(sc->rdma.cm_id);
+
+ ret = rdma_listen(sc->rdma.cm_id, backlog);
+ if (ret) {
+ sc->first_error = ret;
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ if (sc->rdma.cm_id->device)
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "listening failed %1pe on addr: %pISpsfc dev: %.*s\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret),
+ &sc->rdma.cm_id->route.addr.src_addr,
+ IB_DEVICE_NAME_MAX,
+ sc->rdma.cm_id->device->name);
+ else
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "listening failed %1pe on addr: %pISpsfc\n",
+ SMBDIRECT_DEBUG_ERR_PTR(ret),
+ &sc->rdma.cm_id->route.addr.src_addr);
+ return ret;
+ }
+
+ /*
+ * This is a value > 0, checked above,
+	 * so we are able to use sc->listen.backlog == -1
+	 * as an indication that the socket was never
+ * a listener.
+ */
+ sc->listen.backlog = backlog;
+
+ if (sc->rdma.cm_id->device)
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "listening on addr: %pISpsfc dev: %.*s\n",
+ &sc->rdma.cm_id->route.addr.src_addr,
+ IB_DEVICE_NAME_MAX,
+ sc->rdma.cm_id->device->name);
+ else
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "listening on addr: %pISpsfc\n",
+ &sc->rdma.cm_id->route.addr.src_addr);
+
+ /*
+ * The rest happens async via smbdirect_listen_rdma_event_handler()
+ */
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen);
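+
+/*
+ * Sketch of the expected server-side flow (error handling omitted;
+ * "addr" and "timeo" are assumptions for the example):
+ *
+ *	smbdirect_socket_create_kern(net, &lsc);
+ *	smbdirect_socket_bind(lsc, addr);
+ *	smbdirect_socket_listen(lsc, backlog);
+ *	nsc = smbdirect_socket_accept(lsc, timeo, NULL);
+ *
+ * Incoming connect requests are handled asynchronously by
+ * smbdirect_listen_rdma_event_handler() below.
+ */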
+
+static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id,
+ struct rdma_cm_event *event)
+{
+ int ret = -ESTALE;
+
+ /*
+ * This should be replaced before any real work
+ * starts! So it should never be called!
+ */
+
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+ WARN_ONCE(1,
+ "%s should not be called! event=%s status=%d => ret=%1pe\n",
+ __func__,
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return -ESTALE;
+}
+
+static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc,
+ struct rdma_cm_id *new_id,
+ const struct rdma_cm_event *event);
+
+static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *new_id,
+ struct rdma_cm_event *event)
+{
+ struct smbdirect_socket *lsc = new_id->context;
+ int ret;
+
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ new_id->context = NULL;
+ new_id->event_handler = smbdirect_new_rdma_event_handler;
+	} else {
+		new_id = NULL;
+	}
+
+ /*
+ * cma_cm_event_handler() has
+ * lockdep_assert_held(&id_priv->handler_mutex);
+ *
+ * Mutexes are not allowed in interrupts,
+ * and we rely on not being in an interrupt here,
+ * as we might sleep.
+ */
+ WARN_ON_ONCE(in_interrupt());
+
+ if (event->status || event->event != lsc->rdma.expected_event) {
+ ret = -ECONNABORTED;
+
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+ "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+ smbdirect_socket_status_string(lsc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error),
+ rdma_event_msg(lsc->rdma.expected_event),
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+
+ /*
+ * In case of error return it and let the caller
+ * destroy new_id
+ */
+ smbdirect_socket_schedule_cleanup(lsc, ret);
+ return new_id ? ret : 0;
+ }
+
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_INFO,
+ "%s (first_error=%1pe) event=%s\n",
+ smbdirect_socket_status_string(lsc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error),
+ rdma_event_msg(event->event));
+
+ /*
+ * In case of error return it and let the caller
+ * destroy new_id
+ */
+ if (lsc->first_error)
+ return new_id ? lsc->first_error : 0;
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ WARN_ON_ONCE(lsc->status != SMBDIRECT_SOCKET_LISTENING);
+
+ /*
+ * In case of error return it and let the caller
+ * destroy new_id
+ */
+ ret = smbdirect_listen_connect_request(lsc, new_id, event);
+ if (ret)
+ return ret;
+ return 0;
+
+ default:
+ break;
+ }
+
+ /*
+ * This is an internal error
+ */
+ WARN_ON_ONCE(lsc->rdma.expected_event != RDMA_CM_EVENT_CONNECT_REQUEST);
+ smbdirect_socket_schedule_cleanup(lsc, -EINVAL);
+ return 0;
+}
+
+static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc,
+ struct rdma_cm_id *new_id,
+ const struct rdma_cm_event *event)
+{
+ const struct smbdirect_socket_parameters *lsp = &lsc->parameters;
+ struct smbdirect_socket *nsc;
+ unsigned long flags;
+ size_t backlog = max_t(size_t, 1, lsc->listen.backlog);
+ size_t psockets;
+ size_t rsockets;
+ int ret;
+
+ if (!smbdirect_frwr_is_supported(&new_id->device->attrs)) {
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+			"Fast Registration Work Requests (FRWR) not supported by device %.*s\n",
+ IB_DEVICE_NAME_MAX,
+ new_id->device->name);
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+ "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
+ new_id->device->attrs.device_cap_flags,
+ new_id->device->attrs.max_fast_reg_page_list_len);
+ return -EPROTONOSUPPORT;
+ }
+
+ if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
+ !rdma_ib_or_roce(new_id->device, new_id->port_num)) {
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+ "Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ new_id->device->name,
+ rdma_protocol_iwarp(new_id->device, new_id->port_num),
+ &new_id->route.addr.src_addr,
+ &new_id->route.addr.dst_addr);
+ return -EPROTONOSUPPORT;
+ }
+ if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW &&
+ !rdma_protocol_iwarp(new_id->device, new_id->port_num)) {
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+ "Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n",
+ IB_DEVICE_NAME_MAX,
+ new_id->device->name,
+ rdma_ib_or_roce(new_id->device, new_id->port_num),
+ &new_id->route.addr.src_addr,
+ &new_id->route.addr.dst_addr);
+ return -EPROTONOSUPPORT;
+ }
+
+ spin_lock_irqsave(&lsc->listen.lock, flags);
+ psockets = list_count_nodes(&lsc->listen.pending);
+ rsockets = list_count_nodes(&lsc->listen.ready);
+ spin_unlock_irqrestore(&lsc->listen.lock, flags);
+
+ if (psockets > backlog ||
+ rsockets > backlog ||
+ (psockets + rsockets) > backlog) {
+ smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
+ "Backlog[%d][%zu] full pending[%zu] ready[%zu]\n",
+ lsc->listen.backlog, backlog, psockets, rsockets);
+ return -EBUSY;
+ }
+
+ ret = smbdirect_socket_create_accepting(new_id, &nsc);
+ if (ret)
+ goto socket_init_failed;
+
+ nsc->logging = lsc->logging;
+ ret = smbdirect_socket_set_initial_parameters(nsc, &lsc->parameters);
+ if (ret)
+ goto set_params_failed;
+ ret = smbdirect_socket_set_kernel_settings(nsc,
+ lsc->ib.poll_ctx,
+ lsc->send_io.mem.gfp_mask);
+ if (ret)
+ goto set_settings_failed;
+
+ spin_lock_irqsave(&lsc->listen.lock, flags);
+ list_add_tail(&nsc->accept.list, &lsc->listen.pending);
+ nsc->accept.listener = lsc;
+ spin_unlock_irqrestore(&lsc->listen.lock, flags);
+
+ ret = smbdirect_accept_connect_request(nsc, &event->param.conn);
+ if (ret)
+ goto accept_connect_failed;
+
+ return 0;
+
+accept_connect_failed:
+ spin_lock_irqsave(&lsc->listen.lock, flags);
+ list_del_init(&nsc->accept.list);
+ nsc->accept.listener = NULL;
+ spin_unlock_irqrestore(&lsc->listen.lock, flags);
+set_settings_failed:
+set_params_failed:
+ /*
+ * The caller will destroy new_id
+ */
+ nsc->ib.dev = NULL;
+ nsc->rdma.cm_id = NULL;
+ smbdirect_socket_release(nsc);
+socket_init_failed:
+ return ret;
+}
diff --git a/fs/smb/common/smbdirect/smbdirect_main.c b/fs/smb/common/smbdirect/smbdirect_main.c
new file mode 100644
index 000000000000..fe6e8d93c34c
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_main.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include <linux/module.h>
+
+struct smbdirect_module_state smbdirect_globals = {
+ .mutex = __MUTEX_INITIALIZER(smbdirect_globals.mutex),
+};
+
+static __init int smbdirect_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ pr_notice("subsystem loading...\n");
+ mutex_lock(&smbdirect_globals.mutex);
+
+ smbdirect_globals.workqueues.accept = alloc_workqueue("smbdirect-accept",
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.accept == NULL)
+ goto alloc_accept_wq_failed;
+
+ smbdirect_globals.workqueues.connect = alloc_workqueue("smbdirect-connect",
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.connect == NULL)
+ goto alloc_connect_wq_failed;
+
+ smbdirect_globals.workqueues.idle = alloc_workqueue("smbdirect-idle",
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.idle == NULL)
+ goto alloc_idle_wq_failed;
+
+ smbdirect_globals.workqueues.refill = alloc_workqueue("smbdirect-refill",
+ WQ_HIGHPRI |
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.refill == NULL)
+ goto alloc_refill_wq_failed;
+
+ smbdirect_globals.workqueues.immediate = alloc_workqueue("smbdirect-immediate",
+ WQ_HIGHPRI |
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.immediate == NULL)
+ goto alloc_immediate_wq_failed;
+
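+	/*
+	 * Only the cleanup workqueue is tagged WQ_MEM_RECLAIM: socket
+	 * teardown may need to make forward progress while the system is
+	 * under memory pressure, so it gets a rescuer thread.
+	 */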
+ smbdirect_globals.workqueues.cleanup = alloc_workqueue("smbdirect-cleanup",
+ WQ_MEM_RECLAIM |
+ WQ_HIGHPRI |
+ WQ_SYSFS |
+ WQ_PERCPU |
+ WQ_POWER_EFFICIENT,
+ 0);
+ if (smbdirect_globals.workqueues.cleanup == NULL)
+ goto alloc_cleanup_wq_failed;
+
+ ret = smbdirect_devices_init();
+ if (ret)
+ goto devices_init_failed;
+
+ mutex_unlock(&smbdirect_globals.mutex);
+ pr_notice("subsystem loaded\n");
+ return 0;
+
+devices_init_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.cleanup);
+alloc_cleanup_wq_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.immediate);
+alloc_immediate_wq_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.refill);
+alloc_refill_wq_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.idle);
+alloc_idle_wq_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.connect);
+alloc_connect_wq_failed:
+ destroy_workqueue(smbdirect_globals.workqueues.accept);
+alloc_accept_wq_failed:
+ mutex_unlock(&smbdirect_globals.mutex);
+	pr_crit("failed to load: %d (%1pe)\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+}
+
+static __exit void smbdirect_module_exit(void)
+{
+ pr_notice("subsystem unloading...\n");
+ mutex_lock(&smbdirect_globals.mutex);
+
+ smbdirect_devices_exit();
+
+ destroy_workqueue(smbdirect_globals.workqueues.accept);
+ destroy_workqueue(smbdirect_globals.workqueues.connect);
+ destroy_workqueue(smbdirect_globals.workqueues.idle);
+ destroy_workqueue(smbdirect_globals.workqueues.refill);
+ destroy_workqueue(smbdirect_globals.workqueues.immediate);
+ destroy_workqueue(smbdirect_globals.workqueues.cleanup);
+
+ mutex_unlock(&smbdirect_globals.mutex);
+ pr_notice("subsystem unloaded\n");
+}
+
+module_init(smbdirect_module_init);
+module_exit(smbdirect_module_exit);
+
+MODULE_DESCRIPTION("smbdirect subsystem");
+MODULE_LICENSE("GPL");
diff --git a/fs/smb/common/smbdirect/smbdirect_mr.c b/fs/smb/common/smbdirect/smbdirect_mr.c
new file mode 100644
index 000000000000..fa9be8089925
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_mr.c
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+/*
+ * Allocate MRs used for RDMA read/write.
+ * The number of MRs will not exceed the hardware capability in
+ * responder_resources. All MRs are kept in sc->mr_io.all.list. An MR can
+ * be recovered after it's used; recovery is done in
+ * smbdirect_connection_deregister_mr_io(). The content of a list entry
+ * changes as MRs are used and recovered for I/O, but the list links will
+ * not change.
+ */
+int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *mr;
+ int ret;
+ u32 i;
+
+ if (sp->responder_resources == 0) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "responder_resources negotiated as 0\n");
+ return -EINVAL;
+ }
+
+ /* Allocate more MRs (2x) than hardware responder_resources */
+ for (i = 0; i < sp->responder_resources * 2; i++) {
+ mr = kzalloc_obj(*mr);
+ if (!mr) {
+ ret = -ENOMEM;
+ goto kzalloc_mr_failed;
+ }
+
+ kref_init(&mr->kref);
+ mutex_init(&mr->mutex);
+
+ mr->mr = ib_alloc_mr(sc->ib.pd,
+ sc->mr_io.type,
+ sp->max_frmr_depth);
+ if (IS_ERR(mr->mr)) {
+ ret = PTR_ERR(mr->mr);
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "ib_alloc_mr failed ret=%d (%1pe) type=0x%x max_frmr_depth=%u\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret),
+ sc->mr_io.type, sp->max_frmr_depth);
+ goto ib_alloc_mr_failed;
+ }
+ mr->sgt.sgl = kzalloc_objs(struct scatterlist, sp->max_frmr_depth);
+ if (!mr->sgt.sgl) {
+ ret = -ENOMEM;
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "failed to allocate sgl, max_frmr_depth=%u\n",
+ sp->max_frmr_depth);
+ goto kcalloc_sgl_failed;
+ }
+ mr->state = SMBDIRECT_MR_READY;
+ mr->socket = sc;
+
+ list_add_tail(&mr->list, &sc->mr_io.all.list);
+ atomic_inc(&sc->mr_io.ready.count);
+ }
+
+ return 0;
+
+kcalloc_sgl_failed:
+ ib_dereg_mr(mr->mr);
+ib_alloc_mr_failed:
+ mutex_destroy(&mr->mutex);
+ kfree(mr);
+kzalloc_mr_failed:
+ smbdirect_connection_destroy_mr_list(sc);
+ return ret;
+}
+
+static void smbdirect_mr_io_disable_locked(struct smbdirect_mr_io *mr)
+{
+ struct smbdirect_socket *sc = mr->socket;
+
+ lockdep_assert_held(&mr->mutex);
+
+ if (mr->state == SMBDIRECT_MR_DISABLED)
+ return;
+
+ if (mr->mr)
+ ib_dereg_mr(mr->mr);
+ if (mr->sgt.nents)
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ kfree(mr->sgt.sgl);
+
+ mr->mr = NULL;
+ mr->sgt.sgl = NULL;
+ mr->sgt.nents = 0;
+
+ mr->state = SMBDIRECT_MR_DISABLED;
+}
+
+static void smbdirect_mr_io_free_locked(struct kref *kref)
+{
+ struct smbdirect_mr_io *mr =
+ container_of(kref, struct smbdirect_mr_io, kref);
+
+ lockdep_assert_held(&mr->mutex);
+
+ /*
+ * smbdirect_mr_io_disable_locked() should already be called!
+ */
+ if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
+ smbdirect_mr_io_disable_locked(mr);
+
+ mutex_unlock(&mr->mutex);
+ mutex_destroy(&mr->mutex);
+ kfree(mr);
+}
+
+void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc)
+{
+ struct smbdirect_mr_io *mr, *tmp;
+ LIST_HEAD(all_list);
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+ list_splice_tail_init(&sc->mr_io.all.list, &all_list);
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+
+ list_for_each_entry_safe(mr, tmp, &all_list, list) {
+ mutex_lock(&mr->mutex);
+
+ smbdirect_mr_io_disable_locked(mr);
+ list_del(&mr->list);
+ mr->socket = NULL;
+
+ /*
+ * No kref_put_mutex() as it's already locked.
+ *
+		 * If kref_put() returned 1, then
+		 * smbdirect_mr_io_free_locked() ran, the
+		 * mutex was unlocked and the mr is gone.
+		 *
+		 * If kref_put() returned 0 we know that
+		 * smbdirect_mr_io_free_locked() didn't
+		 * run. Not by us nor by anyone else, as we
+		 * still hold the mutex, so we need to unlock.
+		 *
+		 * If the mr is still registered it will
+		 * be dangling (detached from the connection,
+		 * waiting for smbdirect_connection_deregister_mr_io()
+		 * to be called in order to free the memory).
+ */
+ if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+ mutex_unlock(&mr->mutex);
+ }
+}
+
+/*
+ * Get an MR from mr_io.all.list. This function waits until there is at least
+ * one MR available in the list. There may be several CPUs issuing I/O trying
+ * to get an MR at the same time; mr_io.all.lock is used to protect against
+ * this situation.
+ */
+static struct smbdirect_mr_io *
+smbdirect_connection_get_mr_io(struct smbdirect_socket *sc)
+{
+ struct smbdirect_mr_io *mr;
+ unsigned long flags;
+ int ret;
+
+again:
+ ret = wait_event_interruptible(sc->mr_io.ready.wait_queue,
+ atomic_read(&sc->mr_io.ready.count) ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (ret) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "wait_event_interruptible ret=%d (%1pe)\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return NULL;
+ }
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "sc->status=%s sc->first_error=%1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+ return NULL;
+ }
+
+ spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+ list_for_each_entry(mr, &sc->mr_io.all.list, list) {
+ if (mr->state == SMBDIRECT_MR_READY) {
+ mr->state = SMBDIRECT_MR_REGISTERED;
+ kref_get(&mr->kref);
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+ atomic_dec(&sc->mr_io.ready.count);
+ atomic_inc(&sc->mr_io.used.count);
+ return mr;
+ }
+ }
+
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+ /*
+	 * It is possible that we could fail to get an MR because other processes
+	 * may try to acquire one at the same time. If this is the case, retry.
+ */
+ goto again;
+}
+
+static void smbdirect_connection_mr_io_register_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_mr_io *mr =
+ container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+ struct smbdirect_socket *sc = mr->socket;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "wc->status=%s opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->opcode);
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ }
+}
+
+static void smbdirect_connection_mr_io_local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbdirect_mr_io *mr =
+ container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+ struct smbdirect_socket *sc = mr->socket;
+
+ mr->state = SMBDIRECT_MR_INVALIDATED;
+ if (wc->status != IB_WC_SUCCESS) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "invalidate failed status=%s\n",
+ ib_wc_status_msg(wc->status));
+ smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+ }
+ complete(&mr->invalidate_done);
+}
+
+/*
+ * Transcribe the pages from an iterator into an MR scatterlist.
+ */
+static int smbdirect_iter_to_sgt(struct iov_iter *iter,
+ struct sg_table *sgt,
+ unsigned int max_sg)
+{
+ int ret;
+
+ memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
+
+ ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
+ WARN_ON(ret < 0);
+ if (sgt->nents > 0)
+ sg_mark_end(&sgt->sgl[sgt->nents - 1]);
+
+ return ret;
+}
+
+/*
+ * Register memory for RDMA read/write.
+ * iter: the buffer to register memory with
+ * writing: true if this is an RDMA write (SMB read), false for RDMA read
+ * need_invalidate: true if this MR needs to be locally invalidated after I/O
+ * return value: the MR registered, NULL if failed.
+ */
+struct smbdirect_mr_io *
+smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
+ struct iov_iter *iter,
+ bool writing,
+ bool need_invalidate)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *mr;
+ int ret, num_pages;
+ struct ib_reg_wr *reg_wr;
+
+ num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
+ if (num_pages > sp->max_frmr_depth) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "num_pages=%d max_frmr_depth=%d\n",
+ num_pages, sp->max_frmr_depth);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ mr = smbdirect_connection_get_mr_io(sc);
+ if (!mr) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "smbdirect_connection_get_mr_io returning NULL\n");
+ return NULL;
+ }
+
+ mutex_lock(&mr->mutex);
+
+ mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ mr->need_invalidate = need_invalidate;
+ mr->sgt.nents = 0;
+ mr->sgt.orig_nents = 0;
+
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
+ "num_pages=%u count=%zu depth=%u\n",
+ num_pages, iov_iter_count(iter), sp->max_frmr_depth);
+ smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth);
+
+ ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ if (!ret) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n",
+ num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ goto dma_map_error;
+ }
+
+ ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
+ if (ret != mr->sgt.nents) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "ib_map_mr_sg failed ret = %d nents = %u\n",
+ ret, mr->sgt.nents);
+ goto map_mr_error;
+ }
+
+ ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
+ reg_wr = &mr->wr;
+ reg_wr->wr.opcode = IB_WR_REG_MR;
+ mr->cqe.done = smbdirect_connection_mr_io_register_done;
+ reg_wr->wr.wr_cqe = &mr->cqe;
+ reg_wr->wr.num_sge = 0;
+ reg_wr->wr.send_flags = IB_SEND_SIGNALED;
+ reg_wr->mr = mr->mr;
+ reg_wr->key = mr->mr->rkey;
+ reg_wr->access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
+
+ /*
+	 * There is no need to wait for completion of ib_post_send
+	 * on IB_WR_REG_MR. Hardware enforces a barrier and the order of execution
+	 * on the next ib_post_send when we actually send I/O to the remote peer.
+ */
+ ret = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
+ if (!ret) {
+ /*
+ * smbdirect_connection_get_mr_io() gave us a reference
+ * via kref_get(&mr->kref), we keep that and let
+ * the caller use smbdirect_connection_deregister_mr_io()
+ * to remove it again.
+ */
+ mutex_unlock(&mr->mutex);
+ return mr;
+ }
+
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "ib_post_send failed ret=%d (%1pe) reg_wr->key=0x%x\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret), reg_wr->key);
+
+map_mr_error:
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+
+dma_map_error:
+ mr->sgt.nents = 0;
+ mr->state = SMBDIRECT_MR_ERROR;
+ atomic_dec(&sc->mr_io.used.count);
+
+ smbdirect_socket_schedule_cleanup(sc, ret);
+
+ /*
+ * smbdirect_connection_get_mr_io() gave us a reference
+ * via kref_get(&mr->kref), we need to remove it again
+ * on error.
+ *
+ * No kref_put_mutex() as it's already locked.
+ *
+	 * If kref_put() returned 1, then
+	 * smbdirect_mr_io_free_locked() ran, the
+	 * mutex was unlocked and the mr is gone.
+ *
+ * If kref_put() returned 0 we know that
+ * smbdirect_mr_io_free_locked() didn't
+ * run. Not by us nor by anyone else, as we
+ * still hold the mutex, so we need to unlock.
+ */
+ if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+ mutex_unlock(&mr->mutex);
+ return NULL;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io);
+
+void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+ struct smbdirect_buffer_descriptor_v1 *v1)
+{
+ mutex_lock(&mr->mutex);
+ if (mr->state == SMBDIRECT_MR_REGISTERED) {
+ v1->offset = cpu_to_le64(mr->mr->iova);
+ v1->token = cpu_to_le32(mr->mr->rkey);
+ v1->length = cpu_to_le32(mr->mr->length);
+ } else {
+ v1->offset = cpu_to_le64(U64_MAX);
+ v1->token = cpu_to_le32(U32_MAX);
+ v1->length = cpu_to_le32(U32_MAX);
+ }
+ mutex_unlock(&mr->mutex);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor);
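+
+/*
+ * Typical usage (sketch, error handling omitted): register the buffer,
+ * publish the resulting descriptor in the SMB2 request, then deregister
+ * once the peer has finished the RDMA transfer:
+ *
+ *	struct smbdirect_buffer_descriptor_v1 v1;
+ *
+ *	mr = smbdirect_connection_register_mr_io(sc, iter, writing, true);
+ *	smbdirect_mr_io_fill_buffer_descriptor(mr, &v1);
+ *	... transfer ...
+ *	smbdirect_connection_deregister_mr_io(mr);
+ */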
+
+/*
+ * Deregister an MR after I/O is done.
+ * This function may wait if remote invalidation is not used
+ * and we have to locally invalidate the buffer to prevent the data from
+ * being modified by the remote peer after the upper layer consumes it.
+ */
+void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr)
+{
+ struct smbdirect_socket *sc = mr->socket;
+ int ret = 0;
+
+lock_again:
+ mutex_lock(&mr->mutex);
+ if (mr->state == SMBDIRECT_MR_DISABLED)
+ goto put_kref;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ smbdirect_mr_io_disable_locked(mr);
+ goto put_kref;
+ }
+
+ if (mr->need_invalidate) {
+ struct ib_send_wr *wr = &mr->inv_wr;
+
+ /* Need to finish local invalidation before returning */
+ wr->opcode = IB_WR_LOCAL_INV;
+ mr->cqe.done = smbdirect_connection_mr_io_local_inv_done;
+ wr->wr_cqe = &mr->cqe;
+ wr->num_sge = 0;
+ wr->ex.invalidate_rkey = mr->mr->rkey;
+ wr->send_flags = IB_SEND_SIGNALED;
+
+ init_completion(&mr->invalidate_done);
+ ret = ib_post_send(sc->ib.qp, wr, NULL);
+ if (ret) {
+ smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+ "ib_post_send failed ret=%d (%1pe)\n",
+ ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ smbdirect_mr_io_disable_locked(mr);
+ smbdirect_socket_schedule_cleanup(sc, ret);
+ goto done;
+ }
+
+ /*
+ * We still hold the reference to mr
+ * so we can unlock while waiting.
+ */
+ mutex_unlock(&mr->mutex);
+ wait_for_completion(&mr->invalidate_done);
+ mr->need_invalidate = false;
+ goto lock_again;
+	} else {
+		/*
+		 * For remote invalidation, just set the state to
+		 * SMBDIRECT_MR_INVALIDATED; the MR is recovered for the
+		 * next use directly below.
+		 */
+		mr->state = SMBDIRECT_MR_INVALIDATED;
+	}
+
+ if (mr->sgt.nents) {
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ mr->sgt.nents = 0;
+ }
+
+ WARN_ONCE(mr->state != SMBDIRECT_MR_INVALIDATED,
+ "mr->state[%u] != SMBDIRECT_MR_INVALIDATED[%u]\n",
+ mr->state, SMBDIRECT_MR_INVALIDATED);
+ mr->state = SMBDIRECT_MR_READY;
+ if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+ wake_up(&sc->mr_io.ready.wait_queue);
+
+done:
+ atomic_dec(&sc->mr_io.used.count);
+
+put_kref:
+ /*
+ * No kref_put_mutex() as it's already locked.
+ *
+	 * If kref_put() returned 1, then
+	 * smbdirect_mr_io_free_locked() ran, the
+	 * mutex was unlocked and the mr is gone.
+ *
+ * If kref_put() returned 0 we know that
+ * smbdirect_mr_io_free_locked() didn't
+ * run. Not by us nor by anyone else, as we
+ * still hold the mutex, so we need to unlock
+ * and keep the mr in SMBDIRECT_MR_READY or
+ * SMBDIRECT_MR_ERROR state.
+ */
+ if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+ mutex_unlock(&mr->mutex);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io);
diff --git a/fs/smb/common/smbdirect/smbdirect_pdu.h b/fs/smb/common/smbdirect/smbdirect_pdu.h
index ae9fdb05ce23..7693ba337873 100644
--- a/fs/smb/common/smbdirect/smbdirect_pdu.h
+++ b/fs/smb/common/smbdirect/smbdirect_pdu.h
@@ -8,6 +8,10 @@
#define SMBDIRECT_V1 0x0100
+/* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
+#define SMBDIRECT_MIN_RECEIVE_SIZE 128
+#define SMBDIRECT_MIN_FRAGMENTED_SIZE 131072
+
/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
struct smbdirect_negotiate_req {
__le16 min_version;
diff --git a/fs/smb/common/smbdirect/smbdirect_public.h b/fs/smb/common/smbdirect/smbdirect_public.h
new file mode 100644
index 000000000000..50088155e7c3
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_public.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2025, Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
+
+struct smbdirect_buffer_descriptor_v1;
+struct smbdirect_socket_parameters;
+
+struct smbdirect_socket;
+struct smbdirect_send_batch;
+struct smbdirect_mr_io;
+
+#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd")
+
+#include <rdma/rw.h>
+
+u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev);
+
+bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs);
+
+int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc);
+
+int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc);
+
+int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
+ const struct smbdirect_socket_parameters *sp);
+
+const struct smbdirect_socket_parameters *
+smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc);
+
+int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
+ enum ib_poll_context poll_ctx,
+ gfp_t gfp_mask);
+
+#define SMBDIRECT_LOG_ERR 0x0
+#define SMBDIRECT_LOG_INFO 0x1
+
+#define SMBDIRECT_LOG_OUTGOING 0x1
+#define SMBDIRECT_LOG_INCOMING 0x2
+#define SMBDIRECT_LOG_READ 0x4
+#define SMBDIRECT_LOG_WRITE 0x8
+#define SMBDIRECT_LOG_RDMA_SEND 0x10
+#define SMBDIRECT_LOG_RDMA_RECV 0x20
+#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
+#define SMBDIRECT_LOG_RDMA_EVENT 0x80
+#define SMBDIRECT_LOG_RDMA_MR 0x100
+#define SMBDIRECT_LOG_RDMA_RW 0x200
+#define SMBDIRECT_LOG_NEGOTIATE 0x400
+void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
+ void *private_ptr,
+ bool (*needed)(struct smbdirect_socket *sc,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls),
+ void (*vaprintf)(struct smbdirect_socket *sc,
+ const char *func,
+ unsigned int line,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls,
+ struct va_format *vaf));
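+
+/*
+ * Minimal sketch of the logging hooks (hypothetical callbacks): the owner
+ * filters by level/class in ->needed() and emits via ->vaprintf():
+ *
+ *	static bool example_needed(struct smbdirect_socket *sc, void *p,
+ *				   unsigned int lvl, unsigned int cls)
+ *	{
+ *		return lvl == SMBDIRECT_LOG_ERR;
+ *	}
+ *
+ *	smbdirect_socket_set_logging(sc, NULL, example_needed, example_vaprintf);
+ */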
+
+bool smbdirect_connection_is_connected(struct smbdirect_socket *sc);
+
+int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc);
+
+int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr);
+
+void smbdirect_socket_shutdown(struct smbdirect_socket *sc);
+
+void smbdirect_socket_release(struct smbdirect_socket *sc);
+
+int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ bool is_last);
+
+/*
+ * This is only temporary and only needed
+ * as long as the client still needs
+ * to use smbdirect_connection_send_single_iter().
+ */
+struct smbdirect_send_batch_storage {
+ union {
+ struct list_head __msg_list;
+ __aligned_u64 __space[5];
+ };
+};
+
+struct smbdirect_send_batch *
+smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
+ bool need_invalidate_rkey,
+ unsigned int remote_key);
+
+int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *batch,
+ struct iov_iter *iter,
+ unsigned int flags,
+ u32 remaining_data_length);
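+
+/*
+ * Assumed usage of the temporary single-iter path (sketch; the storage
+ * lives on the caller's stack and the final flush marks the last
+ * fragment):
+ *
+ *	struct smbdirect_send_batch_storage storage;
+ *	struct smbdirect_send_batch *batch;
+ *
+ *	batch = smbdirect_init_send_batch_storage(&storage, false, 0);
+ *	smbdirect_connection_send_single_iter(sc, batch, iter, flags, 0);
+ *	smbdirect_connection_send_batch_flush(sc, batch, true);
+ */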
+
+int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc);
+
+int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
+ struct iov_iter *iter,
+ unsigned int flags,
+ bool need_invalidate,
+ unsigned int remote_key);
+
+int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
+ struct msghdr *msg,
+ unsigned int flags);
+
+int smbdirect_connect(struct smbdirect_socket *sc,
+ const struct sockaddr *dst);
+
+int smbdirect_connect_sync(struct smbdirect_socket *sc,
+ const struct sockaddr *dst);
+
+int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog);
+
+struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
+ long timeo,
+ struct proto_accept_arg *arg);
+
+int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
+ void *buf, size_t buf_len,
+ struct smbdirect_buffer_descriptor_v1 *desc,
+ size_t desc_len,
+ bool is_read);
+
+struct smbdirect_mr_io *
+smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
+ struct iov_iter *iter,
+ bool writing,
+ bool need_invalidate);
+
+void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+ struct smbdirect_buffer_descriptor_v1 *v1);
+
+void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr);
+
+void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
+ unsigned int rdma_readwrite_threshold,
+ struct seq_file *m);
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_rw.c b/fs/smb/common/smbdirect/smbdirect_rw.c
new file mode 100644
index 000000000000..3b2eb8c48efc
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_rw.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+static int smbdirect_connection_wait_for_rw_credits(struct smbdirect_socket *sc,
+ int credits)
+{
+ return smbdirect_socket_wait_for_credits(sc,
+ SMBDIRECT_SOCKET_CONNECTED,
+ -ENOTCONN,
+ &sc->rw_io.credits.wait_queue,
+ &sc->rw_io.credits.count,
+ credits);
+}
+
+static int smbdirect_connection_calc_rw_credits(struct smbdirect_socket *sc,
+ const void *buf,
+ size_t len)
+{
+ return DIV_ROUND_UP(smbdirect_get_buf_page_count(buf, len),
+ sc->rw_io.credits.num_pages);
+}
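+
+/*
+ * Worked example (numbers are illustrative): a buffer spanning 256 pages
+ * with sc->rw_io.credits.num_pages == 16 needs
+ * DIV_ROUND_UP(256, 16) == 16 rw credits.
+ */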
+
+static int smbdirect_connection_rdma_get_sg_list(void *buf,
+ size_t size,
+ struct scatterlist *sg_list,
+ size_t nentries)
+{
+ bool high = is_vmalloc_addr(buf);
+ struct page *page;
+ size_t offset, len;
+ int i = 0;
+
+ if (size == 0 || nentries < smbdirect_get_buf_page_count(buf, size))
+ return -EINVAL;
+
+ offset = offset_in_page(buf);
+ buf -= offset;
+ while (size > 0) {
+ len = min_t(size_t, PAGE_SIZE - offset, size);
+ if (high)
+ page = vmalloc_to_page(buf);
+ else
+ page = kmap_to_page(buf);
+
+ if (!sg_list)
+ return -EINVAL;
+ sg_set_page(sg_list, page, len, offset);
+ sg_list = sg_next(sg_list);
+
+ buf += PAGE_SIZE;
+ size -= len;
+ offset = 0;
+ i++;
+ }
+
+ return i;
+}
+
+static void smbdirect_connection_rw_io_free(struct smbdirect_rw_io *msg,
+ enum dma_data_direction dir)
+{
+ struct smbdirect_socket *sc = msg->socket;
+
+ rdma_rw_ctx_destroy(&msg->rdma_ctx,
+ sc->ib.qp,
+ sc->ib.qp->port,
+ msg->sgt.sgl,
+ msg->sgt.nents,
+ dir);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+}
+
+static void smbdirect_connection_rdma_rw_done(struct ib_cq *cq, struct ib_wc *wc,
+ enum dma_data_direction dir)
+{
+ struct smbdirect_rw_io *msg =
+ container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
+ struct smbdirect_socket *sc = msg->socket;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ msg->error = -EIO;
+ pr_err("read/write error. opcode = %d, status = %s(%d)\n",
+ wc->opcode, ib_wc_status_msg(wc->status), wc->status);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smbdirect_socket_schedule_cleanup(sc, msg->error);
+ }
+
+ complete(msg->completion);
+}
+
+static void smbdirect_connection_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ smbdirect_connection_rdma_rw_done(cq, wc, DMA_FROM_DEVICE);
+}
+
+static void smbdirect_connection_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ smbdirect_connection_rdma_rw_done(cq, wc, DMA_TO_DEVICE);
+}
+
+int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
+ void *buf, size_t buf_len,
+ struct smbdirect_buffer_descriptor_v1 *desc,
+ size_t desc_len,
+ bool is_read)
+{
+ const struct smbdirect_socket_parameters *sp = &sc->parameters;
+ enum dma_data_direction direction = is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ struct smbdirect_rw_io *msg, *next_msg;
+ size_t i;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(completion);
+ struct ib_send_wr *first_wr;
+ LIST_HEAD(msg_list);
+ u8 *desc_buf;
+ int credits_needed;
+	size_t desc_buf_len, desc_num = 0;
+	const size_t full_len = buf_len;
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return -ENOTCONN;
+
+ if (buf_len > sp->max_read_write_size)
+ return -EINVAL;
+
+ /* calculate needed credits */
+ credits_needed = 0;
+ desc_buf = buf;
+ for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ if (!buf_len)
+ break;
+
+ desc_buf_len = le32_to_cpu(desc[i].length);
+ if (!desc_buf_len)
+ return -EINVAL;
+
+ if (desc_buf_len > buf_len) {
+ desc_buf_len = buf_len;
+ desc[i].length = cpu_to_le32(desc_buf_len);
+ buf_len = 0;
+ }
+
+ credits_needed += smbdirect_connection_calc_rw_credits(sc,
+ desc_buf,
+ desc_buf_len);
+ desc_buf += desc_buf_len;
+ buf_len -= desc_buf_len;
+ desc_num++;
+ }
+
+	/* buf_len was consumed by the loop above, so log the full length */
+	smbdirect_log_rdma_rw(sc, SMBDIRECT_LOG_INFO,
+			      "RDMA %s, len %zu, needed credits %d\n",
+			      str_read_write(is_read), full_len, credits_needed);
+
+ ret = smbdirect_connection_wait_for_rw_credits(sc, credits_needed);
+ if (ret < 0)
+ return ret;
+
+ /* build rdma_rw_ctx for each descriptor */
+ desc_buf = buf;
+ for (i = 0; i < desc_num; i++) {
+ size_t page_count;
+
+ msg = kzalloc_flex(*msg, sg_list, SG_CHUNK_SIZE,
+ sc->rw_io.mem.gfp_mask);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ desc_buf_len = le32_to_cpu(desc[i].length);
+ page_count = smbdirect_get_buf_page_count(desc_buf, desc_buf_len);
+
+ msg->socket = sc;
+ msg->cqe.done = is_read ?
+ smbdirect_connection_rdma_read_done :
+ smbdirect_connection_rdma_write_done;
+ msg->completion = &completion;
+
+ msg->sgt.sgl = &msg->sg_list[0];
+ ret = sg_alloc_table_chained(&msg->sgt,
+ page_count,
+ msg->sg_list,
+ SG_CHUNK_SIZE);
+ if (ret) {
+ ret = -ENOMEM;
+ goto free_msg;
+ }
+
+ ret = smbdirect_connection_rdma_get_sg_list(desc_buf,
+ desc_buf_len,
+ msg->sgt.sgl,
+ msg->sgt.orig_nents);
+ if (ret < 0)
+ goto free_table;
+
+ ret = rdma_rw_ctx_init(&msg->rdma_ctx,
+ sc->ib.qp,
+ sc->ib.qp->port,
+ msg->sgt.sgl,
+ page_count,
+ 0,
+ le64_to_cpu(desc[i].offset),
+ le32_to_cpu(desc[i].token),
+ direction);
+ if (ret < 0) {
+ pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+ goto free_table;
+ }
+
+ list_add_tail(&msg->list, &msg_list);
+ desc_buf += desc_buf_len;
+ }
+
+ /* concatenate work requests of rdma_rw_ctxs */
+ first_wr = NULL;
+ list_for_each_entry_reverse(msg, &msg_list, list) {
+ first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx,
+ sc->ib.qp,
+ sc->ib.qp->port,
+ &msg->cqe,
+ first_wr);
+ }
+
+ ret = ib_post_send(sc->ib.qp, first_wr, NULL);
+ if (ret) {
+ pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
+ goto out;
+ }
+
+ msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
+ wait_for_completion(&completion);
+ ret = msg->error;
+out:
+ list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
+ list_del(&msg->list);
+ smbdirect_connection_rw_io_free(msg, direction);
+ }
+ atomic_add(credits_needed, &sc->rw_io.credits.count);
+ wake_up(&sc->rw_io.credits.wait_queue);
+ return ret;
+
+free_table:
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+free_msg:
+ kfree(msg);
+ goto out;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit);
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.c b/fs/smb/common/smbdirect/smbdirect_socket.c
new file mode 100644
index 000000000000..9153e1dbf53d
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_socket.c
@@ -0,0 +1,743 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs)
+{
+ /*
+	 * Test if FRWR (Fast Registration Work Requests) is supported on the
+	 * device. This implementation requires FRWR for RDMA read/write.
+	 *
+	 * Return value: true if it is supported.
+ */
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return false;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ return false;
+ return true;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported);
+
+static void smbdirect_socket_cleanup_work(struct work_struct *work);
+
+static int smbdirect_socket_rdma_event_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct smbdirect_socket *sc = id->context;
+ int ret = -ESTALE;
+
+ /*
+ * This should be replaced before any real work
+ * starts! So it should never be called!
+ */
+
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ret = -ENETDOWN;
+ if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+ ret = event->status;
+ pr_err("%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+ rdma_event_msg(sc->rdma.expected_event),
+ rdma_event_msg(event->event),
+ event->status,
+ SMBDIRECT_DEBUG_ERR_PTR(ret));
+ WARN_ONCE(1, "%s should not be called!\n", __func__);
+ sc->rdma.cm_id = NULL;
+ return -ESTALE;
+}
+
+int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc)
+{
+ struct rdma_cm_id *id;
+ int ret;
+
+ smbdirect_socket_init(sc);
+
+ id = rdma_create_id(net,
+ smbdirect_socket_rdma_event_handler,
+ sc,
+ RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(id)) {
+ pr_err("%s: rdma_create_id() failed %1pe\n", __func__, id);
+ return PTR_ERR(id);
+ }
+
+ ret = rdma_set_afonly(id, 1);
+ if (ret) {
+ rdma_destroy_id(id);
+ pr_err("%s: rdma_set_afonly() failed %1pe\n",
+ __func__, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ return ret;
+ }
+
+ sc->rdma.cm_id = id;
+
+ INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
+
+ return 0;
+}
+
+int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc)
+{
+ struct smbdirect_socket *sc;
+ int ret;
+
+ ret = -ENOMEM;
+ sc = kzalloc_obj(*sc);
+ if (!sc)
+ goto alloc_failed;
+
+ ret = smbdirect_socket_init_new(net, sc);
+ if (ret)
+ goto init_failed;
+
+ kref_init(&sc->refs.destroy);
+
+ *_sc = sc;
+ return 0;
+
+init_failed:
+ kfree(sc);
+alloc_failed:
+ return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern);
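+
+/*
+ * Sketch of the expected client-side flow (error handling omitted;
+ * "params" and "dst" are assumptions for the example):
+ *
+ *	smbdirect_socket_create_kern(net, &sc);
+ *	smbdirect_socket_set_initial_parameters(sc, params);
+ *	smbdirect_connect_sync(sc, dst);
+ */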
+
+int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc)
+{
+ smbdirect_socket_init(sc);
+
+ sc->rdma.cm_id = id;
+ sc->rdma.cm_id->context = sc;
+ sc->rdma.cm_id->event_handler = smbdirect_socket_rdma_event_handler;
+
+ sc->ib.dev = sc->rdma.cm_id->device;
+
+ INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
+
+ return 0;
+}
+
+int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc)
+{
+ struct smbdirect_socket *sc;
+ int ret;
+
+ ret = -ENOMEM;
+ sc = kzalloc_obj(*sc);
+ if (!sc)
+ goto alloc_failed;
+
+ ret = smbdirect_socket_init_accepting(id, sc);
+ if (ret)
+ goto init_failed;
+
+ kref_init(&sc->refs.destroy);
+
+ *_sc = sc;
+ return 0;
+
+init_failed:
+ kfree(sc);
+alloc_failed:
+ return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting);
+
+int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
+ const struct smbdirect_socket_parameters *sp)
+{
+ /*
+ * This is only allowed before connect or accept
+ */
+ WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+ if (sc->status != SMBDIRECT_SOCKET_CREATED)
+ return -EINVAL;
+
+ if (sp->flags & ~SMBDIRECT_FLAG_PORT_RANGE_MASK)
+ return -EINVAL;
+
+ if (sp->initiator_depth > U8_MAX)
+ return -EINVAL;
+ if (sp->responder_resources > U8_MAX)
+ return -EINVAL;
+
+ if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
+ sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
+ return -EINVAL;
+ else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB)
+ rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_IB_CA);
+ else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
+ rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_RNIC);
+
+ /*
+	 * Make a copy of the caller's parameters;
+	 * from here on we only work on the copy.
+ *
+ * TODO: do we want consistency checking?
+ */
+ sc->parameters = *sp;
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters);
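
A hedged sketch of filling the parameters before connect/accept; the field names mirror the (removed) ksmbd defaults later in this diff, while the values here are purely illustrative:

	/* Illustrative values; real callers derive these from configuration. */
	static int example_set_parameters(struct smbdirect_socket *sc)
	{
		struct smbdirect_socket_parameters sp = {
			.initiator_depth	= 8,	/* must fit in U8_MAX */
			.responder_resources	= 1,	/* must fit in U8_MAX */
			.max_send_size		= 1364,
			.max_recv_size		= 1364,
			.flags			= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB,
		};

		return smbdirect_socket_set_initial_parameters(sc, &sp);
	}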
+
+const struct smbdirect_socket_parameters *
+smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc)
+{
+ return &sc->parameters;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters);
+
+int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
+ enum ib_poll_context poll_ctx,
+ gfp_t gfp_mask)
+{
+ /*
+ * This is only allowed before connect or accept
+ */
+ WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+ if (sc->status != SMBDIRECT_SOCKET_CREATED)
+ return -EINVAL;
+
+ sc->ib.poll_ctx = poll_ctx;
+
+ sc->send_io.mem.gfp_mask = gfp_mask;
+ sc->recv_io.mem.gfp_mask = gfp_mask;
+ sc->rw_io.mem.gfp_mask = gfp_mask;
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings);
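
Since smbdirect_socket_init() defaults to IB_POLL_UNBOUND_WORKQUEUE and GFP_KERNEL (see the header hunk below), this setter only matters for callers with stricter contexts; a hedged sketch:

	/* Sketch: request softirq CQ polling and filesystem-safe allocations. */
	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_NOFS);
	if (ret)
		return ret;	/* socket already left the CREATED state */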
+
+void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
+ void *private_ptr,
+ bool (*needed)(struct smbdirect_socket *sc,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls),
+ void (*vaprintf)(struct smbdirect_socket *sc,
+ const char *func,
+ unsigned int line,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls,
+ struct va_format *vaf))
+{
+ sc->logging.private_ptr = private_ptr;
+ sc->logging.needed = needed;
+ sc->logging.vaprintf = vaprintf;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging);
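
The server side of this series installs smb_direct_logging_needed()/smb_direct_logging_vaprintf() (see transport_rdma.c below); a minimal sketch of such a hook pair, with hypothetical names:

	static bool example_log_needed(struct smbdirect_socket *sc, void *priv,
				       unsigned int lvl, unsigned int cls)
	{
		return lvl <= SMBDIRECT_LOG_ERR;	/* only log errors */
	}

	static void example_log_vaprintf(struct smbdirect_socket *sc,
					 const char *func, unsigned int line,
					 void *priv, unsigned int lvl,
					 unsigned int cls, struct va_format *vaf)
	{
		pr_err("%s:%u: %pV", func, line, vaf);
	}

	/* ... during setup ... */
	smbdirect_socket_set_logging(sc, NULL, example_log_needed,
				     example_log_vaprintf);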
+
+static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc)
+{
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->listen.wait_queue);
+ wake_up_all(&sc->send_io.bcredits.wait_queue);
+ wake_up_all(&sc->send_io.lcredits.wait_queue);
+ wake_up_all(&sc->send_io.credits.wait_queue);
+ wake_up_all(&sc->send_io.pending.zero_wait_queue);
+ wake_up_all(&sc->recv_io.reassembly.wait_queue);
+ wake_up_all(&sc->rw_io.credits.wait_queue);
+ wake_up_all(&sc->mr_io.ready.wait_queue);
+}
+
+void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
+ const char *macro_name,
+ unsigned int lvl,
+ const char *func,
+ unsigned int line,
+ int error,
+ enum smbdirect_socket_status *force_status)
+{
+ struct smbdirect_socket *psc, *tsc;
+ unsigned long flags;
+ bool was_first = false;
+
+ if (!sc->first_error) {
+ ___smbdirect_log_generic(sc, func, line,
+ lvl,
+ SMBDIRECT_LOG_RDMA_EVENT,
+ "%s(%1pe%s%s) called from %s in line=%u status=%s\n",
+ macro_name,
+ SMBDIRECT_DEBUG_ERR_PTR(error),
+ force_status ? ", " : "",
+ force_status ? smbdirect_socket_status_string(*force_status) : "",
+ func, line,
+ smbdirect_socket_status_string(sc->status));
+ if (error)
+ sc->first_error = error;
+ else
+ sc->first_error = -ECONNABORTED;
+ was_first = true;
+ }
+
+ /*
+	 * Make sure work other than disconnect_work
+	 * is not queued again. Here we must not block,
+	 * so we avoid disable[_delayed]_work_sync().
+ */
+ disable_work(&sc->connect.work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->idle.immediate_work);
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ disable_delayed_work(&sc->idle.timer_work);
+
+ /*
+ * In case we were a listener we need to
+ * disconnect all pending and ready sockets
+ *
+ * First we move ready sockets to pending again.
+ */
+ spin_lock_irqsave(&sc->listen.lock, flags);
+ list_splice_init(&sc->listen.ready, &sc->listen.pending);
+ list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
+ smbdirect_socket_schedule_cleanup(psc, sc->first_error);
+ spin_unlock_irqrestore(&sc->listen.lock, flags);
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_LISTENING:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ if (force_status && (was_first || *force_status > sc->status))
+ sc->status = *force_status;
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbdirect_socket_wake_up_all(sc);
+
+ queue_work(sc->workqueues.cleanup, &sc->disconnect_work);
+}
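
Completion and event handlers reach this function through the smbdirect_socket_schedule_cleanup*() wrapper macros; a hedged sketch of the typical error path inside a hypothetical ib_wc completion handler, mirroring the removed smb_direct_disconnect_rdma_connection() callers further down:

	/* Sketch: tear the connection down from a completion handler. */
	if (wc->status != IB_WC_SUCCESS) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
		return;
	}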
+
+static void smbdirect_socket_cleanup_work(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, disconnect_work);
+ struct smbdirect_socket *psc, *tsc;
+ unsigned long flags;
+
+ /*
+	 * This should never be called in an interrupt!
+ */
+ WARN_ON_ONCE(in_interrupt());
+
+ if (!sc->first_error) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+ "%s called with first_error==0\n",
+ smbdirect_socket_status_string(sc->status));
+
+ sc->first_error = -ECONNABORTED;
+ }
+
+ /*
+	 * Make sure this and other work is not queued again.
+	 * Here we must not block, so we avoid
+	 * disable[_delayed]_work_sync().
+ */
+ disable_work(&sc->disconnect_work);
+ disable_work(&sc->connect.work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->idle.immediate_work);
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ disable_delayed_work(&sc->idle.timer_work);
+
+ /*
+ * In case we were a listener we need to
+ * disconnect all pending and ready sockets
+ *
+ * First we move ready sockets to pending again.
+ */
+ spin_lock_irqsave(&sc->listen.lock, flags);
+ list_splice_init(&sc->listen.ready, &sc->listen.pending);
+ list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
+ smbdirect_socket_schedule_cleanup(psc, sc->first_error);
+ spin_unlock_irqrestore(&sc->listen.lock, flags);
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ case SMBDIRECT_SOCKET_ERROR:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+ /*
+ * Make sure we hold the callback lock
+		 * in order to coordinate with the
+ * rdma_event handlers, typically
+ * smbdirect_connection_rdma_event_handler(),
+ * and smbdirect_socket_destroy().
+ *
+ * So that the order of ib_drain_qp()
+ * and rdma_disconnect() is controlled
+ * by the mutex.
+ */
+ rdma_lock_handler(sc->rdma.cm_id);
+ rdma_disconnect(sc->rdma.cm_id);
+ rdma_unlock_handler(sc->rdma.cm_id);
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_LISTENING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ /*
+ * rdma_{accept,connect}() never reached
+ * RDMA_CM_EVENT_ESTABLISHED
+ */
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbdirect_socket_wake_up_all(sc);
+}
+
+static void smbdirect_socket_destroy(struct smbdirect_socket *sc)
+{
+ struct smbdirect_socket *psc, *tsc;
+ size_t psockets;
+ struct smbdirect_recv_io *recv_io;
+ struct smbdirect_recv_io *recv_tmp;
+ LIST_HEAD(all_list);
+ unsigned long flags;
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+ /*
+	 * This should never be called in an interrupt!
+ */
+ WARN_ON_ONCE(in_interrupt());
+
+ if (sc->status == SMBDIRECT_SOCKET_DESTROYED)
+ return;
+
+ WARN_ONCE(sc->status != SMBDIRECT_SOCKET_DISCONNECTED,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+ /*
+ * The listener should clear this before we reach this
+ */
+ WARN_ONCE(sc->accept.listener,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ *
+ * Most likely this was already called via
+ * smbdirect_socket_cleanup_work(), but call it again...
+ */
+ smbdirect_socket_wake_up_all(sc);
+
+ disable_work_sync(&sc->disconnect_work);
+ disable_work_sync(&sc->connect.work);
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+ disable_work_sync(&sc->idle.immediate_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+
+ if (sc->rdma.cm_id)
+ rdma_lock_handler(sc->rdma.cm_id);
+
+ if (sc->ib.qp) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "drain qp\n");
+ ib_drain_qp(sc->ib.qp);
+ }
+
+ /*
+	 * In case we were a listener we need to
+	 * release all pending and ready sockets.
+	 *
+	 * We collect both lists and release each socket.
+ */
+ spin_lock_irqsave(&sc->listen.lock, flags);
+ list_splice_tail_init(&sc->listen.ready, &all_list);
+ list_splice_tail_init(&sc->listen.pending, &all_list);
+ spin_unlock_irqrestore(&sc->listen.lock, flags);
+ psockets = list_count_nodes(&all_list);
+ if (sc->listen.backlog != -1) /* was a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "release %zu pending sockets\n", psockets);
+ list_for_each_entry_safe(psc, tsc, &all_list, accept.list) {
+ list_del_init(&psc->accept.list);
+ psc->accept.listener = NULL;
+ smbdirect_socket_release(psc);
+ }
+ if (sc->listen.backlog != -1) /* was a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "released %zu pending sockets\n", psockets);
+ INIT_LIST_HEAD(&all_list);
+
+	/* The upper layer can no longer reach the reassembly queue */
+ if (sc->listen.backlog == -1) /* was not a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "drain the reassembly queue\n");
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ list_splice_tail_init(&sc->recv_io.reassembly.list, &all_list);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ list_for_each_entry_safe(recv_io, recv_tmp, &all_list, list)
+ smbdirect_connection_put_recv_io(recv_io);
+ sc->recv_io.reassembly.data_length = 0;
+
+ if (sc->listen.backlog == -1) /* was not a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "freeing mr list\n");
+ smbdirect_connection_destroy_mr_list(sc);
+
+ if (sc->listen.backlog == -1) /* was not a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "destroying qp\n");
+ smbdirect_connection_destroy_qp(sc);
+ if (sc->rdma.cm_id) {
+ rdma_unlock_handler(sc->rdma.cm_id);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "destroying cm_id\n");
+ rdma_destroy_id(sc->rdma.cm_id);
+ sc->rdma.cm_id = NULL;
+ }
+
+ if (sc->listen.backlog == -1) /* was not a listener */
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "destroying mem pools\n");
+ smbdirect_connection_destroy_mem_pools(sc);
+
+ sc->status = SMBDIRECT_SOCKET_DESTROYED;
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "rdma session destroyed\n");
+}
+
+void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc)
+{
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+ /*
+	 * This should never be called in an interrupt!
+ */
+ WARN_ON_ONCE(in_interrupt());
+
+ /*
+	 * First we try to disable the work in a
+	 * non-blocking way, without disable_work_sync();
+	 * if it's already running it will be handled by
+	 * disable_work_sync() below.
+ *
+ * Here we just want to make sure queue_work() in
+ * smbdirect_socket_schedule_cleanup_lvl()
+ * is a no-op.
+ */
+ disable_work(&sc->disconnect_work);
+
+ if (!sc->first_error)
+ /*
+ * SMBDIRECT_LOG_INFO is enough here
+ * as this is the typical case where
+		 * we terminate the connection ourselves.
+ */
+ smbdirect_socket_schedule_cleanup_lvl(sc,
+ SMBDIRECT_LOG_INFO,
+ -ESHUTDOWN);
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "cancelling and disabling disconnect_work\n");
+ disable_work_sync(&sc->disconnect_work);
+
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "destroying rdma session\n");
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
+ smbdirect_socket_cleanup_work(&sc->disconnect_work);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "wait for transport to be disconnected\n");
+ wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "waited for transport to be disconnected\n");
+ }
+
+ /*
+	 * Once we have reached SMBDIRECT_SOCKET_DISCONNECTED,
+	 * we call smbdirect_socket_destroy().
+ */
+ smbdirect_socket_destroy(sc);
+ smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+ "status=%s first_error=%1pe",
+ smbdirect_socket_status_string(sc->status),
+ SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+}
+
+int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr)
+{
+ int ret;
+
+ if (sc->status != SMBDIRECT_SOCKET_CREATED)
+ return -EINVAL;
+
+ ret = rdma_bind_addr(sc->rdma.cm_id, addr);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind);
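
A hedged sketch of binding a would-be listener before accept; the sockaddr setup is ordinary kernel code, and the port value matches SMB_DIRECT_PORT_IWARP as defined in the server hunk below:

	/* Sketch: bind to the IPv4 wildcard address on the iWarp port. */
	struct sockaddr_in sin = {
		.sin_family	 = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port	 = htons(5445),
	};

	ret = smbdirect_socket_bind(sc, (struct sockaddr *)&sin);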
+
+void smbdirect_socket_shutdown(struct smbdirect_socket *sc)
+{
+ smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown);
+
+static void smbdirect_socket_release_disconnect(struct kref *kref)
+{
+ struct smbdirect_socket *sc =
+ container_of(kref, struct smbdirect_socket, refs.disconnect);
+
+ /*
+ * For now do a sync disconnect/destroy
+ */
+ smbdirect_socket_destroy_sync(sc);
+}
+
+static void smbdirect_socket_release_destroy(struct kref *kref)
+{
+ struct smbdirect_socket *sc =
+ container_of(kref, struct smbdirect_socket, refs.destroy);
+
+ /*
+ * Do a sync disconnect/destroy...
+	 * hopefully a no-op, as it should already be
+	 * in DESTROYED state, before we free the memory.
+ */
+ smbdirect_socket_destroy_sync(sc);
+ kfree(sc);
+}
+
+void smbdirect_socket_release(struct smbdirect_socket *sc)
+{
+ /*
+ * We expect only 1 disconnect reference
+	 * and if it is already 0, it's a use-after-free!
+ */
+ WARN_ON_ONCE(kref_read(&sc->refs.disconnect) != 1);
+ WARN_ON(!kref_put(&sc->refs.disconnect, smbdirect_socket_release_disconnect));
+
+ /*
+ * This may not trigger smbdirect_socket_release_destroy(),
+ * if struct smbdirect_socket is embedded in another structure
+	 * as indicated by refs.destroy == REFCOUNT_MAX.
+ */
+ kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release);
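
To make the embedded case concrete: a socket initialized in place (rather than via smbdirect_socket_create_kern()) keeps refs.destroy saturated at REFCOUNT_MAX, so the kref_put() above never frees it and the container owns the memory; a hedged sketch with hypothetical names:

	/* Sketch: socket embedded in a containing object. */
	struct example_conn {
		struct smbdirect_socket socket;	/* refs.destroy == REFCOUNT_MAX */
	};

	static void example_conn_free(struct example_conn *conn)
	{
		smbdirect_socket_release(&conn->socket);	/* disconnect/destroy only */
		kfree(conn);	/* the container frees the memory itself */
	}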
+
+int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc,
+ enum smbdirect_socket_status expected_status,
+ int unexpected_errno,
+ wait_queue_head_t *waitq,
+ atomic_t *total_credits,
+ int needed)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(needed < 0))
+ return -EINVAL;
+
+ do {
+ if (atomic_sub_return(needed, total_credits) >= 0)
+ return 0;
+
+ atomic_add(needed, total_credits);
+ ret = wait_event_interruptible(*waitq,
+ atomic_read(total_credits) >= needed ||
+ sc->status != expected_status);
+
+ if (sc->status != expected_status)
+ return unexpected_errno;
+ else if (ret < 0)
+ return ret;
+ } while (true);
+}
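
A hedged example of consuming the helper above; the waitq/counter pair comes from send_io.credits in struct smbdirect_socket, and the errno choices are illustrative:

	/* Sketch: reserve one send credit before posting a send WR. */
	ret = smbdirect_socket_wait_for_credits(sc,
						SMBDIRECT_SOCKET_CONNECTED,
						-ENOTCONN,
						&sc->send_io.credits.wait_queue,
						&sc->send_io.credits.count,
						1);
	if (ret)
		return ret;	/* -ENOTCONN on disconnect, -ERESTARTSYS on signal */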
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 22184e53d445..c09eddd8ad16 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -6,10 +6,18 @@
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
#include <rdma/rw.h>
enum smbdirect_socket_status {
SMBDIRECT_SOCKET_CREATED,
+ SMBDIRECT_SOCKET_LISTENING,
SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED,
SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED,
@@ -35,6 +43,8 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
switch (status) {
case SMBDIRECT_SOCKET_CREATED:
return "CREATED";
+ case SMBDIRECT_SOCKET_LISTENING:
+ return "LISTENING";
case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
return "RESOLVE_ADDR_NEEDED";
case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
@@ -99,19 +109,60 @@ struct smbdirect_socket {
int first_error;
/*
- * This points to the workqueue to
+ * This points to the workqueues to
* be used for this socket.
- * It can be per socket (on the client)
- * or point to a global workqueue (on the server)
*/
- struct workqueue_struct *workqueue;
+ struct {
+ struct workqueue_struct *accept;
+ struct workqueue_struct *connect;
+ struct workqueue_struct *idle;
+ struct workqueue_struct *refill;
+ struct workqueue_struct *immediate;
+ struct workqueue_struct *cleanup;
+ } workqueues;
struct work_struct disconnect_work;
+ /*
+ * The reference counts.
+ */
+ struct {
+ /*
+ * This holds the references by the
+ * frontend, typically the smb layer.
+ *
+ * It is typically 1 and a disconnect
+ * will happen if it reaches 0.
+ */
+ struct kref disconnect;
+
+ /*
+ * This holds the reference by the
+ * backend, the code that manages
+ * the lifetime of the whole
+ * struct smbdirect_socket,
+		 * if this reaches 0 it will
+		 * be freed.
+		 *
+		 * Can be REFCOUNT_MAX if it is part
+		 * of another structure.
+		 *
+		 * This is equal to or higher than
+		 * the disconnect refcount.
+ */
+ struct kref destroy;
+ } refs;
+
/* RDMA related */
struct {
struct rdma_cm_id *cm_id;
/*
+		 * The expected event in our current
+		 * cm_id->event_handler; all other events
+		 * are treated as errors.
+ */
+ enum rdma_cm_event_type expected_event;
+ /*
* This is for iWarp MPA v1
*/
bool legacy_iwarp;
@@ -120,6 +171,7 @@ struct smbdirect_socket {
/* IB verbs related */
struct {
struct ib_pd *pd;
+ enum ib_poll_context poll_ctx;
struct ib_cq *send_cq;
struct ib_cq *recv_cq;
@@ -150,6 +202,35 @@ struct smbdirect_socket {
} idle;
/*
+ * The state for listen sockets
+ */
+ struct {
+ spinlock_t lock;
+ struct list_head pending;
+ struct list_head ready;
+ wait_queue_head_t wait_queue;
+ /*
+ * This starts as -1 and a value != -1
+ * means this socket was in LISTENING state
+		 * before. Note that a valid backlog can
+		 * only be > 0.
+ */
+ int backlog;
+ } listen;
+
+ /*
+ * The state for sockets waiting
+ * for accept, either still waiting
+ * for the negotiation to finish
+ * or already ready with a usable
+ * connection.
+ */
+ struct {
+ struct smbdirect_socket *listener;
+ struct list_head list;
+ } accept;
+
+ /*
* The state for posted send buffers
*/
struct {
@@ -158,8 +239,9 @@ struct smbdirect_socket {
* smbdirect_send_io buffers
*/
struct {
- struct kmem_cache *cache;
- mempool_t *pool;
+ struct kmem_cache *cache;
+ mempool_t *pool;
+ gfp_t gfp_mask;
} mem;
/*
@@ -195,10 +277,6 @@ struct smbdirect_socket {
struct {
atomic_t count;
/*
- * woken when count is decremented
- */
- wait_queue_head_t dec_wait_queue;
- /*
* woken when count reached zero
*/
wait_queue_head_t zero_wait_queue;
@@ -223,8 +301,9 @@ struct smbdirect_socket {
* smbdirect_recv_io buffers
*/
struct {
- struct kmem_cache *cache;
- mempool_t *pool;
+ struct kmem_cache *cache;
+ mempool_t *pool;
+ gfp_t gfp_mask;
} mem;
/*
@@ -310,13 +389,6 @@ struct smbdirect_socket {
struct {
atomic_t count;
} used;
-
- struct work_struct recovery_work;
-
- /* Used by transport to wait until all MRs are returned */
- struct {
- wait_queue_head_t wait_queue;
- } cleanup;
} mr_io;
/*
@@ -324,6 +396,14 @@ struct smbdirect_socket {
*/
struct {
/*
+ * Memory hints for
+ * smbdirect_rw_io structs
+ */
+ struct {
+ gfp_t gfp_mask;
+ } mem;
+
+ /*
* The credit state for the send side
*/
struct {
@@ -352,20 +432,6 @@ struct smbdirect_socket {
} statistics;
struct {
-#define SMBDIRECT_LOG_ERR 0x0
-#define SMBDIRECT_LOG_INFO 0x1
-
-#define SMBDIRECT_LOG_OUTGOING 0x1
-#define SMBDIRECT_LOG_INCOMING 0x2
-#define SMBDIRECT_LOG_READ 0x4
-#define SMBDIRECT_LOG_WRITE 0x8
-#define SMBDIRECT_LOG_RDMA_SEND 0x10
-#define SMBDIRECT_LOG_RDMA_RECV 0x20
-#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
-#define SMBDIRECT_LOG_RDMA_EVENT 0x80
-#define SMBDIRECT_LOG_RDMA_MR 0x100
-#define SMBDIRECT_LOG_RDMA_RW 0x200
-#define SMBDIRECT_LOG_NEGOTIATE 0x400
void *private_ptr;
bool (*needed)(struct smbdirect_socket *sc,
void *private_ptr,
@@ -493,9 +559,23 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
init_waitqueue_head(&sc->status_wait);
+ sc->workqueues.accept = smbdirect_globals.workqueues.accept;
+ sc->workqueues.connect = smbdirect_globals.workqueues.connect;
+ sc->workqueues.idle = smbdirect_globals.workqueues.idle;
+ sc->workqueues.refill = smbdirect_globals.workqueues.refill;
+ sc->workqueues.immediate = smbdirect_globals.workqueues.immediate;
+ sc->workqueues.cleanup = smbdirect_globals.workqueues.cleanup;
+
INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->disconnect_work);
+ kref_init(&sc->refs.disconnect);
+ sc->refs.destroy = (struct kref) KREF_INIT(REFCOUNT_MAX);
+
+ sc->rdma.expected_event = RDMA_CM_EVENT_INTERNAL;
+
+ sc->ib.poll_ctx = IB_POLL_UNBOUND_WORKQUEUE;
+
spin_lock_init(&sc->connect.lock);
INIT_WORK(&sc->connect.work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->connect.work);
@@ -505,6 +585,16 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
disable_delayed_work_sync(&sc->idle.timer_work);
+ spin_lock_init(&sc->listen.lock);
+ INIT_LIST_HEAD(&sc->listen.pending);
+ INIT_LIST_HEAD(&sc->listen.ready);
+ sc->listen.backlog = -1; /* not a listener */
+ init_waitqueue_head(&sc->listen.wait_queue);
+
+ INIT_LIST_HEAD(&sc->accept.list);
+
+ sc->send_io.mem.gfp_mask = GFP_KERNEL;
+
atomic_set(&sc->send_io.bcredits.count, 0);
init_waitqueue_head(&sc->send_io.bcredits.wait_queue);
@@ -515,9 +605,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
init_waitqueue_head(&sc->send_io.credits.wait_queue);
atomic_set(&sc->send_io.pending.count, 0);
- init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);
+ sc->recv_io.mem.gfp_mask = GFP_KERNEL;
+
INIT_LIST_HEAD(&sc->recv_io.free.list);
spin_lock_init(&sc->recv_io.free.lock);
@@ -532,6 +623,7 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
spin_lock_init(&sc->recv_io.reassembly.lock);
init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+ sc->rw_io.mem.gfp_mask = GFP_KERNEL;
atomic_set(&sc->rw_io.credits.count, 0);
init_waitqueue_head(&sc->rw_io.credits.wait_queue);
@@ -540,9 +632,6 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
atomic_set(&sc->mr_io.ready.count, 0);
init_waitqueue_head(&sc->mr_io.ready.wait_queue);
atomic_set(&sc->mr_io.used.count, 0);
- INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work);
- disable_work_sync(&sc->mr_io.recovery_work);
- init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
sc->logging.private_ptr = NULL;
sc->logging.needed = __smbdirect_log_needed;
@@ -602,6 +691,11 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
#define SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status) \
__SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, /* nothing */)
+#ifndef __SMBDIRECT_SOCKET_DISCONNECT
+#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) \
+ smbdirect_socket_schedule_cleanup(__sc, -ECONNABORTED)
+#endif /* ! __SMBDIRECT_SOCKET_DISCONNECT */
+
#define SMBDIRECT_CHECK_STATUS_DISCONNECT(__sc, __expected_status) \
__SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, \
__SMBDIRECT_SOCKET_DISCONNECT(__sc);)
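
With the default hook above, a failed state check both warns and schedules cleanup; a hedged usage sketch following the SMBDIRECT_CHECK_STATUS_WARN() pattern visible in the removed server code below:

	/* Sketch: bail out and tear down on an unexpected state. */
	if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_CONNECTED))
		return -ENOTCONN;	/* cleanup has already been scheduled */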
@@ -720,4 +814,19 @@ struct smbdirect_rw_io {
struct scatterlist sg_list[];
};
+static inline size_t smbdirect_get_buf_page_count(const void *buf, size_t size)
+{
+ return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
+ (uintptr_t)buf / PAGE_SIZE;
+}
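
A worked example of the arithmetic above, assuming a 4 KiB PAGE_SIZE: for buf == 0x1ff0 and size == 0x20, DIV_ROUND_UP(0x2010, 4096) == 3 and 0x1ff0 / 4096 == 1, so the helper returns 2, because the 32 bytes straddle a page boundary.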
+
+/*
+ * Maximum number of retries on data transfer operations
+ */
+#define SMBDIRECT_RDMA_CM_RETRY 6
+/*
+ * No need to retry on Receiver Not Ready since SMB_DIRECT manages credits
+ */
+#define SMBDIRECT_RDMA_CM_RNR_RETRY 0
+
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */
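
These constants are presumably consumed when filling struct rdma_conn_param for rdma_connect()/rdma_accept(); a hedged sketch of where they would land:

	/* Sketch: CM retry settings; credits make RNR retries unnecessary. */
	struct rdma_conn_param conn_param = {
		.retry_count	 = SMBDIRECT_RDMA_CM_RETRY,
		.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY,
	};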
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index 96aa8e2a8770..37387410e5bb 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -47,8 +47,9 @@ if SMB_SERVER
config SMB_SERVER_SMBDIRECT
bool "Support for SMB Direct protocol"
- depends on SMB_SERVER=m && INFINIBAND && INFINIBAND_ADDR_TRANS || SMB_SERVER=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
- select SG_POOL
+ depends on SMB_SERVER && INFINIBAND && INFINIBAND_ADDR_TRANS
+ depends on SMB_SERVER=m || INFINIBAND=y
+ select SMB_COMMON_SMBDIRECT
default n
help
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 26cfce344861..a26899d12df1 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -376,9 +376,6 @@ int ksmbd_conn_handler_loop(void *p)
mutex_init(&conn->srv_mutex);
__module_get(THIS_MODULE);
- if (t->ops->prepare && t->ops->prepare(t))
- goto out;
-
max_req = server_conf.max_inflight_req;
conn->last_active = jiffies;
set_freezable();
@@ -470,7 +467,6 @@ recheck:
}
}
-out:
ksmbd_conn_set_releasing(conn);
/* Wait till all reference dropped to the Server object*/
ksmbd_debug(CONN, "Wait for all pending requests(%d)\n", atomic_read(&conn->r_count));
@@ -566,6 +562,5 @@ void ksmbd_conn_transport_destroy(void)
ksmbd_tcp_destroy();
ksmbd_rdma_stop_listening();
stop_sessions();
- ksmbd_rdma_destroy();
mutex_unlock(&init_lock);
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 1e2587036bca..ae21a1bd4c70 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -127,7 +127,6 @@ struct ksmbd_conn_ops {
};
struct ksmbd_transport_ops {
- int (*prepare)(struct ksmbd_transport *t);
void (*disconnect)(struct ksmbd_transport *t);
void (*shutdown)(struct ksmbd_transport *t);
int (*read)(struct ksmbd_transport *t, char *buf,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 10ae77dae5a1..ee32e61b6d3c 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -24,7 +24,6 @@
#include "asn1.h"
#include "connection.h"
#include "transport_ipc.h"
-#include "../common/smbdirect/smbdirect.h"
#include "transport_rdma.h"
#include "vfs.h"
#include "vfs_cache.h"
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index dbc8dedb85dc..706a2c897948 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -11,30 +11,19 @@
#include <linux/kthread.h>
#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/highmem.h>
-#include <linux/scatterlist.h>
#include <linux/string_choices.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_cm.h>
-#include <rdma/rw.h>
-
-#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smb_direct_disconnect_rdma_connection(__sc)
#include "glob.h"
#include "connection.h"
#include "smb_common.h"
#include "../common/smb2status.h"
-#include "../common/smbdirect/smbdirect.h"
-#include "../common/smbdirect/smbdirect_pdu.h"
-#include "../common/smbdirect/smbdirect_socket.h"
#include "transport_rdma.h"
+#include "../common/smbdirect/smbdirect_public.h"
+
#define SMB_DIRECT_PORT_IWARP 5445
#define SMB_DIRECT_PORT_INFINIBAND 445
-#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1)
-
/* SMB_DIRECT negotiation timeout (for the server) in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5
@@ -50,11 +39,6 @@
*/
#define SMB_DIRECT_CM_INITIATOR_DEPTH 8
-/* Maximum number of retries on data transfer operations */
-#define SMB_DIRECT_CM_RETRY 6
-/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */
-#define SMB_DIRECT_CM_RNR_RETRY 0
-
/*
* User configurable initial values per SMB_DIRECT transport connection
* as defined in [MS-SMBD] 3.1.1.1
@@ -93,357 +77,118 @@ static int smb_direct_max_receive_size = 1364;
static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
-static LIST_HEAD(smb_direct_device_list);
-static DEFINE_RWLOCK(smb_direct_device_lock);
-
-struct smb_direct_device {
- struct ib_device *ib_dev;
- struct list_head list;
-};
-
static struct smb_direct_listener {
int port;
- struct rdma_cm_id *cm_id;
-} smb_direct_ib_listener, smb_direct_iw_listener;
-static struct workqueue_struct *smb_direct_wq;
+ struct task_struct *thread;
+
+ struct smbdirect_socket *socket;
+} smb_direct_ib_listener, smb_direct_iw_listener;
struct smb_direct_transport {
struct ksmbd_transport transport;
- struct smbdirect_socket socket;
+ struct smbdirect_socket *socket;
};
-#define KSMBD_TRANS(t) (&(t)->transport)
-#define SMBD_TRANS(t) (container_of(t, \
- struct smb_direct_transport, transport))
-
-static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;
-
-void init_smbd_max_io_size(unsigned int sz)
-{
- sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
- smb_direct_max_read_write_size = sz;
-}
-
-unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt)
-{
- struct smb_direct_transport *t;
- struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
-
- if (kt->ops != &ksmbd_smb_direct_transport_ops)
- return 0;
-
- t = SMBD_TRANS(kt);
- sc = &t->socket;
- sp = &sc->parameters;
-
- return sp->max_read_write_size;
-}
-
-static inline int get_buf_page_count(void *buf, int size)
-{
- return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
- (uintptr_t)buf / PAGE_SIZE;
-}
-
-static void smb_direct_destroy_pools(struct smbdirect_socket *sc);
-static void smb_direct_post_recv_credits(struct work_struct *work);
-static int smb_direct_post_send_data(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx,
- struct kvec *iov, int niov,
- int remaining_data_length);
-
-static inline void
-*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg)
-{
- return (void *)recvmsg->packet;
-}
-
-static struct
-smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *recvmsg = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&sc->recv_io.free.lock, flags);
- if (!list_empty(&sc->recv_io.free.list)) {
- recvmsg = list_first_entry(&sc->recv_io.free.list,
- struct smbdirect_recv_io,
- list);
- list_del(&recvmsg->list);
- }
- spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
- return recvmsg;
-}
-
-static void put_recvmsg(struct smbdirect_socket *sc,
- struct smbdirect_recv_io *recvmsg)
-{
- unsigned long flags;
-
- if (likely(recvmsg->sge.length != 0)) {
- ib_dma_unmap_single(sc->ib.dev,
- recvmsg->sge.addr,
- recvmsg->sge.length,
- DMA_FROM_DEVICE);
- recvmsg->sge.length = 0;
- }
-
- spin_lock_irqsave(&sc->recv_io.free.lock, flags);
- list_add(&recvmsg->list, &sc->recv_io.free.list);
- spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
-
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
-}
-
-static void enqueue_reassembly(struct smbdirect_socket *sc,
- struct smbdirect_recv_io *recvmsg,
- int data_length)
+static bool smb_direct_logging_needed(struct smbdirect_socket *sc,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls)
{
- unsigned long flags;
-
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list);
- sc->recv_io.reassembly.queue_length++;
- /*
- * Make sure reassembly_data_length is updated after list and
- * reassembly_queue_length are updated. On the dequeue side
- * reassembly_data_length is checked without a lock to determine
- * if reassembly_queue_length and list is up to date
- */
- virt_wmb();
- sc->recv_io.reassembly.data_length += data_length;
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
-}
+ if (lvl <= SMBDIRECT_LOG_ERR)
+ return true;
-static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc)
-{
- if (!list_empty(&sc->recv_io.reassembly.list))
- return list_first_entry(&sc->recv_io.reassembly.list,
- struct smbdirect_recv_io, list);
- else
- return NULL;
-}
+ if (lvl > SMBDIRECT_LOG_INFO)
+ return false;
-static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
-{
+ switch (cls) {
/*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- */
- wake_up_all(&sc->status_wait);
- wake_up_all(&sc->send_io.bcredits.wait_queue);
- wake_up_all(&sc->send_io.lcredits.wait_queue);
- wake_up_all(&sc->send_io.credits.wait_queue);
- wake_up_all(&sc->send_io.pending.zero_wait_queue);
- wake_up_all(&sc->recv_io.reassembly.wait_queue);
- wake_up_all(&sc->rw_io.credits.wait_queue);
-}
-
-static void smb_direct_disconnect_rdma_work(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, disconnect_work);
-
- if (sc->first_error == 0)
- sc->first_error = -ECONNABORTED;
-
+ * These were more or less also logged before
+ * the move to common code.
+ *
+ * SMBDIRECT_LOG_RDMA_MR was not used, but
+ * that's client only code and we should
+	 * that's client-only code and we should
+ */
+ case SMBDIRECT_LOG_RDMA_EVENT:
+ case SMBDIRECT_LOG_RDMA_SEND:
+ case SMBDIRECT_LOG_RDMA_RECV:
+ case SMBDIRECT_LOG_WRITE:
+ case SMBDIRECT_LOG_READ:
+ case SMBDIRECT_LOG_NEGOTIATE:
+ case SMBDIRECT_LOG_OUTGOING:
+ case SMBDIRECT_LOG_RDMA_RW:
+ case SMBDIRECT_LOG_RDMA_MR:
+ return true;
/*
- * make sure this and other work is not queued again
- * but here we don't block and avoid
- * disable[_delayed]_work_sync()
+ * These were not logged before the move
+ * to common code.
*/
- disable_work(&sc->disconnect_work);
- disable_work(&sc->connect.work);
- disable_work(&sc->recv_io.posted.refill_work);
- disable_delayed_work(&sc->idle.timer_work);
- disable_work(&sc->idle.immediate_work);
-
- switch (sc->status) {
- case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
- case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
- case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
- case SMBDIRECT_SOCKET_CONNECTED:
- case SMBDIRECT_SOCKET_ERROR:
- sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
- rdma_disconnect(sc->rdma.cm_id);
- break;
-
- case SMBDIRECT_SOCKET_CREATED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
- /*
- * rdma_accept() never reached
- * RDMA_CM_EVENT_ESTABLISHED
- */
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- break;
-
- case SMBDIRECT_SOCKET_DISCONNECTING:
- case SMBDIRECT_SOCKET_DISCONNECTED:
- case SMBDIRECT_SOCKET_DESTROYED:
- break;
+ case SMBDIRECT_LOG_KEEP_ALIVE:
+ case SMBDIRECT_LOG_INCOMING:
+ return false;
}
/*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
+ * Log all unknown messages
*/
- smb_direct_disconnect_wake_up_all(sc);
+ return true;
}
-static void
-smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
+static void smb_direct_logging_vaprintf(struct smbdirect_socket *sc,
+ const char *func,
+ unsigned int line,
+ void *private_ptr,
+ unsigned int lvl,
+ unsigned int cls,
+ struct va_format *vaf)
{
- if (sc->first_error == 0)
- sc->first_error = -ECONNABORTED;
-
- /*
- * make sure other work (than disconnect_work) is
- * not queued again but here we don't block and avoid
- * disable[_delayed]_work_sync()
- */
- disable_work(&sc->connect.work);
- disable_work(&sc->recv_io.posted.refill_work);
- disable_work(&sc->idle.immediate_work);
- disable_delayed_work(&sc->idle.timer_work);
-
- switch (sc->status) {
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
- case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
- case SMBDIRECT_SOCKET_ERROR:
- case SMBDIRECT_SOCKET_DISCONNECTING:
- case SMBDIRECT_SOCKET_DISCONNECTED:
- case SMBDIRECT_SOCKET_DESTROYED:
- /*
- * Keep the current error status
- */
- break;
-
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
- case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
- case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
- case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
- break;
-
- case SMBDIRECT_SOCKET_CREATED:
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- break;
-
- case SMBDIRECT_SOCKET_CONNECTED:
- sc->status = SMBDIRECT_SOCKET_ERROR;
- break;
- }
-
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- */
- smb_direct_disconnect_wake_up_all(sc);
-
- queue_work(sc->workqueue, &sc->disconnect_work);
+ if (lvl <= SMBDIRECT_LOG_ERR)
+ pr_err("%pV", vaf);
+ else
+ ksmbd_debug(RDMA, "%pV", vaf);
}
-static void smb_direct_send_immediate_work(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, idle.immediate_work);
+#define KSMBD_TRANS(t) (&(t)->transport)
+#define SMBD_TRANS(t) (container_of(t, \
+ struct smb_direct_transport, transport))
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return;
+static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;
- smb_direct_post_send_data(sc, NULL, NULL, 0, 0);
+void init_smbd_max_io_size(unsigned int sz)
+{
+ sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
+ smb_direct_max_read_write_size = sz;
}
-static void smb_direct_idle_connection_timer(struct work_struct *work)
+unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt)
{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, idle.timer_work.work);
- struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smb_direct_transport *t;
+ const struct smbdirect_socket_parameters *sp;
- if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
+ if (kt->ops != &ksmbd_smb_direct_transport_ops)
+ return 0;
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return;
+ t = SMBD_TRANS(kt);
+ sp = smbdirect_socket_get_current_parameters(t->socket);
- /*
- * Now use the keepalive timeout (instead of keepalive interval)
- * in order to wait for a response
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_timeout_msec));
- queue_work(sc->workqueue, &sc->idle.immediate_work);
+ return sp->max_read_write_size;
}
-static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
+static struct smb_direct_transport *alloc_transport(struct smbdirect_socket *sc)
{
struct smb_direct_transport *t;
- struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
struct ksmbd_conn *conn;
t = kzalloc_obj(*t, KSMBD_DEFAULT_GFP);
if (!t)
return NULL;
- sc = &t->socket;
- smbdirect_socket_init(sc);
- sp = &sc->parameters;
-
- sc->workqueue = smb_direct_wq;
-
- INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work);
-
- sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000;
- sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
- sp->responder_resources = 1;
- sp->recv_credit_max = smb_direct_receive_credit_max;
- sp->send_credit_target = smb_direct_send_credit_target;
- sp->max_send_size = smb_direct_max_send_size;
- sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
- sp->max_recv_size = smb_direct_max_receive_size;
- sp->max_read_write_size = smb_direct_max_read_write_size;
- sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000;
- sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000;
-
- sc->rdma.cm_id = cm_id;
- cm_id->context = sc;
-
- sc->ib.dev = sc->rdma.cm_id->device;
-
- INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);
+ t->socket = sc;
conn = ksmbd_conn_alloc();
if (!conn)
- goto err;
+ goto conn_alloc_failed;
down_write(&conn_list_lock);
hash_add(conn_list, &conn->hlist, 0);
@@ -452,1165 +197,45 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops;
+
return t;
-err:
+
+conn_alloc_failed:
kfree(t);
return NULL;
}
static void smb_direct_free_transport(struct ksmbd_transport *kt)
{
- kfree(SMBD_TRANS(kt));
+ struct smb_direct_transport *t = SMBD_TRANS(kt);
+
+ smbdirect_socket_release(t->socket);
+ kfree(t);
}
static void free_transport(struct smb_direct_transport *t)
{
- struct smbdirect_socket *sc = &t->socket;
- struct smbdirect_recv_io *recvmsg;
-
- disable_work_sync(&sc->disconnect_work);
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
- smb_direct_disconnect_rdma_work(&sc->disconnect_work);
- if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED)
- wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
-
- /*
- * Wake up all waiters in all wait queues
- * in order to notice the broken connection.
- *
- * Most likely this was already called via
- * smb_direct_disconnect_rdma_work(), but call it again...
- */
- smb_direct_disconnect_wake_up_all(sc);
-
- disable_work_sync(&sc->connect.work);
- disable_work_sync(&sc->recv_io.posted.refill_work);
- disable_delayed_work_sync(&sc->idle.timer_work);
- disable_work_sync(&sc->idle.immediate_work);
-
- if (sc->rdma.cm_id)
- rdma_lock_handler(sc->rdma.cm_id);
-
- if (sc->ib.qp) {
- ib_drain_qp(sc->ib.qp);
- sc->ib.qp = NULL;
- rdma_destroy_qp(sc->rdma.cm_id);
- }
-
- ksmbd_debug(RDMA, "drain the reassembly queue\n");
- do {
- unsigned long flags;
-
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- recvmsg = get_first_reassembly(sc);
- if (recvmsg) {
- list_del(&recvmsg->list);
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- put_recvmsg(sc, recvmsg);
- } else {
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- }
- } while (recvmsg);
- sc->recv_io.reassembly.data_length = 0;
-
- if (sc->ib.send_cq)
- ib_free_cq(sc->ib.send_cq);
- if (sc->ib.recv_cq)
- ib_free_cq(sc->ib.recv_cq);
- if (sc->ib.pd)
- ib_dealloc_pd(sc->ib.pd);
- if (sc->rdma.cm_id) {
- rdma_unlock_handler(sc->rdma.cm_id);
- rdma_destroy_id(sc->rdma.cm_id);
- }
-
- smb_direct_destroy_pools(sc);
+ smbdirect_socket_shutdown(t->socket);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
}
-static struct smbdirect_send_io
-*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc)
-{
- struct smbdirect_send_io *msg;
-
- msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP);
- if (!msg)
- return ERR_PTR(-ENOMEM);
- msg->socket = sc;
- INIT_LIST_HEAD(&msg->sibling_list);
- msg->num_sge = 0;
- return msg;
-}
-
-static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,
- struct smbdirect_send_io *msg)
-{
- int i;
-
- /*
- * The list needs to be empty!
- * The caller should take care of it.
- */
- WARN_ON_ONCE(!list_empty(&msg->sibling_list));
-
- if (msg->num_sge > 0) {
- ib_dma_unmap_single(sc->ib.dev,
- msg->sge[0].addr, msg->sge[0].length,
- DMA_TO_DEVICE);
- for (i = 1; i < msg->num_sge; i++)
- ib_dma_unmap_page(sc->ib.dev,
- msg->sge[i].addr, msg->sge[i].length,
- DMA_TO_DEVICE);
- }
- mempool_free(msg, sc->send_io.mem.pool);
-}
-
-static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg)
-{
- struct smbdirect_socket *sc = recvmsg->socket;
-
- switch (sc->recv_io.expected) {
- case SMBDIRECT_EXPECT_DATA_TRANSFER: {
- struct smbdirect_data_transfer *req =
- (struct smbdirect_data_transfer *)recvmsg->packet;
- struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet
- + le32_to_cpu(req->data_offset));
- ksmbd_debug(RDMA,
- "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n",
- le16_to_cpu(req->credits_granted),
- le16_to_cpu(req->credits_requested),
- req->data_length, req->remaining_data_length,
- hdr->ProtocolId, hdr->Command);
- return 0;
- }
- case SMBDIRECT_EXPECT_NEGOTIATE_REQ: {
- struct smbdirect_negotiate_req *req =
- (struct smbdirect_negotiate_req *)recvmsg->packet;
- ksmbd_debug(RDMA,
- "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n",
- le16_to_cpu(req->min_version),
- le16_to_cpu(req->max_version),
- le16_to_cpu(req->credits_requested),
- le32_to_cpu(req->preferred_send_size),
- le32_to_cpu(req->max_receive_size),
- le32_to_cpu(req->max_fragmented_size));
- if (le16_to_cpu(req->min_version) > 0x0100 ||
- le16_to_cpu(req->max_version) < 0x0100)
- return -EOPNOTSUPP;
- if (le16_to_cpu(req->credits_requested) <= 0 ||
- le32_to_cpu(req->max_receive_size) <= 128 ||
- le32_to_cpu(req->max_fragmented_size) <=
- 128 * 1024)
- return -ECONNABORTED;
-
- return 0;
- }
- case SMBDIRECT_EXPECT_NEGOTIATE_REP:
- /* client only */
- break;
- }
-
- /* This is an internal error */
- return -EINVAL;
-}
-
-static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_recv_io *recvmsg;
- struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
-
- recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
- sc = recvmsg->socket;
- sp = &sc->parameters;
-
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- put_recvmsg(sc, recvmsg);
- if (wc->status != IB_WC_WR_FLUSH_ERR) {
- pr_err("Recv error. status='%s (%d)' opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
- smb_direct_disconnect_rdma_connection(sc);
- }
- return;
- }
-
- ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
-
- ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr,
- recvmsg->sge.length, DMA_FROM_DEVICE);
-
- /*
- * Reset timer to the keepalive interval in
- * order to trigger our next keepalive message.
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
-
- switch (sc->recv_io.expected) {
- case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
- /* see smb_direct_negotiate_recv_done */
- break;
- case SMBDIRECT_EXPECT_DATA_TRANSFER: {
- struct smbdirect_data_transfer *data_transfer =
- (struct smbdirect_data_transfer *)recvmsg->packet;
- u32 remaining_data_length, data_offset, data_length;
- int current_recv_credits;
- u16 old_recv_credit_target;
-
- if (wc->byte_len <
- offsetof(struct smbdirect_data_transfer, padding)) {
- put_recvmsg(sc, recvmsg);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
- data_length = le32_to_cpu(data_transfer->data_length);
- data_offset = le32_to_cpu(data_transfer->data_offset);
- if (wc->byte_len < data_offset ||
- wc->byte_len < (u64)data_offset + data_length) {
- put_recvmsg(sc, recvmsg);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
- if (remaining_data_length > sp->max_fragmented_recv_size ||
- data_length > sp->max_fragmented_recv_size ||
- (u64)remaining_data_length + (u64)data_length >
- (u64)sp->max_fragmented_recv_size) {
- put_recvmsg(sc, recvmsg);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- if (data_length) {
- if (sc->recv_io.reassembly.full_packet_received)
- recvmsg->first_segment = true;
-
- if (le32_to_cpu(data_transfer->remaining_data_length))
- sc->recv_io.reassembly.full_packet_received = false;
- else
- sc->recv_io.reassembly.full_packet_received = true;
- }
-
- atomic_dec(&sc->recv_io.posted.count);
- current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
-
- old_recv_credit_target = sc->recv_io.credits.target;
- sc->recv_io.credits.target =
- le16_to_cpu(data_transfer->credits_requested);
- sc->recv_io.credits.target =
- min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
- sc->recv_io.credits.target =
- max_t(u16, sc->recv_io.credits.target, 1);
- atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &sc->send_io.credits.count);
-
- if (le16_to_cpu(data_transfer->flags) &
- SMBDIRECT_FLAG_RESPONSE_REQUESTED)
- queue_work(sc->workqueue, &sc->idle.immediate_work);
-
- if (atomic_read(&sc->send_io.credits.count) > 0)
- wake_up(&sc->send_io.credits.wait_queue);
-
- if (data_length) {
- if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
- sc->recv_io.credits.target > old_recv_credit_target)
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
-
- enqueue_reassembly(sc, recvmsg, (int)data_length);
- wake_up(&sc->recv_io.reassembly.wait_queue);
- } else
- put_recvmsg(sc, recvmsg);
-
- return;
- }
- case SMBDIRECT_EXPECT_NEGOTIATE_REP:
- /* client only */
- break;
- }
-
- /*
- * This is an internal error!
- */
- WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
- put_recvmsg(sc, recvmsg);
- smb_direct_disconnect_rdma_connection(sc);
-}
-
-static void smb_direct_negotiate_recv_work(struct work_struct *work);
-
-static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_recv_io *recv_io =
- container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
- struct smbdirect_socket *sc = recv_io->socket;
- unsigned long flags;
-
- /*
- * reset the common recv_done for later reuse.
- */
- recv_io->cqe.done = recv_done;
-
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- put_recvmsg(sc, recv_io);
- if (wc->status != IB_WC_WR_FLUSH_ERR) {
- pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
- smb_direct_disconnect_rdma_connection(sc);
- }
- return;
- }
-
- ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
-
- ib_dma_sync_single_for_cpu(sc->ib.dev,
- recv_io->sge.addr,
- recv_io->sge.length,
- DMA_FROM_DEVICE);
-
- /*
- * This is an internal error!
- */
- if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) {
- put_recvmsg(sc, recv_io);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- /*
- * Don't reset timer to the keepalive interval in
- * this will be done in smb_direct_negotiate_recv_work.
- */
-
- /*
- * Only remember the recv_io if it has enough bytes,
- * this gives smb_direct_negotiate_recv_work enough
- * information in order to disconnect if it was not
- * valid.
- */
- sc->recv_io.reassembly.full_packet_received = true;
- if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req))
- enqueue_reassembly(sc, recv_io, 0);
- else
- put_recvmsg(sc, recv_io);
-
- /*
- * Some drivers (at least mlx5_ib and irdma in roce mode)
- * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
- * we need to adjust our expectation in that case.
- *
- * So we defer further processing of the negotiation
- * to smb_direct_negotiate_recv_work().
- *
- * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
- * we queue the work directly otherwise
- * smb_direct_cm_handler() will do it, when
- * RDMA_CM_EVENT_ESTABLISHED arrived.
- */
- spin_lock_irqsave(&sc->connect.lock, flags);
- if (!sc->first_error) {
- INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work);
- if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)
- queue_work(sc->workqueue, &sc->connect.work);
- }
- spin_unlock_irqrestore(&sc->connect.lock, flags);
-}
-
-static void smb_direct_negotiate_recv_work(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, connect.work);
- const struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_recv_io *recv_io;
-
- if (sc->first_error)
- return;
-
- ksmbd_debug(RDMA, "Negotiate Recv Work running\n");
-
- /*
- * Reset timer to the keepalive interval in
- * order to trigger our next keepalive message.
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
-
- /*
- * If smb_direct_negotiate_recv_done() detected an
- * invalid request we want to disconnect.
- */
- recv_io = get_first_reassembly(sc);
- if (!recv_io) {
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) {
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
- wake_up(&sc->status_wait);
-}
-
-static int smb_direct_post_recv(struct smbdirect_socket *sc,
- struct smbdirect_recv_io *recvmsg)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct ib_recv_wr wr;
- int ret;
-
- recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev,
- recvmsg->packet,
- sp->max_recv_size,
- DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr);
- if (ret)
- return ret;
- recvmsg->sge.length = sp->max_recv_size;
- recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey;
-
- wr.wr_cqe = &recvmsg->cqe;
- wr.next = NULL;
- wr.sg_list = &recvmsg->sge;
- wr.num_sge = 1;
-
- ret = ib_post_recv(sc->ib.qp, &wr, NULL);
- if (ret) {
- pr_err("Can't post recv: %d\n", ret);
- ib_dma_unmap_single(sc->ib.dev,
- recvmsg->sge.addr, recvmsg->sge.length,
- DMA_FROM_DEVICE);
- recvmsg->sge.length = 0;
- smb_direct_disconnect_rdma_connection(sc);
- return ret;
- }
- return ret;
-}
-
static int smb_direct_read(struct ksmbd_transport *t, char *buf,
unsigned int size, int unused)
{
- struct smbdirect_recv_io *recvmsg;
- struct smbdirect_data_transfer *data_transfer;
- int to_copy, to_read, data_read, offset;
- u32 data_length, remaining_data_length, data_offset;
- int rc;
struct smb_direct_transport *st = SMBD_TRANS(t);
- struct smbdirect_socket *sc = &st->socket;
-
-again:
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- pr_err("disconnected\n");
- return -ENOTCONN;
- }
-
- /*
- * No need to hold the reassembly queue lock all the time as we are
- * the only one reading from the front of the queue. The transport
- * may add more entries to the back of the queue at the same time
- */
- if (sc->recv_io.reassembly.data_length >= size) {
- int queue_length;
- int queue_removed = 0;
- unsigned long flags;
-
- /*
- * Need to make sure reassembly_data_length is read before
- * reading reassembly_queue_length and calling
- * get_first_reassembly. This call is lock free
- * as we never read at the end of the queue which are being
- * updated in SOFTIRQ as more data is received
- */
- virt_rmb();
- queue_length = sc->recv_io.reassembly.queue_length;
- data_read = 0;
- to_read = size;
- offset = sc->recv_io.reassembly.first_entry_offset;
- while (data_read < size) {
- recvmsg = get_first_reassembly(sc);
- data_transfer = smbdirect_recv_io_payload(recvmsg);
- data_length = le32_to_cpu(data_transfer->data_length);
- remaining_data_length =
- le32_to_cpu(data_transfer->remaining_data_length);
- data_offset = le32_to_cpu(data_transfer->data_offset);
-
- /*
- * The upper layer expects RFC1002 length at the
- * beginning of the payload. Return it to indicate
- * the total length of the packet. This minimize the
- * change to upper layer packet processing logic. This
- * will be eventually remove when an intermediate
- * transport layer is added
- */
- if (recvmsg->first_segment && size == 4) {
- unsigned int rfc1002_len =
- data_length + remaining_data_length;
- *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
- data_read = 4;
- recvmsg->first_segment = false;
- ksmbd_debug(RDMA,
- "returning rfc1002 length %d\n",
- rfc1002_len);
- goto read_rfc1002_done;
- }
-
- to_copy = min_t(int, data_length - offset, to_read);
- memcpy(buf + data_read, (char *)data_transfer + data_offset + offset,
- to_copy);
-
- /* move on to the next buffer? */
- if (to_copy == data_length - offset) {
- queue_length--;
- /*
- * No need to lock if we are not at the
- * end of the queue
- */
- if (queue_length) {
- list_del(&recvmsg->list);
- } else {
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- list_del(&recvmsg->list);
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- }
- queue_removed++;
- put_recvmsg(sc, recvmsg);
- offset = 0;
- } else {
- offset += to_copy;
- }
-
- to_read -= to_copy;
- data_read += to_copy;
- }
-
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- sc->recv_io.reassembly.data_length -= data_read;
- sc->recv_io.reassembly.queue_length -= queue_removed;
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
-
- sc->recv_io.reassembly.first_entry_offset = offset;
- ksmbd_debug(RDMA,
- "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
- data_read, sc->recv_io.reassembly.data_length,
- sc->recv_io.reassembly.first_entry_offset);
-read_rfc1002_done:
- return data_read;
- }
-
- ksmbd_debug(RDMA, "wait_event on more data\n");
- rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue,
- sc->recv_io.reassembly.data_length >= size ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (rc)
- return -EINTR;
-
- goto again;
-}
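
/*
 * A minimal sketch (illustrative names, not the driver's symbols) of
 * the barrier pairing the reassembly-queue comments above rely on:
 * the producer (recv completion, SOFTIRQ) publishes a queue entry
 * before it bumps data_length; the consumer reads data_length first
 * and only then walks the already-published entries.
 */
#include <asm/barrier.h>

struct demo_reassembly {
	int data_length;	/* bytes the consumer may read */
	int queue_length;	/* entries linked into the list */
};

static void demo_producer_publish(struct demo_reassembly *q, int bytes)
{
	q->queue_length++;		/* link the entry first ... */
	virt_wmb();			/* ... make it visible ... */
	q->data_length += bytes;	/* ... then advertise the bytes */
}

static int demo_consumer_ready(struct demo_reassembly *q, int need)
{
	if (q->data_length < need)
		return 0;
	virt_rmb();			/* pairs with virt_wmb() above */
	return q->queue_length;		/* safe to walk published entries */
}
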
-
-static void smb_direct_post_recv_credits(struct work_struct *work)
-{
- struct smbdirect_socket *sc =
- container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
- struct smbdirect_recv_io *recvmsg;
- int credits = 0;
- int ret;
-
- if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) {
- while (true) {
- recvmsg = get_free_recvmsg(sc);
- if (!recvmsg)
- break;
-
- recvmsg->first_segment = false;
-
- ret = smb_direct_post_recv(sc, recvmsg);
- if (ret) {
- pr_err("Can't post recv: %d\n", ret);
- put_recvmsg(sc, recvmsg);
- break;
- }
- credits++;
-
- atomic_inc(&sc->recv_io.posted.count);
- }
- }
-
- atomic_add(credits, &sc->recv_io.credits.available);
-
- /*
-	 * If the last send credit is waiting for credits
-	 * it can grant, we need to wake it up.
- */
- if (credits &&
- atomic_read(&sc->send_io.bcredits.count) == 0 &&
- atomic_read(&sc->send_io.credits.count) == 0)
- wake_up(&sc->send_io.credits.wait_queue);
-
- if (credits)
- queue_work(sc->workqueue, &sc->idle.immediate_work);
-}
-
-static void send_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct smbdirect_send_io *sendmsg, *sibling, *next;
- struct smbdirect_socket *sc;
- int lcredits = 0;
-
- sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
- sc = sendmsg->socket;
-
- ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
-
- if (unlikely(!(sendmsg->wr.send_flags & IB_SEND_SIGNALED))) {
- /*
-		 * This happens when the smbdirect_send_io is a sibling
-		 * before the final message; it is signaled on
-		 * error anyway, so we need to skip
-		 * smbdirect_connection_free_send_io here,
-		 * otherwise it will destroy the memory
-		 * of the siblings too, which will cause
-		 * use-after-free problems for the others
- * triggered from ib_drain_qp().
- */
- if (wc->status != IB_WC_SUCCESS)
- goto skip_free;
-
- /*
- * This should not happen!
- * But we better just close the
- * connection...
- */
- pr_err("unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status, wc->opcode);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- /*
- * Free possible siblings and then the main send_io
- */
- list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) {
- list_del_init(&sibling->sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
- lcredits += 1;
- }
- /* Note this frees wc->wr_cqe, but not wc */
- smb_direct_free_sendmsg(sc, sendmsg);
- lcredits += 1;
-
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
-skip_free:
- pr_err("Send error. status='%s (%d)', opcode=%d\n",
- ib_wc_status_msg(wc->status), wc->status,
- wc->opcode);
- smb_direct_disconnect_rdma_connection(sc);
- return;
- }
-
- atomic_add(lcredits, &sc->send_io.lcredits.count);
- wake_up(&sc->send_io.lcredits.wait_queue);
-
- if (atomic_dec_and_test(&sc->send_io.pending.count))
- wake_up(&sc->send_io.pending.zero_wait_queue);
-}
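
/*
 * A minimal sketch (names are illustrative) of the pattern send_done()
 * above has to cope with: a chain of send WRs where only the tail is
 * signaled, so the unsignaled siblings complete only when flushed on
 * error.
 */
#include <rdma/ib_verbs.h>

static int demo_post_chain(struct ib_qp *qp, struct ib_send_wr *wrs,
			   unsigned int n, struct ib_cqe *tail_cqe)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		wrs[i].next = (i + 1 < n) ? &wrs[i + 1] : NULL;
		wrs[i].wr_cqe = NULL;		/* sibling: no completion */
		wrs[i].send_flags = 0;
	}
	wrs[n - 1].wr_cqe = tail_cqe;		/* tail carries the cqe */
	wrs[n - 1].send_flags = IB_SEND_SIGNALED;

	return ib_post_send(qp, &wrs[0], NULL);
}
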
-
-static int manage_credits_prior_sending(struct smbdirect_socket *sc)
-{
- int missing;
- int available;
- int new_credits;
-
- if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
- return 0;
-
- missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
- available = atomic_xchg(&sc->recv_io.credits.available, 0);
- new_credits = (u16)min3(U16_MAX, missing, available);
- if (new_credits <= 0) {
- /*
-		 * If credits are available, but were not granted,
-		 * we need to re-add them.
- */
- if (available)
- atomic_add(available, &sc->recv_io.credits.available);
- return 0;
- }
-
- if (new_credits < available) {
- /*
-		 * Re-add the remaining available credits.
- */
- available -= new_credits;
- atomic_add(available, &sc->recv_io.credits.available);
- }
-
- /*
- * Remember we granted the credits
- */
- atomic_add(new_credits, &sc->recv_io.credits.count);
- return new_credits;
-}
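
/*
 * A worked example of the grant above, with hypothetical numbers:
 * assume credits.target = 10, credits.count = 7 and
 * credits.available = 5, then
 *   missing     = 10 - 7 = 3
 *   new_credits = min3(U16_MAX, 3, 5) = 3
 * The unused 5 - 3 = 2 go back to credits.available and
 * credits.count becomes 7 + 3 = 10, i.e. the target.
 */
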
-
-static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
-
- if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
- /*
- * Now use the keepalive timeout (instead of keepalive interval)
- * in order to wait for a response
- */
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->keepalive_timeout_msec));
- return 1;
- }
- return 0;
-}
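
/*
 * A sketch of the keepalive cycle implied by the code above (the
 * state names are the real ones; the handler that sets PENDING from
 * the interval timer is outside this hunk):
 *
 *   NONE --interval timer fires--> PENDING
 *   PENDING --next send piggybacks RESPONSE_REQUESTED--> SENT
 *   SENT --peer response received--> NONE
 *   SENT --keepalive_timeout_msec expires--> disconnect
 *
 * So a keepalive is only ever piggybacked on a send, and the timer
 * is re-armed with the shorter timeout while a response is pending.
 */
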
-
-static int smb_direct_post_send(struct smbdirect_socket *sc,
- struct ib_send_wr *wr)
-{
+ struct smbdirect_socket *sc = st->socket;
+ struct msghdr msg = { .msg_flags = 0, };
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = size,
+ };
int ret;
- atomic_inc(&sc->send_io.pending.count);
- ret = ib_post_send(sc->ib.qp, wr, NULL);
- if (ret) {
- pr_err("failed to post send: %d\n", ret);
- smb_direct_disconnect_rdma_connection(sc);
- }
- return ret;
-}
-
-static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx,
- bool need_invalidate_rkey,
- unsigned int remote_key)
-{
- INIT_LIST_HEAD(&send_ctx->msg_list);
- send_ctx->wr_cnt = 0;
- send_ctx->need_invalidate_rkey = need_invalidate_rkey;
- send_ctx->remote_key = remote_key;
- send_ctx->credit = 0;
-}
-
-static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx,
- bool is_last)
-{
- struct smbdirect_send_io *first, *last;
- int ret = 0;
-
- if (list_empty(&send_ctx->msg_list))
- goto release_credit;
-
- first = list_first_entry(&send_ctx->msg_list,
- struct smbdirect_send_io,
- sibling_list);
- last = list_last_entry(&send_ctx->msg_list,
- struct smbdirect_send_io,
- sibling_list);
-
- if (send_ctx->need_invalidate_rkey) {
- first->wr.opcode = IB_WR_SEND_WITH_INV;
- first->wr.ex.invalidate_rkey = send_ctx->remote_key;
- send_ctx->need_invalidate_rkey = false;
- send_ctx->remote_key = 0;
- }
-
- last->wr.send_flags = IB_SEND_SIGNALED;
- last->wr.wr_cqe = &last->cqe;
-
- /*
- * Remove last from send_ctx->msg_list
- * and splice the rest of send_ctx->msg_list
- * to last->sibling_list.
- *
- * send_ctx->msg_list is a valid empty list
- * at the end.
- */
- list_del_init(&last->sibling_list);
- list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list);
- send_ctx->wr_cnt = 0;
-
- ret = smb_direct_post_send(sc, &first->wr);
- if (ret) {
- struct smbdirect_send_io *sibling, *next;
-
- list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
- list_del_init(&sibling->sibling_list);
- smb_direct_free_sendmsg(sc, sibling);
- }
- smb_direct_free_sendmsg(sc, last);
- }
-
-release_credit:
- if (is_last && !ret && send_ctx->credit) {
- atomic_add(send_ctx->credit, &sc->send_io.bcredits.count);
- send_ctx->credit = 0;
- wake_up(&sc->send_io.bcredits.wait_queue);
- }
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size);
- return ret;
-}
-
-static int wait_for_credits(struct smbdirect_socket *sc,
- wait_queue_head_t *waitq, atomic_t *total_credits,
- int needed)
-{
- int ret;
-
- do {
- if (atomic_sub_return(needed, total_credits) >= 0)
- return 0;
-
- atomic_add(needed, total_credits);
- ret = wait_event_interruptible(*waitq,
- atomic_read(total_credits) >= needed ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return -ENOTCONN;
- else if (ret < 0)
- return ret;
- } while (true);
-}
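
/*
 * A hypothetical caller of the generic helper above, as a sketch:
 * on success the credits were already subtracted
 * (atomic_sub_return()), so a later error must give them back, as
 * the unwind labels in smb_direct_post_send_data() do.
 * demo_do_io() is a stand-in for the actual I/O step.
 */
static int demo_do_io(struct smbdirect_socket *sc);

static int demo_take_two_rw_credits(struct smbdirect_socket *sc)
{
	int ret;

	ret = wait_for_credits(sc,
			       &sc->rw_io.credits.wait_queue,
			       &sc->rw_io.credits.count, 2);
	if (ret)
		return ret;

	ret = demo_do_io(sc);
	if (ret) {
		/* the helper subtracted the credits, give them back */
		atomic_add(2, &sc->rw_io.credits.count);
		wake_up(&sc->rw_io.credits.wait_queue);
	}
	return ret;
}
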
-
-static int wait_for_send_bcredit(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx)
-{
- int ret;
-
- if (send_ctx->credit)
- return 0;
-
- ret = wait_for_credits(sc,
- &sc->send_io.bcredits.wait_queue,
- &sc->send_io.bcredits.count,
- 1);
- if (ret)
- return ret;
-
- send_ctx->credit = 1;
- return 0;
-}
-
-static int wait_for_send_lcredit(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx)
-{
- if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) {
- int ret;
-
- ret = smb_direct_flush_send_list(sc, send_ctx, false);
- if (ret)
- return ret;
- }
-
- return wait_for_credits(sc,
- &sc->send_io.lcredits.wait_queue,
- &sc->send_io.lcredits.count,
- 1);
-}
-
-static int wait_for_send_credits(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx)
-{
- int ret;
-
- if (send_ctx &&
- (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
- ret = smb_direct_flush_send_list(sc, send_ctx, false);
- if (ret)
- return ret;
- }
-
- return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1);
-}
-
-static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits)
-{
- return wait_for_credits(sc,
- &sc->rw_io.credits.wait_queue,
- &sc->rw_io.credits.count,
- credits);
-}
-
-static int calc_rw_credits(struct smbdirect_socket *sc,
- char *buf, unsigned int len)
-{
- return DIV_ROUND_UP(get_buf_page_count(buf, len),
- sc->rw_io.credits.num_pages);
-}
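
/*
 * A worked example (hypothetical numbers) for the credit sizing:
 * with max_read_write_size = 1 MiB and 4 KiB pages, maxpages = 256;
 * if rdma_rw_mr_factor() returned 8, then
 * num_pages = DIV_ROUND_UP(256, 8) = 32 pages per credit. A 300 KiB
 * buffer starting mid-page covers 76 pages, so calc_rw_credits()
 * charges DIV_ROUND_UP(76, 32) = 3 credits for it.
 */
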
-
-static int smb_direct_create_header(struct smbdirect_socket *sc,
- int size, int remaining_data_length,
- int new_credits,
- struct smbdirect_send_io **sendmsg_out)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_send_io *sendmsg;
- struct smbdirect_data_transfer *packet;
- int header_length;
- int ret;
-
- sendmsg = smb_direct_alloc_sendmsg(sc);
- if (IS_ERR(sendmsg))
- return PTR_ERR(sendmsg);
-
- /* Fill in the packet header */
- packet = (struct smbdirect_data_transfer *)sendmsg->packet;
- packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- packet->credits_granted = cpu_to_le16(new_credits);
-
- packet->flags = 0;
- if (manage_keep_alive_before_sending(sc))
- packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
-
- packet->reserved = 0;
- if (!size)
- packet->data_offset = 0;
- else
- packet->data_offset = cpu_to_le32(24);
- packet->data_length = cpu_to_le32(size);
- packet->remaining_data_length = cpu_to_le32(remaining_data_length);
- packet->padding = 0;
-
- ksmbd_debug(RDMA,
- "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
- le16_to_cpu(packet->credits_requested),
- le16_to_cpu(packet->credits_granted),
- le32_to_cpu(packet->data_offset),
- le32_to_cpu(packet->data_length),
- le32_to_cpu(packet->remaining_data_length));
-
- /* Map the packet to DMA */
- header_length = sizeof(struct smbdirect_data_transfer);
- /* If this is a packet without payload, don't send padding */
- if (!size)
- header_length =
- offsetof(struct smbdirect_data_transfer, padding);
-
- sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
- (void *)packet,
- header_length,
- DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
- if (ret) {
- smb_direct_free_sendmsg(sc, sendmsg);
- return ret;
- }
-
- sendmsg->num_sge = 1;
- sendmsg->sge[0].length = header_length;
- sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
- *sendmsg_out = sendmsg;
- return 0;
-}
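
/*
 * For reference, the wire layout written above, per the Data Transfer
 * Message in MS-SMBD 2.2.3 (matching the fields of
 * struct smbdirect_data_transfer):
 *
 *   offset  size  field
 *        0     2  credits_requested
 *        2     2  credits_granted
 *        4     2  flags
 *        6     2  reserved
 *        8     4  remaining_data_length
 *       12     4  data_offset (24 when a payload follows)
 *       16     4  data_length
 *       20     4  padding
 *       24     -  payload
 *
 * A header without payload ends at offsetof(..., padding) == 20,
 * which is why data_offset is only set to 24 when size != 0.
 */
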
-
-static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries)
-{
- bool high = is_vmalloc_addr(buf);
- struct page *page;
- int offset, len;
- int i = 0;
-
- if (size <= 0 || nentries < get_buf_page_count(buf, size))
- return -EINVAL;
-
- offset = offset_in_page(buf);
- buf -= offset;
- while (size > 0) {
- len = min_t(int, PAGE_SIZE - offset, size);
- if (high)
- page = vmalloc_to_page(buf);
- else
- page = kmap_to_page(buf);
-
- if (!sg_list)
- return -EINVAL;
- sg_set_page(sg_list, page, len, offset);
- sg_list = sg_next(sg_list);
-
- buf += PAGE_SIZE;
- size -= len;
- offset = 0;
- i++;
- }
- return i;
-}
-
-static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
- struct scatterlist *sg_list, int nentries,
- enum dma_data_direction dir, int *npages)
-{
- *npages = get_sg_list(buf, size, sg_list, nentries);
- if (*npages < 0)
- return -EINVAL;
- return ib_dma_map_sg(device, sg_list, *npages, dir);
-}
-
-static int post_sendmsg(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx,
- struct smbdirect_send_io *msg)
-{
- int i;
-
- for (i = 0; i < msg->num_sge; i++)
- ib_dma_sync_single_for_device(sc->ib.dev,
- msg->sge[i].addr, msg->sge[i].length,
- DMA_TO_DEVICE);
-
- msg->cqe.done = send_done;
- msg->wr.opcode = IB_WR_SEND;
- msg->wr.sg_list = &msg->sge[0];
- msg->wr.num_sge = msg->num_sge;
- msg->wr.next = NULL;
-
- if (send_ctx) {
- msg->wr.wr_cqe = NULL;
- msg->wr.send_flags = 0;
- if (!list_empty(&send_ctx->msg_list)) {
- struct smbdirect_send_io *last;
-
- last = list_last_entry(&send_ctx->msg_list,
- struct smbdirect_send_io,
- sibling_list);
- last->wr.next = &msg->wr;
- }
- list_add_tail(&msg->sibling_list, &send_ctx->msg_list);
- send_ctx->wr_cnt++;
- return 0;
- }
-
- msg->wr.wr_cqe = &msg->cqe;
- msg->wr.send_flags = IB_SEND_SIGNALED;
- return smb_direct_post_send(sc, &msg->wr);
-}
-
-static int smb_direct_post_send_data(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *send_ctx,
- struct kvec *iov, int niov,
- int remaining_data_length)
-{
- int i, j, ret;
- struct smbdirect_send_io *msg;
- int data_length;
- struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
- struct smbdirect_send_batch _send_ctx;
- int new_credits;
-
- if (!send_ctx) {
- smb_direct_send_ctx_init(&_send_ctx, false, 0);
- send_ctx = &_send_ctx;
- }
-
- ret = wait_for_send_bcredit(sc, send_ctx);
- if (ret)
- goto bcredit_failed;
-
- ret = wait_for_send_lcredit(sc, send_ctx);
- if (ret)
- goto lcredit_failed;
-
- ret = wait_for_send_credits(sc, send_ctx);
- if (ret)
- goto credit_failed;
-
- new_credits = manage_credits_prior_sending(sc);
- if (new_credits == 0 &&
- atomic_read(&sc->send_io.credits.count) == 0 &&
- atomic_read(&sc->recv_io.credits.count) == 0) {
- queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
- ret = wait_event_interruptible(sc->send_io.credits.wait_queue,
- atomic_read(&sc->send_io.credits.count) >= 1 ||
- atomic_read(&sc->recv_io.credits.available) >= 1 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- ret = -ENOTCONN;
- if (ret < 0)
- goto credit_failed;
-
- new_credits = manage_credits_prior_sending(sc);
- }
-
- data_length = 0;
- for (i = 0; i < niov; i++)
- data_length += iov[i].iov_len;
-
- ret = smb_direct_create_header(sc, data_length, remaining_data_length,
- new_credits, &msg);
- if (ret)
- goto header_failed;
-
- for (i = 0; i < niov; i++) {
- struct ib_sge *sge;
- int sg_cnt;
- int npages;
-
- sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1);
- sg_cnt = get_mapped_sg_list(sc->ib.dev,
- iov[i].iov_base, iov[i].iov_len,
- sg, SMBDIRECT_SEND_IO_MAX_SGE - 1,
- DMA_TO_DEVICE, &npages);
- if (sg_cnt <= 0) {
- pr_err("failed to map buffer\n");
- ret = -ENOMEM;
- goto err;
- } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) {
-			pr_err("buffer does not fit into sges\n");
- ret = -E2BIG;
- ib_dma_unmap_sg(sc->ib.dev, sg, npages,
- DMA_TO_DEVICE);
- goto err;
- }
-
- for (j = 0; j < sg_cnt; j++) {
- sge = &msg->sge[msg->num_sge];
- sge->addr = sg_dma_address(&sg[j]);
- sge->length = sg_dma_len(&sg[j]);
- sge->lkey = sc->ib.pd->local_dma_lkey;
- msg->num_sge++;
- }
- }
-
- ret = post_sendmsg(sc, send_ctx, msg);
- if (ret)
- goto err;
-
- /*
- * From here msg is moved to send_ctx
- * and we should not free it explicitly.
- */
-
- if (send_ctx == &_send_ctx) {
- ret = smb_direct_flush_send_list(sc, send_ctx, true);
- if (ret)
- goto flush_failed;
- }
-
- return 0;
-err:
- smb_direct_free_sendmsg(sc, msg);
-flush_failed:
-header_failed:
- atomic_inc(&sc->send_io.credits.count);
-credit_failed:
- atomic_inc(&sc->send_io.lcredits.count);
-lcredit_failed:
- atomic_add(send_ctx->credit, &sc->send_io.bcredits.count);
- send_ctx->credit = 0;
-bcredit_failed:
+ ret = smbdirect_connection_recvmsg(sc, &msg, 0);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
return ret;
}
@@ -1619,319 +244,13 @@ static int smb_direct_writev(struct ksmbd_transport *t,
bool need_invalidate, unsigned int remote_key)
{
struct smb_direct_transport *st = SMBD_TRANS(t);
- struct smbdirect_socket *sc = &st->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- size_t remaining_data_length;
- size_t iov_idx;
- size_t iov_ofs;
- size_t max_iov_size = sp->max_send_size -
- sizeof(struct smbdirect_data_transfer);
- int ret;
- struct smbdirect_send_batch send_ctx;
- int error = 0;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return -ENOTCONN;
-
- //FIXME: skip RFC1002 header..
- if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
- return -EINVAL;
- buflen -= 4;
- iov_idx = 1;
- iov_ofs = 0;
-
- remaining_data_length = buflen;
- ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
-
- smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
- while (remaining_data_length) {
- struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
- size_t possible_bytes = max_iov_size;
- size_t possible_vecs;
- size_t bytes = 0;
- size_t nvecs = 0;
+ struct smbdirect_socket *sc = st->socket;
+ struct iov_iter iter;
- /*
-		 * For the last message, remaining_data_length should
-		 * have been 0 already!
- */
- if (WARN_ON_ONCE(iov_idx >= niovs)) {
- error = -EINVAL;
- goto done;
- }
+ iov_iter_kvec(&iter, ITER_SOURCE, iov, niovs, buflen);
- /*
- * We have 2 factors which limit the arguments we pass
- * to smb_direct_post_send_data():
- *
- * 1. The number of supported sges for the send,
- * while one is reserved for the smbdirect header.
- * And we currently need one SGE per page.
- * 2. The number of negotiated payload bytes per send.
- */
- possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
-
- while (iov_idx < niovs && possible_vecs && possible_bytes) {
- struct kvec *v = &vecs[nvecs];
- int page_count;
-
- v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
- v->iov_len = min_t(size_t,
- iov[iov_idx].iov_len - iov_ofs,
- possible_bytes);
- page_count = get_buf_page_count(v->iov_base, v->iov_len);
- if (page_count > possible_vecs) {
- /*
- * If the number of pages in the buffer
-				 * is too much (because we currently require
- * one SGE per page), we need to limit the
- * length.
- *
- * We know possible_vecs is at least 1,
- * so we always keep the first page.
- *
-				 * We need to calculate the number of extra
- * pages (epages) we can also keep.
- *
- * We calculate the number of bytes in the
- * first page (fplen), this should never be
- * larger than v->iov_len because page_count is
- * at least 2, but adding a limitation feels
- * better.
- *
- * Then we calculate the number of bytes (elen)
- * we can keep for the extra pages.
- */
- size_t epages = possible_vecs - 1;
- size_t fpofs = offset_in_page(v->iov_base);
- size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
- size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
-
- v->iov_len = fplen + elen;
- page_count = get_buf_page_count(v->iov_base, v->iov_len);
- if (WARN_ON_ONCE(page_count > possible_vecs)) {
- /*
- * Something went wrong in the above
- * logic...
- */
- error = -EINVAL;
- goto done;
- }
- }
- possible_vecs -= page_count;
- nvecs += 1;
- possible_bytes -= v->iov_len;
- bytes += v->iov_len;
-
- iov_ofs += v->iov_len;
- if (iov_ofs >= iov[iov_idx].iov_len) {
- iov_idx += 1;
- iov_ofs = 0;
- }
- }
-
- remaining_data_length -= bytes;
-
- ret = smb_direct_post_send_data(sc, &send_ctx,
- vecs, nvecs,
- remaining_data_length);
- if (unlikely(ret)) {
- error = ret;
- goto done;
- }
- }
-
-done:
- ret = smb_direct_flush_send_list(sc, &send_ctx, true);
- if (unlikely(!ret && error))
- ret = error;
-
- /*
- * As an optimization, we don't wait for individual I/O to finish
- * before sending the next one.
-	 * Send them all and wait for the pending send count to get to 0,
-	 * which means all the I/Os have gone out and we are good to return.
- */
-
- wait_event(sc->send_io.pending.zero_wait_queue,
- atomic_read(&sc->send_io.pending.count) == 0 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0)
- ret = -ENOTCONN;
-
- return ret;
-}
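
/*
 * A worked example (hypothetical numbers) for the page-count clamping
 * in the loop above: with PAGE_SIZE = 4096, possible_vecs = 3 and an
 * iovec of 20000 bytes starting at page offset 3000, page_count would
 * be 6 (> possible_vecs), so the length is clamped:
 *   epages = 3 - 1 = 2
 *   fplen  = min(4096 - 3000, 20000) = 1096
 *   elen   = min(20000 - 1096, 2 * 4096) = 8192
 *   iov_len = 1096 + 8192 = 9288, spanning exactly 3 pages
 * The rest of the iovec is consumed by the next loop iteration.
 */
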
-
-static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
- struct smbdirect_rw_io *msg,
- enum dma_data_direction dir)
-{
- struct smbdirect_socket *sc = &t->socket;
-
- rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
- msg->sgt.sgl, msg->sgt.nents, dir);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
- kfree(msg);
-}
-
-static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
- enum dma_data_direction dir)
-{
- struct smbdirect_rw_io *msg =
- container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
- struct smbdirect_socket *sc = msg->socket;
-
- if (wc->status != IB_WC_SUCCESS) {
- msg->error = -EIO;
- pr_err("read/write error. opcode = %d, status = %s(%d)\n",
- wc->opcode, ib_wc_status_msg(wc->status), wc->status);
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- smb_direct_disconnect_rdma_connection(sc);
- }
-
- complete(msg->completion);
-}
-
-static void read_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- read_write_done(cq, wc, DMA_FROM_DEVICE);
-}
-
-static void write_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- read_write_done(cq, wc, DMA_TO_DEVICE);
-}
-
-static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
- void *buf, int buf_len,
- struct smbdirect_buffer_descriptor_v1 *desc,
- unsigned int desc_len,
- bool is_read)
-{
- struct smbdirect_socket *sc = &t->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_rw_io *msg, *next_msg;
- int i, ret;
- DECLARE_COMPLETION_ONSTACK(completion);
- struct ib_send_wr *first_wr;
- LIST_HEAD(msg_list);
- char *desc_buf;
- int credits_needed;
- unsigned int desc_buf_len, desc_num = 0;
-
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return -ENOTCONN;
-
- if (buf_len > sp->max_read_write_size)
- return -EINVAL;
-
- /* calculate needed credits */
- credits_needed = 0;
- desc_buf = buf;
- for (i = 0; i < desc_len / sizeof(*desc); i++) {
- if (!buf_len)
- break;
-
- desc_buf_len = le32_to_cpu(desc[i].length);
- if (!desc_buf_len)
- return -EINVAL;
-
- if (desc_buf_len > buf_len) {
- desc_buf_len = buf_len;
- desc[i].length = cpu_to_le32(desc_buf_len);
- buf_len = 0;
- }
-
- credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len);
- desc_buf += desc_buf_len;
- buf_len -= desc_buf_len;
- desc_num++;
- }
-
- ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
- str_read_write(is_read), buf_len, credits_needed);
-
- ret = wait_for_rw_credits(sc, credits_needed);
- if (ret < 0)
- return ret;
-
- /* build rdma_rw_ctx for each descriptor */
- desc_buf = buf;
- for (i = 0; i < desc_num; i++) {
- msg = kzalloc_flex(*msg, sg_list, SG_CHUNK_SIZE,
- KSMBD_DEFAULT_GFP);
- if (!msg) {
- ret = -ENOMEM;
- goto out;
- }
-
- desc_buf_len = le32_to_cpu(desc[i].length);
-
- msg->socket = sc;
- msg->cqe.done = is_read ? read_done : write_done;
- msg->completion = &completion;
-
- msg->sgt.sgl = &msg->sg_list[0];
- ret = sg_alloc_table_chained(&msg->sgt,
- get_buf_page_count(desc_buf, desc_buf_len),
- msg->sg_list, SG_CHUNK_SIZE);
- if (ret) {
- ret = -ENOMEM;
- goto free_msg;
- }
-
- ret = get_sg_list(desc_buf, desc_buf_len,
- msg->sgt.sgl, msg->sgt.orig_nents);
- if (ret < 0)
- goto free_table;
-
- ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
- msg->sgt.sgl,
- get_buf_page_count(desc_buf, desc_buf_len),
- 0,
- le64_to_cpu(desc[i].offset),
- le32_to_cpu(desc[i].token),
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- if (ret < 0) {
- pr_err("failed to init rdma_rw_ctx: %d\n", ret);
- goto free_table;
- }
-
- list_add_tail(&msg->list, &msg_list);
- desc_buf += desc_buf_len;
- }
-
- /* concatenate work requests of rdma_rw_ctxs */
- first_wr = NULL;
- list_for_each_entry_reverse(msg, &msg_list, list) {
- first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
- &msg->cqe, first_wr);
- }
-
- ret = ib_post_send(sc->ib.qp, first_wr, NULL);
- if (ret) {
- pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
- goto out;
- }
-
- msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
- wait_for_completion(&completion);
- ret = msg->error;
-out:
- list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
- list_del(&msg->list);
- smb_direct_free_rdma_rw_msg(t, msg,
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- }
- atomic_add(credits_needed, &sc->rw_io.credits.count);
- wake_up(&sc->rw_io.credits.wait_queue);
- return ret;
-
-free_table:
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
-free_msg:
- kfree(msg);
- goto out;
+ return smbdirect_connection_send_iter(sc, &iter, 0,
+ need_invalidate, remote_key);
}
static int smb_direct_rdma_write(struct ksmbd_transport *t,
@@ -1939,8 +258,11 @@ static int smb_direct_rdma_write(struct ksmbd_transport *t,
struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
- desc, desc_len, false);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = st->socket;
+
+ return smbdirect_connection_rdma_xmit(sc, buf, buflen,
+ desc, desc_len, false);
}
static int smb_direct_rdma_read(struct ksmbd_transport *t,
@@ -1948,16 +270,19 @@ static int smb_direct_rdma_read(struct ksmbd_transport *t,
struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
- desc, desc_len, true);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = st->socket;
+
+ return smbdirect_connection_rdma_xmit(sc, buf, buflen,
+ desc, desc_len, true);
}
static void smb_direct_disconnect(struct ksmbd_transport *t)
{
struct smb_direct_transport *st = SMBD_TRANS(t);
- struct smbdirect_socket *sc = &st->socket;
+ struct smbdirect_socket *sc = st->socket;
- ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);
+ ksmbd_debug(RDMA, "Disconnecting sc=%p\n", sc);
free_transport(st);
}
@@ -1965,840 +290,103 @@ static void smb_direct_disconnect(struct ksmbd_transport *t)
static void smb_direct_shutdown(struct ksmbd_transport *t)
{
struct smb_direct_transport *st = SMBD_TRANS(t);
- struct smbdirect_socket *sc = &st->socket;
-
- ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);
-
- smb_direct_disconnect_rdma_work(&sc->disconnect_work);
-}
-
-static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
-{
- struct smbdirect_socket *sc = cm_id->context;
- unsigned long flags;
-
- ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
- cm_id, rdma_event_msg(event->event), event->event);
-
- switch (event->event) {
- case RDMA_CM_EVENT_ESTABLISHED: {
- /*
- * Some drivers (at least mlx5_ib and irdma in roce mode)
-		 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED;
- * we need to adjust our expectation in that case.
- *
-		 * If smb_direct_negotiate_recv_done() was called first,
-		 * it initialized sc->connect.work only for us to
-		 * start, so that we moved into
-		 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
-		 * smb_direct_negotiate_recv_work() runs.
- *
-		 * If smb_direct_negotiate_recv_done() didn't happen
-		 * yet, sc->connect.work is still disabled and
- * queue_work() is a no-op.
- */
- if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
- break;
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
- spin_lock_irqsave(&sc->connect.lock, flags);
- if (!sc->first_error)
- queue_work(sc->workqueue, &sc->connect.work);
- spin_unlock_irqrestore(&sc->connect.lock, flags);
- wake_up(&sc->status_wait);
- break;
- }
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- case RDMA_CM_EVENT_DISCONNECTED: {
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- smb_direct_disconnect_rdma_work(&sc->disconnect_work);
- if (sc->ib.qp)
- ib_drain_qp(sc->ib.qp);
- break;
- }
- case RDMA_CM_EVENT_CONNECT_ERROR: {
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- smb_direct_disconnect_rdma_work(&sc->disconnect_work);
- break;
- }
- default:
- pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n",
- cm_id, rdma_event_msg(event->event),
- event->event);
- break;
- }
- return 0;
-}
+ struct smbdirect_socket *sc = st->socket;
-static void smb_direct_qpair_handler(struct ib_event *event, void *context)
-{
- struct smbdirect_socket *sc = context;
+ ksmbd_debug(RDMA, "smb-direct shutdown sc=%p\n", sc);
- ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n",
- sc->rdma.cm_id, ib_event_msg(event->event), event->event);
-
- switch (event->event) {
- case IB_EVENT_CQ_ERR:
- case IB_EVENT_QP_FATAL:
- smb_direct_disconnect_rdma_connection(sc);
- break;
- default:
- break;
- }
+ smbdirect_socket_shutdown(sc);
}
-static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc,
- int failed)
+static int smb_direct_new_connection(struct smb_direct_listener *listener,
+ struct smbdirect_socket *client_sc)
{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_send_io *sendmsg;
- struct smbdirect_negotiate_resp *resp;
- int ret;
-
- sendmsg = smb_direct_alloc_sendmsg(sc);
- if (IS_ERR(sendmsg))
- return -ENOMEM;
-
- resp = (struct smbdirect_negotiate_resp *)sendmsg->packet;
- if (failed) {
- memset(resp, 0, sizeof(*resp));
- resp->min_version = SMB_DIRECT_VERSION_LE;
- resp->max_version = SMB_DIRECT_VERSION_LE;
- resp->status = STATUS_NOT_SUPPORTED;
-
- sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
- } else {
- resp->status = STATUS_SUCCESS;
- resp->min_version = SMB_DIRECT_VERSION_LE;
- resp->max_version = SMB_DIRECT_VERSION_LE;
- resp->negotiated_version = SMB_DIRECT_VERSION_LE;
- resp->reserved = 0;
- resp->credits_requested =
- cpu_to_le16(sp->send_credit_target);
- resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
- resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
- resp->preferred_send_size = cpu_to_le32(sp->max_send_size);
- resp->max_receive_size = cpu_to_le32(sp->max_recv_size);
- resp->max_fragmented_size =
- cpu_to_le32(sp->max_fragmented_recv_size);
-
- atomic_set(&sc->send_io.bcredits.count, 1);
- sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
- sc->status = SMBDIRECT_SOCKET_CONNECTED;
- }
-
- sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
- (void *)resp, sizeof(*resp),
- DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
- if (ret) {
- smb_direct_free_sendmsg(sc, sendmsg);
- return ret;
- }
-
- sendmsg->num_sge = 1;
- sendmsg->sge[0].length = sizeof(*resp);
- sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
- ret = post_sendmsg(sc, NULL, sendmsg);
- if (ret) {
- smb_direct_free_sendmsg(sc, sendmsg);
- return ret;
- }
-
- wait_event(sc->send_io.pending.zero_wait_queue,
- atomic_read(&sc->send_io.pending.count) == 0 ||
- sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
- return -ENOTCONN;
-
- return 0;
-}
-
-static int smb_direct_accept_client(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct rdma_conn_param conn_param;
- __be32 ird_ord_hdr[2];
- int ret;
-
- /*
- * smb_direct_handle_connect_request()
- * already negotiated sp->initiator_depth
- * and sp->responder_resources
- */
- memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = sp->initiator_depth;
- conn_param.responder_resources = sp->responder_resources;
-
- if (sc->rdma.legacy_iwarp) {
- ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
- ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
- conn_param.private_data = ird_ord_hdr;
- conn_param.private_data_len = sizeof(ird_ord_hdr);
- } else {
- conn_param.private_data = NULL;
- conn_param.private_data_len = 0;
- }
- conn_param.retry_count = SMB_DIRECT_CM_RETRY;
- conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
- conn_param.flow_control = 0;
-
- /*
- * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
- * so that the timer will cause a disconnect.
- */
- sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
- mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
- msecs_to_jiffies(sp->negotiate_timeout_msec));
-
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
- ret = rdma_accept(sc->rdma.cm_id, &conn_param);
- if (ret) {
- pr_err("error at rdma_accept: %d\n", ret);
- return ret;
- }
- return 0;
-}
-
-static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *recvmsg;
- bool recv_posted = false;
+ struct smb_direct_transport *t;
+ struct task_struct *handler;
int ret;
- WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
- sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
-
- sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
-
- recvmsg = get_free_recvmsg(sc);
- if (!recvmsg)
+ t = alloc_transport(client_sc);
+ if (!t) {
+ smbdirect_socket_release(client_sc);
return -ENOMEM;
- recvmsg->cqe.done = smb_direct_negotiate_recv_done;
-
- ret = smb_direct_post_recv(sc, recvmsg);
- if (ret) {
- pr_err("Can't post recv: %d\n", ret);
- goto out_err;
}
- recv_posted = true;
- ret = smb_direct_accept_client(sc);
- if (ret) {
- pr_err("Can't accept client\n");
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ listener->port);
+ if (IS_ERR(handler)) {
+ ret = PTR_ERR(handler);
+ pr_err("Can't start thread\n");
goto out_err;
}
return 0;
out_err:
- /*
- * If the recv was never posted, return it to the free list.
- * If it was posted, leave it alone so disconnect teardown can
- * drain the QP and complete it (flush) and the completion path
- * will unmap it exactly once.
- */
- if (!recv_posted)
- put_recvmsg(sc, recvmsg);
- return ret;
-}
-
-static int smb_direct_init_params(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int max_send_sges;
- unsigned int maxpages;
-
-	/* We need 3 more SGEs, because an SMB_DIRECT header, an SMB2 header,
-	 * and an SMB2 response could be mapped.
- */
- max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
- if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
- pr_err("max_send_size %d is too large\n", sp->max_send_size);
- return -EINVAL;
- }
-
- atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
-
- maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
- sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
- sc->rdma.cm_id->port_num,
- maxpages);
- sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
- /* add one extra in order to handle unaligned pages */
- sc->rw_io.credits.max += 1;
-
- sc->recv_io.credits.target = 1;
-
- atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
-
- return 0;
-}
-
-static void smb_direct_destroy_pools(struct smbdirect_socket *sc)
-{
- struct smbdirect_recv_io *recvmsg;
-
- while ((recvmsg = get_free_recvmsg(sc)))
- mempool_free(recvmsg, sc->recv_io.mem.pool);
-
- mempool_destroy(sc->recv_io.mem.pool);
- sc->recv_io.mem.pool = NULL;
-
- kmem_cache_destroy(sc->recv_io.mem.cache);
- sc->recv_io.mem.cache = NULL;
-
- mempool_destroy(sc->send_io.mem.pool);
- sc->send_io.mem.pool = NULL;
-
- kmem_cache_destroy(sc->send_io.mem.cache);
- sc->send_io.mem.cache = NULL;
-}
-
-static int smb_direct_create_pools(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- char name[80];
- int i;
- struct smbdirect_recv_io *recvmsg;
-
- snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc);
- sc->send_io.mem.cache = kmem_cache_create(name,
- sizeof(struct smbdirect_send_io) +
- sizeof(struct smbdirect_negotiate_resp),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!sc->send_io.mem.cache)
- return -ENOMEM;
-
- sc->send_io.mem.pool = mempool_create(sp->send_credit_target,
- mempool_alloc_slab, mempool_free_slab,
- sc->send_io.mem.cache);
- if (!sc->send_io.mem.pool)
- goto err;
-
- snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc);
- sc->recv_io.mem.cache = kmem_cache_create(name,
- sizeof(struct smbdirect_recv_io) +
- sp->max_recv_size,
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!sc->recv_io.mem.cache)
- goto err;
-
- sc->recv_io.mem.pool =
- mempool_create(sp->recv_credit_max, mempool_alloc_slab,
- mempool_free_slab, sc->recv_io.mem.cache);
- if (!sc->recv_io.mem.pool)
- goto err;
-
- for (i = 0; i < sp->recv_credit_max; i++) {
- recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP);
- if (!recvmsg)
- goto err;
- recvmsg->socket = sc;
- recvmsg->sge.length = 0;
- list_add(&recvmsg->list, &sc->recv_io.free.list);
- }
-
- return 0;
-err:
- smb_direct_destroy_pools(sc);
- return -ENOMEM;
-}
-
-static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr)
-{
- /*
- * This could be split out of rdma_rw_init_qp()
- * and be a helper function next to rdma_rw_mr_factor()
- *
- * We can't check unlikely(rdma_rw_force_mr) here,
- * but that is most likely 0 anyway.
- */
- u32 factor;
-
- WARN_ON_ONCE(attr->port_num == 0);
-
- /*
- * Each context needs at least one RDMA READ or WRITE WR.
- *
-	 * For some hardware we might need more; eventually we should ask the
- * HCA driver for a multiplier here.
- */
- factor = 1;
-
- /*
- * If the device needs MRs to perform RDMA READ or WRITE operations,
- * we'll need two additional MRs for the registrations and the
- * invalidation.
- */
- if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
- factor += 2; /* inv + reg */
-
- return factor * attr->cap.max_rdma_ctxs;
-}
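
/*
 * A worked example with hypothetical numbers: on an iWarp device (or
 * one with attrs.max_sgl_rd set) factor = 1 + 2 = 3, so
 * max_rdma_ctxs = 16 reserves 3 * 16 = 48 additional send WRs for
 * the RDMA READ/WRITE contexts; without the MR requirement it is
 * just 1 * 16 = 16.
 */
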
-
-static int smb_direct_create_qpair(struct smbdirect_socket *sc)
-{
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- int ret;
- struct ib_qp_cap qp_cap;
- struct ib_qp_init_attr qp_attr;
- u32 max_send_wr;
- u32 rdma_send_wr;
-
- /*
- * Note that {rdma,ib}_create_qp() will call
- * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
- * It will adjust cap->max_send_wr to the required
- * number of additional WRs for the RDMA RW operations.
- * It will cap cap->max_send_wr to the device limit.
- *
- * +1 for ib_drain_qp
- */
- qp_cap.max_send_wr = sp->send_credit_target + 1;
- qp_cap.max_recv_wr = sp->recv_credit_max + 1;
- qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
- qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
- qp_cap.max_inline_data = 0;
- qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
-
- /*
- * Find out the number of max_send_wr
- * after rdma_rw_init_qp() adjusted it.
- *
- * We only do it on a temporary variable,
- * as rdma_create_qp() will trigger
- * rdma_rw_init_qp() again.
- */
- memset(&qp_attr, 0, sizeof(qp_attr));
- qp_attr.cap = qp_cap;
- qp_attr.port_num = sc->rdma.cm_id->port_num;
- rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
- max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
-
- if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
- qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
- pr_err("Possible CQE overrun: max_send_wr %d\n",
- qp_cap.max_send_wr);
- pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
- IB_DEVICE_NAME_MAX,
- sc->ib.dev->name,
- sc->ib.dev->attrs.max_cqe,
- sc->ib.dev->attrs.max_qp_wr);
- pr_err("consider lowering send_credit_target = %d\n",
- sp->send_credit_target);
- return -EINVAL;
- }
-
- if (qp_cap.max_rdma_ctxs &&
- (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
- max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
- pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
- rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
- pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
- IB_DEVICE_NAME_MAX,
- sc->ib.dev->name,
- sc->ib.dev->attrs.max_cqe,
- sc->ib.dev->attrs.max_qp_wr);
- pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
- sp->send_credit_target, qp_cap.max_rdma_ctxs);
- return -EINVAL;
- }
-
- if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
- qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
- pr_err("Possible CQE overrun: max_recv_wr %d\n",
- qp_cap.max_recv_wr);
- pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
- IB_DEVICE_NAME_MAX,
- sc->ib.dev->name,
- sc->ib.dev->attrs.max_cqe,
- sc->ib.dev->attrs.max_qp_wr);
- pr_err("consider lowering receive_credit_max = %d\n",
- sp->recv_credit_max);
- return -EINVAL;
- }
-
- if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
- qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
- pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
- IB_DEVICE_NAME_MAX,
- sc->ib.dev->name,
- sc->ib.dev->attrs.max_send_sge,
- sc->ib.dev->attrs.max_recv_sge);
- return -EINVAL;
- }
-
- sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
- if (IS_ERR(sc->ib.pd)) {
- pr_err("Can't create RDMA PD\n");
- ret = PTR_ERR(sc->ib.pd);
- sc->ib.pd = NULL;
- return ret;
- }
-
- sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- max_send_wr,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sc->ib.send_cq)) {
- pr_err("Can't create RDMA send CQ\n");
- ret = PTR_ERR(sc->ib.send_cq);
- sc->ib.send_cq = NULL;
- goto err;
- }
-
- sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
- qp_cap.max_recv_wr,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sc->ib.recv_cq)) {
- pr_err("Can't create RDMA recv CQ\n");
- ret = PTR_ERR(sc->ib.recv_cq);
- sc->ib.recv_cq = NULL;
- goto err;
- }
-
- /*
- * We reset completely here!
- * As the above use was just temporary
- * to calc max_send_wr and rdma_send_wr.
- *
- * rdma_create_qp() will trigger rdma_rw_init_qp()
- * again if max_rdma_ctxs is not 0.
- */
- memset(&qp_attr, 0, sizeof(qp_attr));
- qp_attr.event_handler = smb_direct_qpair_handler;
- qp_attr.qp_context = sc;
- qp_attr.cap = qp_cap;
- qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = sc->ib.send_cq;
- qp_attr.recv_cq = sc->ib.recv_cq;
- qp_attr.port_num = ~0;
-
- ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
- if (ret) {
- pr_err("Can't create RDMA QP: %d\n", ret);
- goto err;
- }
-
- sc->ib.qp = sc->rdma.cm_id->qp;
- sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
-
- return 0;
-err:
- if (sc->ib.qp) {
- sc->ib.qp = NULL;
- rdma_destroy_qp(sc->rdma.cm_id);
- }
- if (sc->ib.recv_cq) {
- ib_destroy_cq(sc->ib.recv_cq);
- sc->ib.recv_cq = NULL;
- }
- if (sc->ib.send_cq) {
- ib_destroy_cq(sc->ib.send_cq);
- sc->ib.send_cq = NULL;
- }
- if (sc->ib.pd) {
- ib_dealloc_pd(sc->ib.pd);
- sc->ib.pd = NULL;
- }
- return ret;
-}
-
-static int smb_direct_prepare(struct ksmbd_transport *t)
-{
- struct smb_direct_transport *st = SMBD_TRANS(t);
- struct smbdirect_socket *sc = &st->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
- struct smbdirect_recv_io *recvmsg;
- struct smbdirect_negotiate_req *req;
- unsigned long flags;
- int ret;
-
- /*
- * We are waiting to pass the following states:
- *
- * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED
- * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING
- * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
- *
- * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING
- * in order to continue below.
- *
- * Everything else is unexpected and an error.
- */
- ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
- ret = wait_event_interruptible_timeout(sc->status_wait,
- sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED &&
- sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING &&
- sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
- msecs_to_jiffies(sp->negotiate_timeout_msec));
- if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
- return ret < 0 ? ret : -ETIMEDOUT;
-
- recvmsg = get_first_reassembly(sc);
- if (!recvmsg)
- return -ECONNABORTED;
-
- ret = smb_direct_check_recvmsg(recvmsg);
- if (ret)
- goto put;
-
- req = (struct smbdirect_negotiate_req *)recvmsg->packet;
- sp->max_recv_size = min_t(u32, sp->max_recv_size,
- le32_to_cpu(req->preferred_send_size));
- sp->max_send_size = min_t(u32, sp->max_send_size,
- le32_to_cpu(req->max_receive_size));
- sp->max_fragmented_send_size =
- le32_to_cpu(req->max_fragmented_size);
- /*
- * The maximum fragmented upper-layer payload receive size supported
- *
- * Assume max_payload_per_credit is
-	 * smb_direct_max_receive_size - 24 = 1340
- *
- * The maximum number would be
- * smb_direct_receive_credit_max * max_payload_per_credit
- *
- * 1340 * 255 = 341700 (0x536C4)
- *
- * The minimum value from the spec is 131072 (0x20000)
- *
- * For now we use the logic we used before:
- * (1364 * 255) / 2 = 173910 (0x2A756)
- *
- * We need to adjust this here in case the peer
- * lowered sp->max_recv_size.
- *
- * TODO: instead of adjusting max_fragmented_recv_size
- * we should adjust the number of available buffers,
- * but for now we keep the current logic.
- */
- sp->max_fragmented_recv_size =
- (sp->recv_credit_max * sp->max_recv_size) / 2;
- sc->recv_io.credits.target = le16_to_cpu(req->credits_requested);
- sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
- sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
-
-put:
- spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- sc->recv_io.reassembly.queue_length--;
- list_del(&recvmsg->list);
- spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- put_recvmsg(sc, recvmsg);
-
- if (ret == -ECONNABORTED)
- return ret;
-
- if (ret)
- goto respond;
-
- /*
- * We negotiated with success, so we need to refill the recv queue.
- * We do that with sc->idle.immediate_work still being disabled
- * via smbdirect_socket_init(), so that queue_work(sc->workqueue,
- * &sc->idle.immediate_work) in smb_direct_post_recv_credits()
- * is a no-op.
- *
- * The message that grants the credits to the client is
- * the negotiate response.
- */
- INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits);
- smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
- if (unlikely(sc->first_error))
- return sc->first_error;
- INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
-
-respond:
- ret = smb_direct_send_negotiate_response(sc, ret);
-
+ free_transport(t);
return ret;
}
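
/*
 * A worked example (hypothetical peer values) for the clamping in
 * smb_direct_prepare() above. Assume the server starts with
 * max_recv_size = max_send_size = 1364 and recv_credit_max = 255,
 * and the peer advertises preferred_send_size = 1024,
 * max_receive_size = 8192 and credits_requested = 512:
 *   max_recv_size            = min(1364, 1024) = 1024
 *   max_send_size            = min(1364, 8192) = 1364
 *   max_fragmented_recv_size = (255 * 1024) / 2 = 130560
 *   recv_io.credits.target   = clamp(512, 1, 255) = 255
 */
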
-static int smb_direct_connect(struct smbdirect_socket *sc)
+static int smb_direct_listener_kthread_fn(void *p)
{
- struct smbdirect_recv_io *recv_io;
- int ret;
+ struct smb_direct_listener *listener = (struct smb_direct_listener *)p;
+ struct smbdirect_socket *client_sc = NULL;
- ret = smb_direct_init_params(sc);
- if (ret) {
- pr_err("Can't configure RDMA parameters\n");
- return ret;
- }
+ while (!kthread_should_stop()) {
+ struct proto_accept_arg arg = { .err = -EINVAL, };
+ long timeo = MAX_SCHEDULE_TIMEOUT;
- ret = smb_direct_create_pools(sc);
- if (ret) {
- pr_err("Can't init RDMA pool: %d\n", ret);
- return ret;
- }
-
- list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
- recv_io->cqe.done = recv_done;
+ if (!listener->socket)
+ break;
+ client_sc = smbdirect_socket_accept(listener->socket, timeo, &arg);
+ if (!client_sc && arg.err == -EINVAL)
+ break;
+ if (!client_sc)
+ continue;
- ret = smb_direct_create_qpair(sc);
- if (ret) {
- pr_err("Can't accept RDMA client: %d\n", ret);
- return ret;
+ ksmbd_debug(CONN, "connect success: accepted new connection\n");
+ smb_direct_new_connection(listener, client_sc);
}
- ret = smb_direct_prepare_negotiation(sc);
- if (ret) {
- pr_err("Can't negotiate: %d\n", ret);
- return ret;
- }
+ ksmbd_debug(CONN, "releasing socket\n");
return 0;
}
-static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
+static void smb_direct_listener_destroy(struct smb_direct_listener *listener)
{
- if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
- return false;
- if (attrs->max_fast_reg_page_list_len == 0)
- return false;
- return true;
-}
-
-static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
- struct rdma_cm_event *event)
-{
- struct smb_direct_listener *listener = new_cm_id->context;
- struct smb_direct_transport *t;
- struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
- struct task_struct *handler;
- u8 peer_initiator_depth;
- u8 peer_responder_resources;
int ret;
- if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
- ksmbd_debug(RDMA,
-			    "Fast Registration Work Requests are not supported. device capabilities=%llx\n",
- new_cm_id->device->attrs.device_cap_flags);
- return -EPROTONOSUPPORT;
- }
+ if (listener->socket)
+ smbdirect_socket_shutdown(listener->socket);
- t = alloc_transport(new_cm_id);
- if (!t)
- return -ENOMEM;
- sc = &t->socket;
- sp = &sc->parameters;
-
- peer_initiator_depth = event->param.conn.initiator_depth;
- peer_responder_resources = event->param.conn.responder_resources;
- if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
- event->param.conn.private_data_len == 8) {
- /*
- * Legacy clients with only iWarp MPA v1 support
- * need a private blob in order to negotiate
- * the IRD/ORD values.
- */
- const __be32 *ird_ord_hdr = event->param.conn.private_data;
- u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
- u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
-
- /*
- * cifs.ko sends the legacy IRD/ORD negotiation
- * event if iWarp MPA v2 was used.
-		 * even if iWarp MPA v2 was used.
- * Here we check that the values match and only
- * mark the client as legacy if they don't match.
- */
- if ((u32)event->param.conn.initiator_depth != ird32 ||
- (u32)event->param.conn.responder_resources != ord32) {
- /*
- * There are broken clients (old cifs.ko)
- * using little endian and also
- * struct rdma_conn_param only uses u8
- * for initiator_depth and responder_resources,
- * so we truncate the value to U8_MAX.
- *
- * smb_direct_accept_client() will then
- * do the real negotiation in order to
- * select the minimum between client and
- * server.
- */
- ird32 = min_t(u32, ird32, U8_MAX);
- ord32 = min_t(u32, ord32, U8_MAX);
-
- sc->rdma.legacy_iwarp = true;
- peer_initiator_depth = (u8)ird32;
- peer_responder_resources = (u8)ord32;
- }
+ if (listener->thread) {
+ ret = kthread_stop(listener->thread);
+ if (ret)
+ pr_err("failed to stop forker thread\n");
+ listener->thread = NULL;
}
- /*
-	 * First set what we as the server are able to support.
- */
- sp->initiator_depth = min_t(u8, sp->initiator_depth,
- new_cm_id->device->attrs.max_qp_rd_atom);
-
- /*
- * negotiate the value by using the minimum
- * between client and server if the client provided
-	 * non-zero values.
- */
- if (peer_initiator_depth != 0)
- sp->initiator_depth = min_t(u8, sp->initiator_depth,
- peer_initiator_depth);
- if (peer_responder_resources != 0)
- sp->responder_resources = min_t(u8, sp->responder_resources,
- peer_responder_resources);
-
- ret = smb_direct_connect(sc);
- if (ret)
- goto out_err;
-
- handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- listener->port);
- if (IS_ERR(handler)) {
- ret = PTR_ERR(handler);
- pr_err("Can't start thread\n");
- goto out_err;
+ if (listener->socket) {
+ smbdirect_socket_release(listener->socket);
+ listener->socket = NULL;
}
- return 0;
-out_err:
- free_transport(t);
- return ret;
-}
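
/*
 * A minimal sketch (illustrative names) of the legacy iWarp MPA v1
 * private blob handled above: two big-endian 32-bit values, IRD then
 * ORD, truncated to the u8 range of struct rdma_conn_param.
 */
#include <linux/kernel.h>
#include <asm/byteorder.h>

static void demo_decode_ird_ord(const void *private_data, u8 *ird, u8 *ord)
{
	const __be32 *hdr = private_data;	/* private_data_len == 8 */

	*ird = (u8)min_t(u32, be32_to_cpu(hdr[0]), U8_MAX);
	*ord = (u8)min_t(u32, be32_to_cpu(hdr[1]), U8_MAX);
}
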
-
-static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
-{
- switch (event->event) {
- case RDMA_CM_EVENT_CONNECT_REQUEST: {
- int ret = smb_direct_handle_connect_request(cm_id, event);
-
- if (ret) {
- pr_err("Can't create transport: %d\n", ret);
- return ret;
- }
-
- ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n",
- cm_id);
- break;
- }
- default:
- pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n",
- cm_id, rdma_event_msg(event->event), event->event);
- break;
- }
- return 0;
+ listener->port = 0;
}
static int smb_direct_listen(struct smb_direct_listener *listener,
int port)
{
- int ret;
- struct rdma_cm_id *cm_id;
- u8 node_type = RDMA_NODE_UNSPECIFIED;
+ struct net *net = current->nsproxy->net_ns;
+ struct task_struct *kthread;
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
+ struct smbdirect_socket_parameters init_params = {};
+ struct smbdirect_socket_parameters *sp;
+ struct smbdirect_socket *sc;
+ u64 port_flags = 0;
+ int ret;
switch (port) {
case SMB_DIRECT_PORT_IWARP:
@@ -2806,7 +394,7 @@ static int smb_direct_listen(struct smb_direct_listener *listener,
* only allow iWarp devices
* for port 5445.
*/
- node_type = RDMA_NODE_RNIC;
+ port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
break;
case SMB_DIRECT_PORT_INFINIBAND:
/*
@@ -2815,119 +403,95 @@ static int smb_direct_listen(struct smb_direct_listener *listener,
*
* (Basically don't allow iWarp devices)
*/
- node_type = RDMA_NODE_IB_CA;
+ port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
break;
default:
pr_err("unsupported smbdirect port=%d!\n", port);
return -ENODEV;
}
- cm_id = rdma_create_id(&init_net, smb_direct_listen_handler,
- listener, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id)) {
- pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id));
- return PTR_ERR(cm_id);
+ ret = smbdirect_socket_create_kern(net, &sc);
+ if (ret) {
+ pr_err("smbdirect_socket_create_kern() failed: %d %1pe\n",
+ ret, ERR_PTR(ret));
+ return ret;
}
- ret = rdma_restrict_node_type(cm_id, node_type);
+ /*
+ * Create the initial parameters
+ */
+ sp = &init_params;
+ sp->flags |= port_flags;
+ sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000;
+ sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
+ sp->responder_resources = 1;
+ sp->recv_credit_max = smb_direct_receive_credit_max;
+ sp->send_credit_target = smb_direct_send_credit_target;
+ sp->max_send_size = smb_direct_max_send_size;
+ sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+ sp->max_recv_size = smb_direct_max_receive_size;
+ sp->max_read_write_size = smb_direct_max_read_write_size;
+ sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000;
+ sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000;
+
+ smbdirect_socket_set_logging(sc, NULL,
+ smb_direct_logging_needed,
+ smb_direct_logging_vaprintf);
+ ret = smbdirect_socket_set_initial_parameters(sc, sp);
+ if (ret) {
+ pr_err("Failed smbdirect_socket_set_initial_parameters(): %d %1pe\n",
+ ret, ERR_PTR(ret));
+ goto err;
+ }
+ ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_WORKQUEUE, KSMBD_DEFAULT_GFP);
if (ret) {
- pr_err("rdma_restrict_node_type(%u) failed %d\n",
- node_type, ret);
+ pr_err("Failed smbdirect_socket_set_kernel_settings(): %d %1pe\n",
+ ret, ERR_PTR(ret));
goto err;
}
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = smbdirect_socket_bind(sc, (struct sockaddr *)&sin);
if (ret) {
- pr_err("Can't bind: %d\n", ret);
+ pr_err("smbdirect_socket_bind() failed: %d %1pe\n",
+ ret, ERR_PTR(ret));
goto err;
}
- ret = rdma_listen(cm_id, 10);
+ ret = smbdirect_socket_listen(sc, 10);
if (ret) {
- pr_err("Can't listen: %d\n", ret);
+ pr_err("Port[%d] smbdirect_socket_listen() failed: %d %1pe\n",
+ port, ret, ERR_PTR(ret));
goto err;
}
listener->port = port;
- listener->cm_id = cm_id;
+ listener->socket = sc;
+
+ kthread = kthread_run(smb_direct_listener_kthread_fn,
+ listener,
+ "ksmbd-smbdirect-listener-%u", port);
+ if (IS_ERR(kthread)) {
+ ret = PTR_ERR(kthread);
+ pr_err("Can't start ksmbd listen kthread: %d %1pe\n",
+ ret, ERR_PTR(ret));
+ goto err;
+ }
+ listener->thread = kthread;
return 0;
err:
- listener->port = 0;
- listener->cm_id = NULL;
- rdma_destroy_id(cm_id);
+ smb_direct_listener_destroy(listener);
return ret;
}
-static int smb_direct_ib_client_add(struct ib_device *ib_dev)
-{
- struct smb_direct_device *smb_dev;
-
- if (!rdma_frwr_is_supported(&ib_dev->attrs))
- return 0;
-
- smb_dev = kzalloc_obj(*smb_dev, KSMBD_DEFAULT_GFP);
- if (!smb_dev)
- return -ENOMEM;
- smb_dev->ib_dev = ib_dev;
-
- write_lock(&smb_direct_device_lock);
- list_add(&smb_dev->list, &smb_direct_device_list);
- write_unlock(&smb_direct_device_lock);
-
- ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name);
- return 0;
-}
-
-static void smb_direct_ib_client_remove(struct ib_device *ib_dev,
- void *client_data)
-{
- struct smb_direct_device *smb_dev, *tmp;
-
- write_lock(&smb_direct_device_lock);
- list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) {
- if (smb_dev->ib_dev == ib_dev) {
- list_del(&smb_dev->list);
- kfree(smb_dev);
- break;
- }
- }
- write_unlock(&smb_direct_device_lock);
-}
-
-static struct ib_client smb_direct_ib_client = {
- .name = "ksmbd_smb_direct_ib",
- .add = smb_direct_ib_client_add,
- .remove = smb_direct_ib_client_remove,
-};
-
int ksmbd_rdma_init(void)
{
int ret;
smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
- .cm_id = NULL,
+ .socket = NULL,
};
- ret = ib_register_client(&smb_direct_ib_client);
- if (ret) {
- pr_err("failed to ib_register_client\n");
- return ret;
- }
-
-	/* When a client runs out of send credits, the credits are
-	 * granted by the server sending a packet using this queue.
-	 * This avoids the situation where a client cannot send packets
-	 * for lack of credits.
-	 */
- smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
- 0);
- if (!smb_direct_wq) {
- ret = -ENOMEM;
- goto err;
- }
-
ret = smb_direct_listen(&smb_direct_ib_listener,
SMB_DIRECT_PORT_INFINIBAND);
if (ret) {
@@ -2935,8 +499,8 @@ int ksmbd_rdma_init(void)
goto err;
}
- ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. cm_id=%p\n",
- smb_direct_ib_listener.cm_id);
+ ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. socket=%p\n",
+ smb_direct_ib_listener.socket);
ret = smb_direct_listen(&smb_direct_iw_listener,
SMB_DIRECT_PORT_IWARP);
@@ -2945,107 +509,29 @@ int ksmbd_rdma_init(void)
goto err;
}
- ksmbd_debug(RDMA, "iWarp RDMA listener. cm_id=%p\n",
- smb_direct_iw_listener.cm_id);
+ ksmbd_debug(RDMA, "iWarp RDMA listener. socket=%p\n",
+ smb_direct_iw_listener.socket);
return 0;
err:
ksmbd_rdma_stop_listening();
- ksmbd_rdma_destroy();
return ret;
}
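
The workqueue comment deleted above describes SMB Direct's
credit-based flow control: a peer that has exhausted its send credits
may only transmit again once the other side grants new ones, so the
server must be able to send a pure credit-grant packet even under
memory pressure. As a sketch, such a packet is an ordinary data
transfer PDU with no payload; the field names follow struct
smbdirect_data_transfer in fs/smb/common/smbdirect/smbdirect_pdu.h,
while the helper itself is hypothetical:

	#include "../common/smbdirect/smbdirect_pdu.h"

	/* Hypothetical helper: fill a zero-payload PDU whose only job is
	 * to grant the peer fresh send credits.
	 */
	static void sketch_fill_credit_grant(struct smbdirect_data_transfer *pkt,
					     u16 grant, u16 want)
	{
		pkt->credits_granted = cpu_to_le16(grant);
		pkt->credits_requested = cpu_to_le16(want);
		pkt->flags = 0;
		pkt->reserved = 0;
		pkt->remaining_data_length = 0;
		pkt->data_offset = 0;
		pkt->data_length = 0; /* no payload: pure credit grant */
	}
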
void ksmbd_rdma_stop_listening(void)
{
- if (!smb_direct_ib_listener.cm_id && !smb_direct_iw_listener.cm_id)
- return;
-
- ib_unregister_client(&smb_direct_ib_client);
-
- if (smb_direct_ib_listener.cm_id)
- rdma_destroy_id(smb_direct_ib_listener.cm_id);
- if (smb_direct_iw_listener.cm_id)
- rdma_destroy_id(smb_direct_iw_listener.cm_id);
-
- smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
- .cm_id = NULL,
- };
-}
-
-void ksmbd_rdma_destroy(void)
-{
- if (smb_direct_wq) {
- destroy_workqueue(smb_direct_wq);
- smb_direct_wq = NULL;
- }
-}
-
-static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev)
-{
- struct smb_direct_device *smb_dev;
- int i;
- bool rdma_capable = false;
-
- read_lock(&smb_direct_device_lock);
- list_for_each_entry(smb_dev, &smb_direct_device_list, list) {
- for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
- struct net_device *ndev;
-
- ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1);
- if (!ndev)
- continue;
-
- if (ndev == netdev) {
- dev_put(ndev);
- rdma_capable = true;
- goto out;
- }
- dev_put(ndev);
- }
- }
-out:
- read_unlock(&smb_direct_device_lock);
-
- if (rdma_capable == false) {
- struct ib_device *ibdev;
-
- ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
- if (ibdev) {
- rdma_capable = rdma_frwr_is_supported(&ibdev->attrs);
- ib_device_put(ibdev);
- }
- }
-
- ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n",
- netdev->name, str_true_false(rdma_capable));
-
- return rdma_capable;
+ smb_direct_listener_destroy(&smb_direct_ib_listener);
+ smb_direct_listener_destroy(&smb_direct_iw_listener);
}
bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- if (ksmbd_find_rdma_capable_netdev(netdev))
- return true;
-
- /* check if netdev is bridge or VLAN */
- if (netif_is_bridge_master(netdev) ||
- netdev->priv_flags & IFF_802_1Q_VLAN)
- netdev_for_each_lower_dev(netdev, lower_dev, iter)
- if (ksmbd_find_rdma_capable_netdev(lower_dev))
- return true;
-
- /* check if netdev is IPoIB safely without layer violation */
- if (netdev->type == ARPHRD_INFINIBAND)
- return true;
+ u8 node_type = smbdirect_netdev_rdma_capable_node_type(netdev);
- return false;
+ return node_type != RDMA_NODE_UNSPECIFIED;
}
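
Because the helper reports the RDMA node type rather than a plain
bool, callers could also tell the transport flavors apart, e.g. for
debug output. An illustrative sketch (not part of the patch):

	#include <rdma/ib_verbs.h> /* RDMA_NODE_* constants */

	static const char *sketch_rdma_flavor(struct net_device *netdev)
	{
		switch (smbdirect_netdev_rdma_capable_node_type(netdev)) {
		case RDMA_NODE_RNIC:
			return "iWARP";
		case RDMA_NODE_IB_CA:
			return "InfiniBand/RoCE";
		default:
			return "no RDMA";
		}
	}
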
static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
- .prepare = smb_direct_prepare,
.disconnect = smb_direct_disconnect,
.shutdown = smb_direct_shutdown,
.writev = smb_direct_writev,
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index 3f93c6a9f7e4..05352dc47f95 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -14,17 +14,17 @@
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
void ksmbd_rdma_stop_listening(void);
-void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
static inline void ksmbd_rdma_stop_listening(void) { }
-static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
#endif
+#include "../common/smbdirect/smbdirect.h"
+
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
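
The static inline stubs in the #else branch let generic ksmbd code
call these helpers without any #ifdef CONFIG_SMB_SERVER_SMBDIRECT at
the call site. A hypothetical caller, shown only to illustrate the
compile-out pattern:

	/* Compiles either way; returns false when SMB Direct support is
	 * configured out, via the stub above.
	 */
	static bool sketch_iface_supports_rdma(struct net_device *ndev)
	{
		return ksmbd_rdma_capable_netdev(ndev);
	}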