Message-ID: <180be553-8297-4802-972f-d73f30da365a@redhat.com>
Date: Tue, 26 Aug 2025 12:10:54 +0200
From: Paolo Abeni <pabeni@...hat.com>
To: John Ousterhout <ouster@...stanford.edu>, netdev@...r.kernel.org
Cc: edumazet@...gle.com, horms@...nel.org, kuba@...nel.org
Subject: Re: [PATCH net-next v15 06/15] net: homa: create homa_sock.h and
homa_sock.c
On 8/18/25 10:55 PM, John Ousterhout wrote:
> +/**
> + * homa_socktab_next() - Return the next socket in an iteration over a socktab.
> + * @scan: State of the scan.
> + *
> + * Return: The next socket in the table, or NULL if the iteration has
> + * returned all of the sockets in the table. If non-NULL, a
> + * reference is held on the socket to prevent its deletion.
> + * Sockets are not returned in any particular order. It's
> + * possible that the returned socket has been destroyed.
> + */
> +struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan)
> +{
> + struct hlist_head *bucket;
> + struct hlist_node *next;
> +
> + rcu_read_lock();
> + if (scan->hsk) {
> + sock_put(&scan->hsk->sock);
> + next = rcu_dereference(hlist_next_rcu(&scan->hsk->socktab_links));
> + if (next)
> + goto success;
> + }
> + for (scan->current_bucket++;
> + scan->current_bucket < HOMA_SOCKTAB_BUCKETS;
> + scan->current_bucket++) {
> + bucket = &scan->socktab->buckets[scan->current_bucket];
> + next = rcu_dereference(hlist_first_rcu(bucket));
> + if (next)
> + goto success;
> + }
> + scan->hsk = NULL;
> + rcu_read_unlock();
> + return NULL;
> +
> +success:
> + scan->hsk = hlist_entry(next, struct homa_sock, socktab_links);
Minor nit: double space above.
> + sock_hold(&scan->hsk->sock);
> + rcu_read_unlock();
> + return scan->hsk;
> +}
> +
> +/**
> + * homa_socktab_end_scan() - Must be invoked on completion of each scan
> + * to clean up state associated with the scan.
> + * @scan: State of the scan.
> + */
> +void homa_socktab_end_scan(struct homa_socktab_scan *scan)
> +{
> + if (scan->hsk) {
> + sock_put(&scan->hsk->sock);
> + scan->hsk = NULL;
> + }
> +}
> +
> +/**
> + * homa_sock_init() - Constructor for homa_sock objects. This function
> + * initializes only the parts of the socket that are owned by Homa.
> + * @hsk: Object to initialize. The Homa-specific parts must have been
> + * initialized to zeroes by the caller.
> + *
> + * Return: 0 for success, otherwise a negative errno.
> + */
> +int homa_sock_init(struct homa_sock *hsk)
> +{
> + struct homa_pool *buffer_pool;
> + struct homa_socktab *socktab;
> + struct homa_sock *other;
> + struct homa_net *hnet;
> + struct homa *homa;
> + int starting_port;
> + int result = 0;
> + int i;
> +
> + hnet = (struct homa_net *)net_generic(sock_net(&hsk->sock),
> + homa_net_id);
> + homa = hnet->homa;
> + socktab = homa->socktab;
> +
> + /* Initialize fields outside the Homa part. */
> + hsk->sock.sk_sndbuf = homa->wmem_max;
> + sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE);
> +
> + /* Do things requiring memory allocation before locking the socket,
> + * so that GFP_ATOMIC is not needed.
> + */
> + buffer_pool = homa_pool_alloc(hsk);
> + if (IS_ERR(buffer_pool))
> + return PTR_ERR(buffer_pool);
> +
> + /* Initialize Homa-specific fields. */
> + hsk->homa = homa;
> + hsk->hnet = hnet;
> + hsk->buffer_pool = buffer_pool;
> +
> + /* Pick a default port. Must keep the socktab locked from now
> + * until the new socket is added to the socktab, to ensure that
> + * no other socket chooses the same port.
> + */
> + spin_lock_bh(&socktab->write_lock);
> + starting_port = hnet->prev_default_port;
> + while (1) {
> + hnet->prev_default_port++;
> + if (hnet->prev_default_port < HOMA_MIN_DEFAULT_PORT)
> + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT;
> + other = homa_sock_find(hnet, hnet->prev_default_port);
> + if (!other)
> + break;
> + sock_put(&other->sock);
> + if (hnet->prev_default_port == starting_port) {
> + spin_unlock_bh(&socktab->write_lock);
> + hsk->shutdown = true;
> + hsk->homa = NULL;
> + result = -EADDRNOTAVAIL;
> + goto error;
> + }
You likely need to add a cond_resched() here (releasing and re-acquiring
the lock as needed).
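Untested sketch of what I mean (the stride of 256 is arbitrary; the
port-in-use check is redone on every iteration anyway, so periodically
dropping the lock should be safe):

	if (!(hnet->prev_default_port & 0xff)) {
		spin_unlock_bh(&socktab->write_lock);
		cond_resched();
		spin_lock_bh(&socktab->write_lock);
	}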
> + }
> + hsk->port = hnet->prev_default_port;
> + hsk->inet.inet_num = hsk->port;
> + hsk->inet.inet_sport = htons(hsk->port);
> +
> + hsk->is_server = false;
> + hsk->shutdown = false;
> + hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) ?
> + sizeof(struct iphdr) : sizeof(struct ipv6hdr);
> + spin_lock_init(&hsk->lock);
> + atomic_set(&hsk->protect_count, 0);
> + INIT_LIST_HEAD(&hsk->active_rpcs);
> + INIT_LIST_HEAD(&hsk->dead_rpcs);
> + hsk->dead_skbs = 0;
> + INIT_LIST_HEAD(&hsk->waiting_for_bufs);
> + INIT_LIST_HEAD(&hsk->ready_rpcs);
> + INIT_LIST_HEAD(&hsk->interests);
> + for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) {
> + struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i];
> +
> + spin_lock_init(&bucket->lock);
> + bucket->id = i;
> + INIT_HLIST_HEAD(&bucket->rpcs);
> + }
> + for (i = 0; i < HOMA_SERVER_RPC_BUCKETS; i++) {
> + struct homa_rpc_bucket *bucket = &hsk->server_rpc_buckets[i];
> +
> + spin_lock_init(&bucket->lock);
> + bucket->id = i + 1000000;
> + INIT_HLIST_HEAD(&bucket->rpcs);
> + }
Do all the above initialization steps need to be done under the socktab
lock?
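At a first look only the port selection and the insertion into the hash
table need it; a possible (untested) shape:

	/* list heads, spin_lock_init(), bucket setup, etc. done
	 * lock-free on the not-yet-published socket, then:
	 */
	spin_lock_bh(&socktab->write_lock);
	/* ...pick the default port as above... */
	hlist_add_head_rcu(&hsk->socktab_links,
			   &socktab->buckets[homa_socktab_bucket(hnet,
								 hsk->port)]);
	spin_unlock_bh(&socktab->write_lock);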
> +/**
> + * homa_sock_bind() - Associates a server port with a socket; if there
> + * was a previous server port assignment for @hsk, it is abandoned.
> + * @hnet: Network namespace with which port is associated.
> + * @hsk: Homa socket.
> + * @port: Desired server port for @hsk. If 0, then this call
> + * becomes a no-op: the socket will continue to use
> + * its randomly assigned client port.
> + *
> + * Return: 0 for success, otherwise a negative errno.
> + */
> +int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
> + u16 port)
> +{
> + struct homa_socktab *socktab = hnet->homa->socktab;
> + struct homa_sock *owner;
> + int result = 0;
> +
> + if (port == 0)
> + return result;
> + if (port >= HOMA_MIN_DEFAULT_PORT)
> + return -EINVAL;
> + homa_sock_lock(hsk);
> + spin_lock_bh(&socktab->write_lock);
> + if (hsk->shutdown) {
> + result = -ESHUTDOWN;
> + goto done;
> + }
> +
> + owner = homa_sock_find(hnet, port);
> + if (owner) {
> + sock_put(&owner->sock);
homa_sock_find() is used in multiple places to check for port usage. I
think it would be useful to add a variant of such helper that does not
increment the socket refcount.
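I don't know the exact body of homa_sock_find(), but something along
these lines (hypothetical name, untested):

	/* Caller must hold socktab->write_lock; no reference is taken
	 * on the returned socket.
	 */
	static struct homa_sock *homa_sock_find_nohold(struct homa_net *hnet,
						       u16 port)
	{
		struct homa_socktab *socktab = hnet->homa->socktab;
		struct hlist_head *bucket;
		struct homa_sock *hsk;

		lockdep_assert_held(&socktab->write_lock);
		bucket = &socktab->buckets[homa_socktab_bucket(hnet, port)];
		hlist_for_each_entry(hsk, bucket, socktab_links) {
			if (hsk->port == port && hsk->hnet == hnet)
				return hsk;
		}
		return NULL;
	}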
> + if (owner != hsk)
> + result = -EADDRINUSE;
> + goto done;
> + }
> + hlist_del_rcu(&hsk->socktab_links);
> + hsk->port = port;
> + hsk->inet.inet_num = port;
> + hsk->inet.inet_sport = htons(hsk->port);
> + hlist_add_head_rcu(&hsk->socktab_links,
> + &socktab->buckets[homa_socktab_bucket(hnet, port)]);
> + hsk->is_server = true;
> +done:
> + spin_unlock_bh(&socktab->write_lock);
> + homa_sock_unlock(hsk);
> + return result;
> +}
> +/**
> + * homa_sock_wait_wmem() - Block the thread until @hsk's usage of tx
> + * packet memory drops below the socket's limit.
> + * @hsk: Socket of interest.
> + * @nonblocking: If there's not enough memory, return -EWOULDBLOCK instead
> + * of blocking.
> + * Return: 0 for success, otherwise a negative errno.
> + */
> +int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking)
> +{
> + long timeo = hsk->sock.sk_sndtimeo;
> + int result;
> +
> + if (nonblocking)
> + timeo = 0;
> + set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
> + result = wait_event_interruptible_timeout(*sk_sleep(&hsk->sock),
> + homa_sock_wmem_avl(hsk) || hsk->shutdown,
> + timeo);
> + if (signal_pending(current))
> + return -EINTR;
> + if (result == 0)
> + return -EWOULDBLOCK;
> + return 0;
> +}
Perhaps you could use sock_wait_for_wmem()?
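(Note it's currently static in net/core/sock.c, so it would need to be
exported, and it waits on sk_wmem_alloc vs sk_sndbuf rather than on
homa_sock_wmem_avl(); modulo that, an untested sketch:)

	long timeo = nonblocking ? 0 : hsk->sock.sk_sndtimeo;

	timeo = sock_wait_for_wmem(&hsk->sock, timeo);
	if (signal_pending(current))
		return -EINTR;
	if (!timeo)
		return -EWOULDBLOCK;
	return 0;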
> diff --git a/net/homa/homa_sock.h b/net/homa/homa_sock.h
> new file mode 100644
> index 000000000000..1f649c1da628
> --- /dev/null
> +++ b/net/homa/homa_sock.h
> @@ -0,0 +1,408 @@
> +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+ */
> +
> +/* This file defines structs and other things related to Homa sockets. */
> +
> +#ifndef _HOMA_SOCK_H
> +#define _HOMA_SOCK_H
> +
> +/* Forward declarations. */
> +struct homa;
> +struct homa_pool;
> +
> +/* Number of hash buckets in a homa_socktab. Must be a power of 2. */
> +#define HOMA_SOCKTAB_BUCKET_BITS 10
> +#define HOMA_SOCKTAB_BUCKETS BIT(HOMA_SOCKTAB_BUCKET_BITS)
> +
> +/**
> + * struct homa_socktab - A hash table that maps from port numbers (either
> + * client or server) to homa_sock objects.
> + *
> + * This table is managed exclusively by homa_socktab.c, using RCU to
> + * minimize synchronization during lookups.
> + */
> +struct homa_socktab {
> + /**
> + * @write_lock: Controls all modifications to this object; not needed
> + * for socket lookups (RCU is used instead). Also used to
> + * synchronize port allocation.
> + */
> + spinlock_t write_lock;
> +
> + /**
> + * @buckets: Heads of chains for hash table buckets. Chains
> + * consist of homa_sock objects.
> + */
> + struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS];
> +};
> +
> +/**
> + * struct homa_socktab_scan - Records the state of an iteration over all
> + * the entries in a homa_socktab, in a way that is safe against concurrent
> + * reclamation of sockets.
> + */
> +struct homa_socktab_scan {
> + /** @socktab: The table that is being scanned. */
> + struct homa_socktab *socktab;
> +
> + /**
> + * @hsk: Points to the current socket in the iteration, or NULL if
> + * we're at the beginning or end of the iteration. If non-NULL then
> + * we are holding a reference to this socket.
> + */
> + struct homa_sock *hsk;
> +
> + /**
> + * @current_bucket: The index of the bucket in socktab->buckets
> + * currently being scanned (-1 if @hsk == NULL).
> + */
> + int current_bucket;
> +};
> +
> +/**
> + * struct homa_rpc_bucket - One bucket in a hash table of RPCs.
> + */
> +struct homa_rpc_bucket {
> + /**
> + * @lock: serves as a lock both for this bucket (e.g., when
> + * adding and removing RPCs) and also for all of the RPCs in
> + * the bucket. Must be held whenever looking up an RPC in
> + * this bucket or manipulating an RPC in the bucket. This approach
> + * has the following properties:
> + * 1. An RPC can be looked up and locked (a common operation) with
> + * a single lock acquisition.
> + * 2. Looking up and locking are atomic: there is no window of
> + * vulnerability where someone else could delete an RPC after
> + * it has been looked up and before it has been locked.
> + * 3. The lookup mechanism does not use RCU. This is important because
> + * RPCs are created rapidly and typically live only a few tens of
> + * microseconds. As of May 2027 RCU introduces a lag of about
I'm unable to make predictions about next week; I have no idea what
will happen in two years...
> + * 25 ms before objects can be deleted; for RPCs this would result
> + * in hundreds or thousands of RPCs accumulating before RCU allows
> + * them to be deleted.
> + * This approach has the disadvantage that RPCs within a bucket share
> + * locks and thus may not be able to work concurrently, but there are
> + * enough buckets in the table to make such collisions rare.
> + *
> + * See "Homa Locking Strategy" in homa_impl.h for more info about
> + * locking.
> + */
> + spinlock_t lock;
> +
> + /**
> + * @id: identifier for this bucket, used in error messages etc.
> + * It's the index of the bucket within its hash table bucket
> + * array, with an additional offset to separate server and
> + * client RPCs.
> + */
> + int id;
> +
> + /** @rpcs: list of RPCs that hash to this bucket. */
> + struct hlist_head rpcs;
> +};
> +
> +/**
> + * define HOMA_CLIENT_RPC_BUCKETS - Number of buckets in hash tables for
> + * client RPCs. Must be a power of 2.
> + */
> +#define HOMA_CLIENT_RPC_BUCKETS 1024
> +
> +/**
> + * define HOMA_SERVER_RPC_BUCKETS - Number of buckets in hash tables for
> + * server RPCs. Must be a power of 2.
> + */
> +#define HOMA_SERVER_RPC_BUCKETS 1024
> +
> +/**
> + * struct homa_sock - Information about an open socket.
> + */
> +struct homa_sock {
> + /* Info for other network layers. Note: IPv6 info (struct ipv6_pinfo)
> + * comes at the very end of the struct, *after* Homa's data, if this
> + * socket uses IPv6.
> + */
> + union {
> + /** @sock: generic socket data; must be the first field. */
> + struct sock sock;
> +
> + /**
> + * @inet: generic Internet socket data; must also be the
> + * first field (contains sock as its first member).
> + */
> + struct inet_sock inet;
> + };
> +
> + /**
> + * @homa: Overall state about the Homa implementation. NULL
> + * means this socket was never initialized or has been deleted.
> + */
> + struct homa *homa;
> +
> + /**
> + * @hnet: Overall state specific to the network namespace for
> + * this socket.
> + */
> + struct homa_net *hnet;
Both the above should likely be removed: they can be re-derived when
needed via net_generic(sock_net(&hsk->sock), homa_net_id), as
homa_sock_init() itself does.
> +
> + /**
> + * @buffer_pool: used to allocate buffer space for incoming messages.
> + * Storage is dynamically allocated.
> + */
> + struct homa_pool *buffer_pool;
> +
> + /**
> + * @port: Port number: identifies this socket uniquely among all
> + * those on this node.
> + */
> + u16 port;
> +
> + /**
> + * @is_server: True means that this socket can act as both client
> + * and server; false means the socket is client-only.
> + */
> + bool is_server;
> +
> + /**
> + * @shutdown: True means the socket is no longer usable (either
> + * shutdown has already been invoked, or the socket was never
> + * properly initialized).
> + */
> + bool shutdown;
> +
> + /**
> + * @ip_header_length: Length of IP headers for this socket (depends
> + * on IPv4 vs. IPv6).
> + */
> + int ip_header_length;
> +
> + /** @socktab_links: Links this socket into a homa_socktab bucket. */
> + struct hlist_node socktab_links;
> +
> + /* Information above is (almost) never modified; start a new
> + * cache line below for info that is modified frequently.
> + */
> +
> + /**
> + * @lock: Must be held when modifying fields such as interests
> + * and lists of RPCs. This lock is used in place of sk->sk_lock
> + * because it's used differently (it's always used as a simple
> + * spin lock). See "Homa Locking Strategy" in homa_impl.h
> + * for more on Homa's synchronization strategy.
> + */
> + spinlock_t lock ____cacheline_aligned_in_smp;
> +
> + /**
> + * @protect_count: counts the number of calls to homa_protect_rpcs
> + * for which there have not yet been calls to homa_unprotect_rpcs.
> + */
> + atomic_t protect_count;
> +
> + /**
> + * @active_rpcs: List of all existing RPCs related to this socket,
> + * including both client and server RPCs. This list isn't strictly
> + * needed, since RPCs are already in one of the hash tables below,
> + * but it's more efficient for homa_timer to have this list
> + * (so it doesn't have to scan large numbers of hash buckets).
> + * The list is sorted, with the oldest RPC first. Manipulate with
> + * RCU so timer can access without locking.
> + */
> + struct list_head active_rpcs;
> +
> + /**
> + * @dead_rpcs: Contains RPCs for which homa_rpc_end has been
> + * called, but their packet buffers haven't yet been freed.
> + */
> + struct list_head dead_rpcs;
> +
> + /** @dead_skbs: Total number of socket buffers in RPCs on dead_rpcs. */
> + int dead_skbs;
> +
> + /**
> + * @waiting_for_bufs: Contains RPCs that are blocked because there
> + * wasn't enough space in the buffer pool region for their incoming
> + * messages. Sorted in increasing order of message length.
> + */
> + struct list_head waiting_for_bufs;
> +
> + /**
> + * @ready_rpcs: List of all RPCs that are ready for attention from
> + * an application thread.
> + */
> + struct list_head ready_rpcs;
> +
> + /**
> + * @interests: List of threads that are currently waiting for
> + * incoming messages via homa_wait_shared.
> + */
> + struct list_head interests;
> +
> + /**
> + * @client_rpc_buckets: Hash table for fast lookup of client RPCs.
> + * Modifications are synchronized with bucket locks, not
> + * the socket lock.
> + */
> + struct homa_rpc_bucket client_rpc_buckets[HOMA_CLIENT_RPC_BUCKETS];
> +
> + /**
> + * @server_rpc_buckets: Hash table for fast lookup of server RPCs.
> + * Modifications are synchronized with bucket locks, not
> + * the socket lock.
> + */
> + struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS];
The above 2 arrays are quite large and should probably be allocated
separately.
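E.g. turning them into pointers and allocating in homa_sock_init()
together with the buffer pool (untested sketch):

	hsk->client_rpc_buckets =
		kcalloc(HOMA_CLIENT_RPC_BUCKETS,
			sizeof(*hsk->client_rpc_buckets), GFP_KERNEL);
	hsk->server_rpc_buckets =
		kcalloc(HOMA_SERVER_RPC_BUCKETS,
			sizeof(*hsk->server_rpc_buckets), GFP_KERNEL);
	if (!hsk->client_rpc_buckets || !hsk->server_rpc_buckets)
		goto error;	/* free whatever was allocated */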
> +/**
> + * homa_client_rpc_bucket() - Find the bucket containing a given
> + * client RPC.
> + * @hsk: Socket associated with the RPC.
> + * @id: Id of the desired RPC.
> + *
> + * Return: The bucket in which this RPC will appear, if the RPC exists.
> + */
> +static inline struct homa_rpc_bucket
> + *homa_client_rpc_bucket(struct homa_sock *hsk, u64 id)
> +{
> + /* We can use a really simple hash function here because RPC ids
> + * are allocated sequentially.
> + */
> + return &hsk->client_rpc_buckets[(id >> 1)
> + & (HOMA_CLIENT_RPC_BUCKETS - 1)];
Minor nit: '&' should be on the previous line, and please fix the alignment.
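i.e.:

	return &hsk->client_rpc_buckets[(id >> 1) &
					(HOMA_CLIENT_RPC_BUCKETS - 1)];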
> +/**
> + * homa_sock_wakeup_wmem() - Invoked when tx packet memory has been freed;
> + * if memory usage is below the limit and there are tasks waiting for memory,
> + * wake them up.
> + * @hsk: Socket of interest.
> + */
> +static inline void homa_sock_wakeup_wmem(struct homa_sock *hsk)
> +{
> + if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags) &&
> + homa_sock_wmem_avl(hsk)) {
> + clear_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
> + wake_up_interruptible_poll(sk_sleep(&hsk->sock), EPOLLOUT);
Can hsk be orphaned at this point? I think so.
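The usual way to cope with that is the sock_def_write_space() pattern,
roughly (untested):

	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(hsk->sock.sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLOUT);
	rcu_read_unlock();

(note the SOCK_NOSPACE test above also dereferences sk_socket, which
becomes NULL once the socket is orphaned)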
/P