netdev - Re: [PATCH net-next v2 11/12] net: homa: create homa_plumbing.c homa

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <07647363-622b-4023-ba71-da213754a7ae@linux.alibaba.com>
Date: Tue, 26 Nov 2024 13:32:15 +0800
From: "D. Wythe" <alibuda@...ux.alibaba.com>
To: John Ousterhout <ouster@...stanford.edu>, netdev@...r.kernel.org,
 linux-api@...r.kernel.org
Subject: Re: [PATCH net-next v2 11/12] net: homa: create homa_plumbing.c
 homa_utils.c



On 11/12/24 7:40 AM, John Ousterhout wrote:
> homa_plumbing.c contains functions that connect Homa to the rest of
> the Linux kernel, such as dispatch tables used by Linux and the
> top-level functions that Linux invokes from those dispatch tables.
> 
> homa_utils.c contains a few odds and ends, such as code to initialize
> and destroy struct homa's.
> 
> Signed-off-by: John Ousterhout <ouster@...stanford.edu>
> ---
>   net/homa/homa_plumbing.c | 965 +++++++++++++++++++++++++++++++++++++++
>   net/homa/homa_utils.c    | 150 ++++++
>   2 files changed, 1115 insertions(+)
>   create mode 100644 net/homa/homa_plumbing.c
>   create mode 100644 net/homa/homa_utils.c
> 
> diff --git a/net/homa/homa_plumbing.c b/net/homa/homa_plumbing.c
> new file mode 100644
> index 000000000000..afd3a9cc97ba
> --- /dev/null
> +++ b/net/homa/homa_plumbing.c
> @@ -0,0 +1,965 @@
> +// SPDX-License-Identifier: BSD-2-Clause
> +
> +/* This file consists mostly of "glue" that hooks Homa into the rest of
> + * the Linux kernel. The guts of the protocol are in other files.
> + */
> +
> +#include "homa_impl.h"
> +#include "homa_peer.h"
> +#include "homa_pool.h"
> +
> +MODULE_LICENSE("Dual MIT/GPL");
> +MODULE_AUTHOR("John Ousterhout");
> +MODULE_DESCRIPTION("Homa transport protocol");
> +MODULE_VERSION("0.01");
> +
> +/* Not yet sure what these variables are for */
> +static long sysctl_homa_mem[3] __read_mostly;
> +static int sysctl_homa_rmem_min __read_mostly;
> +static int sysctl_homa_wmem_min __read_mostly;
> +
> +/* Global data for Homa. Never reference homa_data directory. Always use
> + * the homa variable instead; this allows overriding during unit tests.
> + */
> +static struct homa homa_data;
> +struct homa *homa = &homa_data;
> +
> +/* True means that the Homa module is in the process of unloading itself,
> + * so everyone should clean up.
> + */
> +static bool exiting;
> +
> +/* Thread that runs timer code to detect lost packets and crashed peers. */
> +static struct task_struct *timer_kthread;
> +
> +/* This structure defines functions that handle various operations on
> + * Homa sockets. These functions are relatively generic: they are called
> + * to implement top-level system calls. Many of these operations can
> + * be implemented by PF_INET6 functions that are independent of the
> + * Homa protocol.
> + */
> +static const struct proto_ops homa_proto_ops = {
> +	.family		   = PF_INET,
> +	.owner		   = THIS_MODULE,
> +	.release	   = inet_release,
> +	.bind		   = homa_bind,
> +	.connect	   = inet_dgram_connect,
> +	.socketpair	   = sock_no_socketpair,
> +	.accept		   = sock_no_accept,
> +	.getname	   = inet_getname,
> +	.poll		   = homa_poll,
> +	.ioctl		   = inet_ioctl,
> +	.listen		   = sock_no_listen,
> +	.shutdown	   = homa_shutdown,
> +	.setsockopt	   = sock_common_setsockopt,
> +	.getsockopt	   = sock_common_getsockopt,
> +	.sendmsg	   = inet_sendmsg,
> +	.recvmsg	   = inet_recvmsg,
> +	.mmap		   = sock_no_mmap,
> +	.set_peek_off	   = sk_set_peek_off,
> +};
> +
> +static const struct proto_ops homav6_proto_ops = {
> +	.family		   = PF_INET6,
> +	.owner		   = THIS_MODULE,
> +	.release	   = inet6_release,
> +	.bind		   = homa_bind,
> +	.connect	   = inet_dgram_connect,
> +	.socketpair	   = sock_no_socketpair,
> +	.accept		   = sock_no_accept,
> +	.getname	   = inet6_getname,
> +	.poll		   = homa_poll,
> +	.ioctl		   = inet6_ioctl,
> +	.listen		   = sock_no_listen,
> +	.shutdown	   = homa_shutdown,
> +	.setsockopt	   = sock_common_setsockopt,
> +	.getsockopt	   = sock_common_getsockopt,
> +	.sendmsg	   = inet_sendmsg,
> +	.recvmsg	   = inet_recvmsg,
> +	.mmap		   = sock_no_mmap,
> +	.set_peek_off	   = sk_set_peek_off,
> +};
> +
> +/* This structure also defines functions that handle various operations
> + * on Homa sockets. However, these functions are lower-level than those
> + * in homa_proto_ops: they are specific to the PF_INET or PF_INET6
> + * protocol family, and in many cases they are invoked by functions in
> + * homa_proto_ops. Most of these functions have Homa-specific implementations.
> + */
> +static struct proto homa_prot = {
> +	.name		   = "HOMA",
> +	.owner		   = THIS_MODULE,
> +	.close		   = homa_close,
> +	.connect	   = ip4_datagram_connect,
> +	.disconnect	   = homa_disconnect,
> +	.ioctl		   = homa_ioctl,
> +	.init		   = homa_socket,
> +	.destroy	   = NULL,
> +	.setsockopt	   = homa_setsockopt,
> +	.getsockopt	   = homa_getsockopt,
> +	.sendmsg	   = homa_sendmsg,
> +	.recvmsg	   = homa_recvmsg,
> +	.backlog_rcv       = homa_backlog_rcv,
> +	.hash		   = homa_hash,
> +	.unhash		   = homa_unhash,
> +	.get_port	   = homa_get_port,
> +	.sysctl_mem	   = sysctl_homa_mem,
> +	.sysctl_wmem	   = &sysctl_homa_wmem_min,
> +	.sysctl_rmem	   = &sysctl_homa_rmem_min,
> +	.obj_size	   = sizeof(struct homa_sock),
> +	.no_autobind       = 1,
> +};
> +
> +static struct proto homav6_prot = {
> +	.name		   = "HOMAv6",
> +	.owner		   = THIS_MODULE,
> +	.close		   = homa_close,
> +	.connect	   = ip6_datagram_connect,
> +	.disconnect	   = homa_disconnect,
> +	.ioctl		   = homa_ioctl,
> +	.init		   = homa_socket,
> +	.destroy	   = NULL,
> +	.setsockopt	   = homa_setsockopt,
> +	.getsockopt	   = homa_getsockopt,
> +	.sendmsg	   = homa_sendmsg,
> +	.recvmsg	   = homa_recvmsg,
> +	.backlog_rcv       = homa_backlog_rcv,
> +	.hash		   = homa_hash,
> +	.unhash		   = homa_unhash,
> +	.get_port	   = homa_get_port,
> +	.sysctl_mem	   = sysctl_homa_mem,
> +	.sysctl_wmem	   = &sysctl_homa_wmem_min,
> +	.sysctl_rmem	   = &sysctl_homa_rmem_min,
> +
> +	/* IPv6 data comes *after* Homa's data, and isn't included in
> +	 * struct homa_sock.
> +	 */
> +	.obj_size	   = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo),


The implementation of inet6_sk_generic() has changed; you should now set
.ipv6_pinfo_offset.


> +	.no_autobind       = 1,
> +};
> +
> +/* Top-level structure describing the Homa protocol. */
> +static struct inet_protosw homa_protosw = {
> +	.type              = SOCK_DGRAM,
> +	.protocol          = IPPROTO_HOMA,
> +	.prot              = &homa_prot,
> +	.ops               = &homa_proto_ops,
> +	.flags             = INET_PROTOSW_REUSE,
> +};
> +
> +static struct inet_protosw homav6_protosw = {
> +	.type              = SOCK_DGRAM,
> +	.protocol          = IPPROTO_HOMA,
> +	.prot              = &homav6_prot,
> +	.ops               = &homav6_proto_ops,
> +	.flags             = INET_PROTOSW_REUSE,
> +};
> +
> +/* This structure is used by IP to deliver incoming Homa packets to us. */
> +static struct net_protocol homa_protocol = {
> +	.handler =	homa_softirq,
> +	.err_handler =	homa_err_handler_v4,
> +	.no_policy =     1,
> +};
> +
> +static struct inet6_protocol homav6_protocol = {
> +	.handler =	homa_softirq,
> +	.err_handler =	homa_err_handler_v6,
> +	.flags =        INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
> +};
> +
> +/* Sizes of the headers for each Homa packet type, in bytes. */
> +static __u16 header_lengths[] = {
> +	sizeof32(struct data_header),
> +	0,
> +	sizeof32(struct resend_header),
> +	sizeof32(struct unknown_header),
> +	sizeof32(struct busy_header),
> +	0,
> +	sizeof32(struct common_header),
> +	sizeof32(struct need_ack_header),
> +	sizeof32(struct ack_header)
> +};
> +
> +static DECLARE_COMPLETION(timer_thread_done);
> +
> +/**
> + * homa_load() - invoked when this module is loaded into the Linux kernel
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +static int __init homa_load(void)
> +{
> +	int status;
> +
> +	pr_notice("Homa module loading\n");
> +	pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n",
> +		  sizeof32(struct data_header),
> +		  sizeof32(struct seg_header),
> +		  sizeof32(struct homa_ack),
> +		  sizeof32(struct homa_peer),
> +		  sizeof32(struct iphdr),
> +		  sizeof32(struct flowi),
> +		  sizeof32(struct ipv6hdr),
> +		  sizeof32(struct flowi6),
> +		  sizeof32(struct tcp_sock),
> +		  sizeof32(struct homa_rpc),
> +		  sizeof32(struct sk_buff),
> +		  sizeof32(struct homa_recvmsg_args),
> +		  sizeof32(union sockaddr_in_union),
> +		  HOMA_MAX_BPAGES,
> +		  NR_CPUS,
> +		  nr_cpu_ids,
> +		  MAX_NUMNODES);
> +	status = proto_register(&homa_prot, 1);
> +	if (status != 0) {
> +		pr_err("proto_register failed for homa_prot: %d\n", status);
> +		goto out;
> +	}
> +	status = proto_register(&homav6_prot, 1);
> +	if (status != 0) {
> +		pr_err("proto_register failed for homav6_prot: %d\n", status);
> +		goto out;
> +	}
> +	inet_register_protosw(&homa_protosw);
> +	inet6_register_protosw(&homav6_protosw);


It would be better to check the return value of inet6_register_protosw().


> +	status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA);
> +	if (status != 0) {
> +		pr_err("inet_add_protocol failed in %s: %d\n", __func__,
> +		       status);
> +		goto out_cleanup;
> +	}
> +	status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA);
> +	if (status != 0) {
> +		pr_err("inet6_add_protocol failed in %s: %d\n",  __func__,
> +		       status);
> +		goto out_cleanup;
> +	}
> +
> +	status = homa_init(homa);
> +	if (status)
> +		goto out_cleanup;
> +
> +	timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer");
> +	if (IS_ERR(timer_kthread)) {
> +		status = PTR_ERR(timer_kthread);
> +		pr_err("couldn't create homa pacer thread: error %d\n",
> +		       status);
> +		timer_kthread = NULL;
> +		goto out_cleanup;
> +	}
> +
> +	return 0;
> +
> +out_cleanup:
> +	homa_destroy(homa);
> +	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
> +	inet_unregister_protosw(&homa_protosw);
> +	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
> +	inet6_unregister_protosw(&homav6_protosw);
> +	proto_unregister(&homa_prot);
> +	proto_unregister(&homav6_prot);


It seems a bit strange to me that this cleanup relies on the premise that every reverse operation
can correctly detect whether the corresponding forward operation was executed. Perhaps every
function currently has this capability. It's up to you; I don't insist.



> +out:
> +	return status;
> +}
> +
> +/**
> + * homa_unload() - invoked when this module is unloaded from the Linux kernel.
> + */
> +static void __exit homa_unload(void)
> +{
> +	pr_notice("Homa module unloading\n");
> +	exiting = true;
> +
> +	if (timer_kthread)
> +		wake_up_process(timer_kthread);
> +	wait_for_completion(&timer_thread_done);
> +	homa_destroy(homa);
> +	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
> +	inet_unregister_protosw(&homa_protosw);
> +	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
> +	inet6_unregister_protosw(&homav6_protosw);
> +	proto_unregister(&homa_prot);
> +	proto_unregister(&homav6_prot);
> +}
> +
> +module_init(homa_load);
> +module_exit(homa_unload);

Perhaps you could add MODULE_ALIAS_NET_PF_PROTO_TYPE so that the kernel automatically loads
the module when an IPPROTO_HOMA socket is created. This is just a functional suggestion; it's up to you.

> +
> +/**
> + * homa_bind() - Implements the bind system call for Homa sockets: associates
> + * a well-known service port with a socket. Unlike other AF_INET6 protocols,
> + * there is no need to invoke this system call for sockets that are only
> + * used as clients.
> + * @sock:     Socket on which the system call was invoked.
> + * @addr:    Contains the desired port number.
> + * @addr_len: Number of bytes in uaddr.
> + * Return:    0 on success, otherwise a negative errno.
> + */
> +int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> +{
> +	struct homa_sock *hsk = homa_sk(sock->sk);
> +	union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr;
> +	int port = 0;
> +
> +	if (unlikely(addr->sa_family != sock->sk->sk_family))
> +		return -EAFNOSUPPORT;
> +	if (addr_in->in6.sin6_family == AF_INET6) {
> +		if (addr_len < sizeof(struct sockaddr_in6))
> +			return -EINVAL;
> +		port = ntohs(addr_in->in4.sin_port);
> +	} else if (addr_in->in4.sin_family == AF_INET) {
> +		if (addr_len < sizeof(struct sockaddr_in))
> +			return -EINVAL;
> +		port = ntohs(addr_in->in6.sin6_port);
> +	}
> +	return homa_sock_bind(homa->port_map, hsk, port);
> +}

Is binding multiple times legal? For example, binding to port 80 first and then to port 8080.
If not, I think you need to check inet_num.


> +
> +/**
> + * homa_close() - Invoked when close system call is invoked on a Homa socket.
> + * @sk:      Socket being closed
> + * @timeout: ??
> + */
> +void homa_close(struct sock *sk, long timeout)
> +{
> +	struct homa_sock *hsk = homa_sk(sk);
> +
> +	homa_sock_destroy(hsk);
> +	sk_common_release(sk);
> +}
> +
> +/**
> + * homa_shutdown() - Implements the shutdown system call for Homa sockets.
> + * @sock:    Socket to shut down.
> + * @how:     Ignored: for other sockets, can independently shut down
> + *           sending and receiving, but for Homa any shutdown will
> + *           shut down everything.
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_shutdown(struct socket *sock, int how)
> +{
> +	homa_sock_shutdown(homa_sk(sock->sk));
> +	return 0;
> +}
> +
> +/**
> + * homa_disconnect() - Invoked when disconnect system call is invoked on a
> + * Homa socket.
> + * @sk:    Socket to disconnect
> + * @flags: ??
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_disconnect(struct sock *sk, int flags)
> +{
> +	pr_warn("unimplemented disconnect invoked on Homa socket\n");
> +	return -EINVAL;
> +}
> +
> +/**
> + * homa_ioctl() - Implements the ioctl system call for Homa sockets.
> + * @sk:    Socket on which the system call was invoked.
> + * @cmd:   Identifier for a particular ioctl operation.
> + * @karg:  Operation-specific argument; typically the address of a block
> + *         of data in user address space.
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_ioctl(struct sock *sk, int cmd, int *karg)
> +{
> +	return -EINVAL;
> +}
> +
> +/**
> + * homa_socket() - Implements the socket(2) system call for sockets.
> + * @sk:    Socket on which the system call was invoked. The non-Homa
> + *         parts have already been initialized.
> + *
> + * Return: always 0 (success).
> + */
> +int homa_socket(struct sock *sk)
> +{
> +	struct homa_sock *hsk = homa_sk(sk);
> +
> +	homa_sock_init(hsk, homa)
I noticed that homa_sock_init() performs a memory allocation; perhaps you should check its
return value.


-> hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL);

> +	return 0;
> +}
> +
> +/**
> + * homa_setsockopt() - Implements the getsockopt system call for Homa sockets.
> + * @sk:      Socket on which the system call was invoked.
> + * @level:   Level at which the operation should be handled; will always
> + *           be IPPROTO_HOMA.
> + * @optname: Identifies a particular setsockopt operation.
> + * @optval:  Address in user space of information about the option.
> + * @optlen:  Number of bytes of data at @optval.
> + * Return:   0 on success, otherwise a negative errno.
> + */
> +int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
> +		    unsigned int optlen)
> +{
> +	struct homa_sock *hsk = homa_sk(sk);
> +	struct homa_set_buf_args args;
> +	int ret;
> +
> +	if (level != IPPROTO_HOMA || optname != SO_HOMA_SET_BUF ||
> +	    optlen != sizeof(struct homa_set_buf_args))
> +		return -EINVAL;
> +
> +	if (copy_from_sockptr(&args, optval, optlen))
> +		return -EFAULT;
> +
> +	/* Do a trivial test to make sure we can at least write the first
> +	 * page of the region.
> +	 */
> +	if (copy_to_user((__force void __user *)args.start, &args, sizeof(args)))
> +		return -EFAULT;
> +
> +	homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF");
> +	ret = homa_pool_init(hsk, (__force void __user *)args.start, args.length);
> +	homa_sock_unlock(hsk);
> +	return ret;
> +}
> +
> +/**
> + * homa_getsockopt() - Implements the getsockopt system call for Homa sockets.
> + * @sk:      Socket on which the system call was invoked.
> + * @level:   ??
> + * @optname: Identifies a particular setsockopt operation.
> + * @optval:  Address in user space where the option's value should be stored.
> + * @option:  ??.
> + * Return:   0 on success, otherwise a negative errno.
> + */
> +int homa_getsockopt(struct sock *sk, int level, int optname,
> +		    char __user *optval, int __user *option)
> +{
> +	pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n",
> +		level, optname);
> +	return -EINVAL;
> +}
> +
> +/**
> + * homa_sendmsg() - Send a request or response message on a Homa socket.
> + * @sk:     Socket on which the system call was invoked.
> + * @msg:    Structure describing the message to send; the msg_control
> + *          field points to additional information.
> + * @length: Number of bytes of the message.
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
> +{
> +	struct homa_sock *hsk = homa_sk(sk);
> +	struct homa_sendmsg_args args;
> +	int result = 0;
> +	struct homa_rpc *rpc = NULL;
> +	union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name;
> +
> +	if (unlikely(!msg->msg_control_is_user)) {
> +		result = -EINVAL;
> +		goto error;
> +	}
> +	if (unlikely(copy_from_user(&args,
> +				    (__force void __user *)msg->msg_control,
> +				    sizeof(args)))) {
> +		result = -EFAULT;
> +		goto error;
> +	}
> +	if (addr->in6.sin6_family != sk->sk_family) {
> +		result = -EAFNOSUPPORT;
> +		goto error;
> +	}
> +	if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
> +	    (msg->msg_namelen < sizeof(struct sockaddr_in6) &&
> +	     addr->in6.sin6_family == AF_INET6)) {
> +		result = -EINVAL;
> +		goto error;
> +	}
> +
> +	if (!args.id) {
> +		/* This is a request message. */
> +		rpc = homa_rpc_new_client(hsk, addr);
> +		if (IS_ERR(rpc)) {
> +			result = PTR_ERR(rpc);
> +			rpc = NULL;
> +			goto error;
> +		}
> +		rpc->completion_cookie = args.completion_cookie;
> +		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
> +		if (result)
> +			goto error;
> +		args.id = rpc->id;
> +		homa_rpc_unlock(rpc);
> +		rpc = NULL;
> +
> +		if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
> +					  &args, sizeof(args)))) {
> +			rpc = homa_find_client_rpc(hsk, args.id);
> +			result = -EFAULT;
> +			goto error;
> +		}
> +	} else {
> +		/* This is a response message. */
> +		struct in6_addr canonical_dest;
> +
> +		if (args.completion_cookie != 0) {
> +			result = -EINVAL;
> +			goto error;
> +		}
> +		canonical_dest = canonical_ipv6_addr(addr);
> +
> +		rpc = homa_find_server_rpc(hsk, &canonical_dest,
> +					   ntohs(addr->in6.sin6_port), args.id);
> +		if (!rpc)
> +			/* Return without an error if the RPC doesn't exist;
> +			 * this could be totally valid (e.g. client is
> +			 * no longer interested in it).
> +			 */
> +			return 0;
> +		if (rpc->error) {
> +			result = rpc->error;
> +			goto error;
> +		}
> +		if (rpc->state != RPC_IN_SERVICE) {
> +			homa_rpc_unlock(rpc);
> +			rpc = NULL;
> +			result = -EINVAL;
> +			goto error;
> +		}
> +		rpc->state = RPC_OUTGOING;
> +
> +		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
> +		if (result && rpc->state != RPC_DEAD)
> +			goto error;
> +		homa_rpc_unlock(rpc);
> +	}
> +	return 0;
> +
> +error:
> +	if (rpc) {
> +		homa_rpc_free(rpc);
> +		homa_rpc_unlock(rpc);
> +	}
> +	return result;
> +}
> +
> +/**
> + * homa_recvmsg() - Receive a message from a Homa socket.
> + * @sk:          Socket on which the system call was invoked.
> + * @msg:         Controlling information for the receive.
> + * @len:         Total bytes of space available in msg->msg_iov; not used.
> + * @flags:       Flags from system call; only MSG_DONTWAIT is used.
> + * @addr_len:    Store the length of the sender address here
> + * Return:       The length of the message on success, otherwise a negative
> + *               errno.
> + */
> +int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
> +		 int *addr_len)
> +{
> +	struct homa_sock *hsk = homa_sk(sk);
> +	struct homa_recvmsg_args control;
> +	struct homa_rpc *rpc;
> +	int result;
> +
> +	if (unlikely(!msg->msg_control)) {
> +		/* This test isn't strictly necessary, but it provides a
> +		 * hook for testing kernel call times.
> +		 */
> +		return -EINVAL;
> +	}
> +	if (msg->msg_controllen != sizeof(control)) {
> +		result = -EINVAL;
> +		goto done;
> +	}
> +	if (unlikely(copy_from_user(&control,
> +				    (__force void __user *)msg->msg_control,
> +				    sizeof(control)))) {
> +		result = -EFAULT;
> +		goto done;
> +	}
> +	control.completion_cookie = 0;
> +
> +	if (control.num_bpages > HOMA_MAX_BPAGES ||
> +	    (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) {
> +		result = -EINVAL;
> +		goto done;
> +	}
> +	homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages,
> +				  control.bpage_offsets);
> +	control.num_bpages = 0;
> +
> +	rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT)
> +			? (control.flags | HOMA_RECVMSG_NONBLOCKING)
> +			: control.flags, control.id);
> +	if (IS_ERR(rpc)) {
> +		/* If we get here, it means there was an error that prevented
> +		 * us from finding an RPC to return. If there's an error in
> +		 * the RPC itself we won't get here.
> +		 */
> +		result = PTR_ERR(rpc);
> +		goto done;
> +	}
> +	result = rpc->error ? rpc->error : rpc->msgin.length;
> +
> +	/* Collect result information. */
> +	control.id = rpc->id;
> +	control.completion_cookie = rpc->completion_cookie;
> +	if (likely(rpc->msgin.length >= 0)) {
> +		control.num_bpages = rpc->msgin.num_bpages;
> +		memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets,
> +		       sizeof(control.bpage_offsets));
> +	}
> +	if (sk->sk_family == AF_INET6) {
> +		struct sockaddr_in6 *in6 = msg->msg_name;
> +
> +		in6->sin6_family = AF_INET6;
> +		in6->sin6_port = htons(rpc->dport);
> +		in6->sin6_addr = rpc->peer->addr;
> +		*addr_len = sizeof(*in6);
> +	} else {
> +		struct sockaddr_in *in4 = msg->msg_name;
> +
> +		in4->sin_family = AF_INET;
> +		in4->sin_port = htons(rpc->dport);
> +		in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
> +		*addr_len = sizeof(*in4);
> +	}
> +
> +	/* This indicates that the application now owns the buffers, so
> +	 * we won't free them in homa_rpc_free.
> +	 */
> +	rpc->msgin.num_bpages = 0;
> +
> +	/* Must release the RPC lock (and potentially free the RPC) before
> +	 * copying the results back to user space.
> +	 */
> +	if (homa_is_client(rpc->id)) {
> +		homa_peer_add_ack(rpc);
> +		homa_rpc_free(rpc);
> +	} else {
> +		if (result < 0)
> +			homa_rpc_free(rpc);
> +		else
> +			rpc->state = RPC_IN_SERVICE;
> +	}
> +	homa_rpc_unlock(rpc);
> +
> +done:
> +	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
> +		     &control, sizeof(control)))) {
> +		/* Note: in this case the message's buffers will be leaked. */
> +		pr_notice("%s couldn't copy back args\n", __func__);
> +		result = -EFAULT;
> +	}
> +
> +	return result;
> +}
> +
> +/**
> + * homa_hash() - Not needed for Homa.
> + * @sk:    Socket for the operation
> + * Return: ??
> + */
> +int homa_hash(struct sock *sk)
> +{
> +	return 0;
> +}
> +
> +/**
> + * homa_unhash() - Not needed for Homa.
> + * @sk:    Socket for the operation
> + */
> +void homa_unhash(struct sock *sk)
> +{
> +}
> +
> +/**
> + * homa_get_port() - It appears that this function is called to assign a
> + * default port for a socket.
> + * @sk:    Socket for the operation
> + * @snum:  Unclear what this is.
> + * Return: Zero for success, or a negative errno for an error.
> + */
> +int homa_get_port(struct sock *sk, unsigned short snum)
> +{
> +	/* Homa always assigns ports immediately when a socket is created,
> +	 * so there is nothing to do here.
> +	 */
> +	return 0;
> +}
> +
> +/**
> + * homa_softirq() - This function is invoked at SoftIRQ level to handle
> + * incoming packets.
> + * @skb:   The incoming packet.
> + * Return: Always 0
> + */
> +int homa_softirq(struct sk_buff *skb)
> +{
> +	struct sk_buff *packets, *other_pkts, *next;
> +	struct sk_buff **prev_link, **other_link;
> +	struct common_header *h;
> +	int first_packet = 1;
> +	int header_offset;
> +	int pull_length;
> +
> +	/* skb may actually contain many distinct packets, linked through
> +	 * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a
> +	 * pass through the list to process all of the short packets,
> +	 * leaving the longer packets in the list. Also, perform various
> +	 * prep/cleanup/error checking functions.
> +	 */
> +	skb->next = skb_shinfo(skb)->frag_list;
> +	skb_shinfo(skb)->frag_list = NULL;
> +	packets = skb;
> +	prev_link = &packets;
> +	for (skb = packets; skb; skb = next) {
> +		next = skb->next;
> +
> +		/* Make the header available at skb->data, even if the packet
> +		 * is fragmented. One complication: it's possible that the IP
> +		 * header hasn't yet been removed (this happens for GRO packets
> +		 * on the frag_list, since they aren't handled explicitly by IP.
> +		 */
> +		header_offset = skb_transport_header(skb) - skb->data;
> +		pull_length = HOMA_MAX_HEADER + header_offset;
> +		if (pull_length > skb->len)
> +			pull_length = skb->len;
> +		if (!pskb_may_pull(skb, pull_length))
> +			goto discard;
> +		if (header_offset)
> +			__skb_pull(skb, header_offset);
> +
> +		/* Reject packets that are too short or have bogus types. */
> +		h = (struct common_header *)skb->data;
> +		if (unlikely(skb->len < sizeof(struct common_header) ||
> +			     h->type < DATA || h->type >= BOGUS ||
> +			     skb->len < header_lengths[h->type - DATA]))
> +			goto discard;
> +
> +		if (first_packet)
> +			first_packet = 0;
> +
> +		/* Process the packet now if it is a control packet or
> +		 * if it contains an entire short message.
> +		 */
> +		if (h->type != DATA || ntohl(((struct data_header *)h)
> +				->message_length) < 1400) {
> +			*prev_link = skb->next;
> +			skb->next = NULL;
> +			homa_dispatch_pkts(skb, homa);
> +		} else {
> +			prev_link = &skb->next;
> +		}
> +		continue;
> +
> +discard:
> +		*prev_link = skb->next;
> +		kfree_skb(skb);
> +	}
> +
> +	/* Now process the longer packets. Each iteration of this loop
> +	 * collects all of the packets for a particular RPC and dispatches
> +	 * them.
> +	 */
> +	while (packets) {
> +		struct in6_addr saddr, saddr2;
> +		struct common_header *h2;
> +		struct sk_buff *skb2;
> +
> +		skb = packets;
> +		prev_link = &skb->next;
> +		saddr = skb_canonical_ipv6_saddr(skb);
> +		other_pkts = NULL;
> +		other_link = &other_pkts;
> +		h = (struct common_header *)skb->data;
> +		for (skb2 = skb->next; skb2; skb2 = next) {
> +			next = skb2->next;
> +			h2 = (struct common_header *)skb2->data;
> +			if (h2->sender_id == h->sender_id) {
> +				saddr2 = skb_canonical_ipv6_saddr(skb2);
> +				if (ipv6_addr_equal(&saddr, &saddr2)) {
> +					*prev_link = skb2;
> +					prev_link = &skb2->next;
> +					continue;
> +				}
> +			}
> +			*other_link = skb2;
> +			other_link = &skb2->next;
> +		}
> +		*prev_link = NULL;
> +		*other_link = NULL;
> +		homa_dispatch_pkts(packets, homa);
> +		packets = other_pkts;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * homa_backlog_rcv() - Invoked to handle packets saved on a socket's
> + * backlog because it was locked when the packets first arrived.
> + * @sk:     Homa socket that owns the packet's destination port.
> + * @skb:    The incoming packet. This function takes ownership of the packet
> + *          (we'll delete it).
> + *
> + * Return:  Always returns 0.
> + */
> +int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +	pr_warn("unimplemented backlog_rcv invoked on Homa socket\n");
> +	kfree_skb(skb);
> +	return 0;
> +}
> +
> +/**
> + * homa_err_handler_v4() - Invoked by IP to handle an incoming error
> + * packet, such as ICMP UNREACHABLE.
> + * @skb:   The incoming packet.
> + * @info:  Information about the error that occurred?
> + *
> + * Return: zero, or a negative errno if the error couldn't be handled here.
> + */
> +int homa_err_handler_v4(struct sk_buff *skb, u32 info)
> +{
> +	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
> +	const struct iphdr *iph = ip_hdr(skb);
> +	int type = icmp_hdr(skb)->type;
> +	int code = icmp_hdr(skb)->code;
> +
> +	if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) {
> +		char *icmp = (char *)icmp_hdr(skb);
> +		struct common_header *h;
> +
> +		iph = (struct iphdr *)(icmp + sizeof(struct icmphdr));
> +		h = (struct common_header *)(icmp + sizeof(struct icmphdr)
> +				+ iph->ihl * 4);
> +		homa_abort_rpcs(homa, &saddr, ntohs(h->dport), -ENOTCONN);
> +	} else if (type == ICMP_DEST_UNREACH) {
> +		int error;
> +
> +		if (code == ICMP_PROT_UNREACH)
> +			error = -EPROTONOSUPPORT;
> +		else
> +			error = -EHOSTUNREACH;
> +		homa_abort_rpcs(homa, &saddr, 0, error);
> +	} else {
> +		pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n",
> +			  __func__, info, type, code);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * homa_err_handler_v6() - Invoked by IP to handle an incoming error
> + * packet, such as ICMP UNREACHABLE.
> + * @skb:    The incoming packet.
> + * @opt:    Not used.
> + * @type:   Type of ICMP packet.
> + * @code:   Additional information about the error.
> + * @offset: Not used.
> + * @info:   Information about the error that occurred?
> + *
> + * Return: zero, or a negative errno if the error couldn't be handled here.
> + */
> +int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
> +			u8 type,  u8 code,  int offset,  __be32 info)
> +{
> +	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
> +
> +	if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) {
> +		char *icmp = (char *)icmp_hdr(skb);
> +		struct common_header *h;
> +
> +		iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr));
> +		h = (struct common_header *)(icmp + sizeof(struct icmphdr)
> +				+ HOMA_IPV6_HEADER_LENGTH);
> +		homa_abort_rpcs(homa, &iph->daddr, ntohs(h->dport), -ENOTCONN);
> +	} else if (type == ICMPV6_DEST_UNREACH) {
> +		int error;
> +
> +		if (code == ICMP_PROT_UNREACH)
> +			error = -EPROTONOSUPPORT;
> +		else
> +			error = -EHOSTUNREACH;
> +		homa_abort_rpcs(homa, &iph->daddr, 0, error);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * homa_poll() - Invoked by Linux as part of implementing select, poll,
> + * epoll, etc.
> + * @file:  Open file that is participating in a poll, select, etc.
> + * @sock:  A Homa socket, associated with @file.
> + * @wait:  This table will be registered with the socket, so that it
> + *         is notified when the socket's ready state changes.
> + *
> + * The socket is always reported as writable. It is reported as readable
> + * when either its ready_requests or its ready_responses list is nonempty.
> + *
> + * Return: A mask of bits such as EPOLLIN, which indicate the current
> + *         state of the socket.
> + */
> +__poll_t homa_poll(struct file *file, struct socket *sock,
> +		   struct poll_table_struct *wait)
> +{
> +	__u32 events = POLLOUT | POLLWRNORM;
> +	struct sock *sk = sock->sk;
> +
> +	/* The socket lock is deliberately not acquired here; that seems
> +	 * to be standard practice for poll functions.
> +	 */
> +	sock_poll_wait(file, sock, wait);
> +
> +	/* Both ready lists empty: there is only writability to report. */
> +	if (list_empty(&homa_sk(sk)->ready_requests) &&
> +	    list_empty(&homa_sk(sk)->ready_responses))
> +		return (__force __poll_t)events;
> +
> +	return (__force __poll_t)(events | POLLIN | POLLRDNORM);
> +}
> +
> +/**
> + * homa_hrtimer() - This function is invoked by the hrtimer mechanism to
> + * wake up the timer thread. Runs at IRQ level.
> + * @timer:   The timer that triggered; not used.
> + *
> + * Return:   Always HRTIMER_NORESTART; homa_timer_main() re-arms the timer
> + *           itself with hrtimer_start() before each sleep.
> + */
> +enum hrtimer_restart homa_hrtimer(struct hrtimer *timer)
> +{
> +	wake_up_process(timer_kthread);
> +	return HRTIMER_NORESTART;
> +}
> +
> +/**
> + * homa_timer_main() - Top-level function for the timer thread. Until the
> + * global "exiting" flag is seen, each iteration arms a one-shot relative
> + * hrtimer for 1 ms (callback: homa_hrtimer()), sleeps in schedule(), and
> + * then calls homa_timer().
> + * @transport:  Pointer to struct homa; passed on to homa_timer().
> + *
> + * The task state is set to TASK_UNINTERRUPTIBLE before "exiting" is
> + * tested and the timer is armed, presumably so that a wakeup arriving
> + * before schedule() is not lost; keep that ordering when editing the loop.
> + * On exit the hrtimer (which lives on this thread's stack) is cancelled
> + * and timer_thread_done is completed.
> + *
> + * Return:         Always 0 (the thread finishes through
> + *                 kthread_complete_and_exit(), so the final return
> + *                 statement is presumably never reached).
> + */
> +int homa_timer_main(void *transport)
> +{
> +	struct homa *homa = (struct homa *)transport;
> +	struct hrtimer hrtimer;
> +	ktime_t tick_interval;
> +	u64 nsec;
> +
> +	hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	hrtimer.function = &homa_hrtimer;
> +	nsec = 1000000;                   /* 1 ms */
> +	tick_interval = ns_to_ktime(nsec);
> +	while (1) {
> +		set_current_state(TASK_UNINTERRUPTIBLE);
> +		if (!exiting) {
> +			hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL);
> +			schedule();
> +		}
> +		__set_current_state(TASK_RUNNING);
> +		if (exiting)
> +			break;
> +		homa_timer(homa);
> +	}
> +	hrtimer_cancel(&hrtimer);
> +	kthread_complete_and_exit(&timer_thread_done, 0);
> +	return 0;
> +}
> diff --git a/net/homa/homa_utils.c b/net/homa/homa_utils.c
> new file mode 100644
> index 000000000000..905d00c836bd
> --- /dev/null
> +++ b/net/homa/homa_utils.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: BSD-2-Clause
> +
> +/* This file contains miscellaneous utility functions for Homa, such
> + * as initializing and destroying homa structs.
> + */
> +
> +#include "homa_impl.h"
> +#include "homa_peer.h"
> +#include "homa_rpc.h"
> +#include "homa_stub.h"
> +
> +struct completion homa_pacer_kthread_done; /* homa_destroy() waits on this. */
> +
> +/**
> + * homa_init() - Constructor for homa objects.
> + * @homa:   Object to initialize.
> + *
> + * Allocates and initializes the port map and the peer table, fills in
> + * default configuration values, and finally starts the pacer thread.
> + *
> + * Return:  0 on success, or a negative errno if there was an error. Even
> + *          if an error occurs, it is safe (and necessary) to call
> + *          homa_destroy at some point.
> + */
> +int homa_init(struct homa *homa)
> +{
> +	int err;
> +
> +	_Static_assert(HOMA_MAX_PRIORITIES >= 8,
> +		       "homa_init assumes at least 8 priority levels");
> +
> +	/* Clear every pointer that homa_destroy tests, so that it can be
> +	 * called safely after any of the early error returns below.
> +	 */
> +	homa->pacer_kthread = NULL;
> +	homa->port_map = NULL;
> +	homa->peers = NULL;
> +	init_completion(&homa_pacer_kthread_done);
> +	atomic64_set(&homa->next_outgoing_id, 2);
> +	atomic64_set(&homa->link_idle_time, sched_clock());
> +	spin_lock_init(&homa->pacer_mutex);
> +	homa->pacer_fifo_fraction = 50;
> +	homa->pacer_fifo_count = 1;
> +	homa->pacer_wake_time = 0;
> +	spin_lock_init(&homa->throttle_lock);
> +	INIT_LIST_HEAD_RCU(&homa->throttled_rpcs);
> +	homa->throttle_add = 0;
> +	homa->throttle_min_bytes = 200;
> +	homa->next_client_port = HOMA_MIN_DEFAULT_PORT;
> +	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
> +	if (!homa->port_map) {
> +		pr_err("%s couldn't create port_map: kmalloc failure\n",
> +		       __func__);
> +		return -ENOMEM;
> +	}
> +	homa_socktab_init(homa->port_map);
> +	homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL);
> +	if (!homa->peers) {
> +		pr_err("%s couldn't create peers: kmalloc failure\n",
> +		       __func__);
> +		return -ENOMEM;
> +	}
> +	err = homa_peertab_init(homa->peers);
> +	if (err) {
> +		pr_err("%s couldn't initialize peer table (errno %d)\n",
> +		       __func__, -err);
> +		return err;
> +	}
> +
> +	/* Wild guesses to initialize configuration values... */
> +	homa->unsched_bytes = 40000;
> +	homa->window_param = 100000;
> +	homa->link_mbps = 25000;
> +	homa->fifo_grant_increment = 10000;
> +	homa->grant_fifo_fraction = 50;
> +	homa->max_overcommit = 8;
> +	homa->max_incoming = 400000;
> +	homa->max_rpcs_per_peer = 1;
> +	homa->resend_ticks = 5;
> +	homa->resend_interval = 5;
> +	homa->timeout_ticks = 100;
> +	homa->timeout_resends = 5;
> +	homa->request_ack_ticks = 2;
> +	homa->reap_limit = 10;
> +	homa->dead_buffs_limit = 5000;
> +	homa->max_dead_buffs = 0;
> +	homa->pacer_exit = false;
> +	homa->max_nic_queue_ns = 5000;
> +	homa->ns_per_mbyte = 0;
> +	homa->max_gso_size = 10000;
> +	homa->gso_force_software = 0;
> +	homa->max_gro_skbs = 20;
> +	homa->gro_policy = HOMA_GRO_NORMAL;
> +	homa->timer_ticks = 0;
> +	homa->flags = 0;
> +	homa->bpage_lease_usecs = 10000;
> +	homa->next_id = 0;
> +	homa_outgoing_sysctl_changed(homa);
> +	homa_incoming_sysctl_changed(homa);
> +
> +	/* Start the pacer thread last: it is handed @homa, so every field
> +	 * it might read (pacer_exit in particular) must already be set.
> +	 */
> +	homa->pacer_kthread = kthread_run(homa_pacer_main, homa,
> +					  "homa_pacer");
> +	if (IS_ERR(homa->pacer_kthread)) {
> +		err = PTR_ERR(homa->pacer_kthread);
> +		homa->pacer_kthread = NULL;
> +		pr_err("couldn't create homa pacer thread: error %d\n", err);
> +		return err;
> +	}
> +	return 0;
> +}
> +
> +/**
> + * homa_destroy() -  Destructor for homa objects. If a pacer thread was
> + * started, stops it and waits on homa_pacer_kthread_done; then destroys
> + * and frees the port map followed by the peer table (that order matters).
> + * @homa:      Object to destroy.
> + *
> + * The port_map and peers pointers are reset to NULL once freed.
> + * NOTE(review): pacer_kthread is not cleared in this function; confirm
> + * that homa_pacer_stop() does so if a second call must be harmless.
> + */
> +void homa_destroy(struct homa *homa)
> +{
> +	if (homa->pacer_kthread) {
> +		homa_pacer_stop(homa);
> +		wait_for_completion(&homa_pacer_kthread_done);
> +	}
> +
> +	/* The order of the following statements matters! */
> +	if (homa->port_map) {
> +		homa_socktab_destroy(homa->port_map);
> +		kfree(homa->port_map);
> +		homa->port_map = NULL;
> +	}
> +	if (homa->peers) {
> +		homa_peertab_destroy(homa->peers);
> +		kfree(homa->peers);
> +		homa->peers = NULL;
> +	}
> +}
> +
> +/**
> + * homa_spin() - Busy-wait (never sleeping) until a given amount of time
> + * has elapsed, as measured by sched_clock().
> + * @ns:   How long to delay (in nanoseconds)
> + */
> +void homa_spin(int ns)
> +{
> +	const __u64 deadline = sched_clock() + ns;
> +
> +	/* Poll the clock until the deadline has been reached. */
> +	for (;;) {
> +		if (sched_clock() >= deadline)
> +			break;
> +	}
> +}
> +
> +/**
> + * homa_throttle_lock_slow() - This function implements the slow path for
> + * acquiring the throttle lock. It is invoked when the lock isn't immediately
> + * available. It simply waits for the lock with spin_lock_bh(); no
> + * statistics about the waiting time are recorded here.
> + * @homa:    Overall data about the Homa protocol implementation.
> + */
> +void homa_throttle_lock_slow(struct homa *homa)
> +	__acquires(&homa->throttle_lock)
> +{
> +	spin_lock_bh(&homa->throttle_lock);
> +}