[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <07647363-622b-4023-ba71-da213754a7ae@linux.alibaba.com>
Date: Tue, 26 Nov 2024 13:32:15 +0800
From: "D. Wythe" <alibuda@...ux.alibaba.com>
To: John Ousterhout <ouster@...stanford.edu>, netdev@...r.kernel.org,
linux-api@...r.kernel.org
Subject: Re: [PATCH net-next v2 11/12] net: homa: create homa_plumbing.c
homa_utils.c
On 11/12/24 7:40 AM, John Ousterhout wrote:
> homa_plumbing.c contains functions that connect Homa to the rest of
> the Linux kernel, such as dispatch tables used by Linux and the
> top-level functions that Linux invokes from those dispatch tables.
>
> homa_utils.c contains a few odds and ends, such as code to initialize
> and destroy struct homa's.
>
> Signed-off-by: John Ousterhout <ouster@...stanford.edu>
> ---
> net/homa/homa_plumbing.c | 965 +++++++++++++++++++++++++++++++++++++++
> net/homa/homa_utils.c | 150 ++++++
> 2 files changed, 1115 insertions(+)
> create mode 100644 net/homa/homa_plumbing.c
> create mode 100644 net/homa/homa_utils.c
>
> diff --git a/net/homa/homa_plumbing.c b/net/homa/homa_plumbing.c
> new file mode 100644
> index 000000000000..afd3a9cc97ba
> --- /dev/null
> +++ b/net/homa/homa_plumbing.c
> @@ -0,0 +1,965 @@
> +// SPDX-License-Identifier: BSD-2-Clause
> +
> +/* This file consists mostly of "glue" that hooks Homa into the rest of
> + * the Linux kernel. The guts of the protocol are in other files.
> + */
> +
> +#include "homa_impl.h"
> +#include "homa_peer.h"
> +#include "homa_pool.h"
> +
> +MODULE_LICENSE("Dual MIT/GPL");
> +MODULE_AUTHOR("John Ousterhout");
> +MODULE_DESCRIPTION("Homa transport protocol");
> +MODULE_VERSION("0.01");
> +
> +/* Not yet sure what these variables are for */
> +static long sysctl_homa_mem[3] __read_mostly;
> +static int sysctl_homa_rmem_min __read_mostly;
> +static int sysctl_homa_wmem_min __read_mostly;
> +
> +/* Global data for Homa. Never reference homa_data directory. Always use
> + * the homa variable instead; this allows overriding during unit tests.
> + */
> +static struct homa homa_data;
> +struct homa *homa = &homa_data;
> +
> +/* True means that the Homa module is in the process of unloading itself,
> + * so everyone should clean up.
> + */
> +static bool exiting;
> +
> +/* Thread that runs timer code to detect lost packets and crashed peers. */
> +static struct task_struct *timer_kthread;
> +
> +/* This structure defines functions that handle various operations on
> + * Homa sockets. These functions are relatively generic: they are called
> + * to implement top-level system calls. Many of these operations can
> + * be implemented by PF_INET6 functions that are independent of the
> + * Homa protocol.
> + */
> +static const struct proto_ops homa_proto_ops = {
> + .family = PF_INET,
> + .owner = THIS_MODULE,
> + .release = inet_release,
> + .bind = homa_bind,
> + .connect = inet_dgram_connect,
> + .socketpair = sock_no_socketpair,
> + .accept = sock_no_accept,
> + .getname = inet_getname,
> + .poll = homa_poll,
> + .ioctl = inet_ioctl,
> + .listen = sock_no_listen,
> + .shutdown = homa_shutdown,
> + .setsockopt = sock_common_setsockopt,
> + .getsockopt = sock_common_getsockopt,
> + .sendmsg = inet_sendmsg,
> + .recvmsg = inet_recvmsg,
> + .mmap = sock_no_mmap,
> + .set_peek_off = sk_set_peek_off,
> +};
> +
> +static const struct proto_ops homav6_proto_ops = {
> + .family = PF_INET6,
> + .owner = THIS_MODULE,
> + .release = inet6_release,
> + .bind = homa_bind,
> + .connect = inet_dgram_connect,
> + .socketpair = sock_no_socketpair,
> + .accept = sock_no_accept,
> + .getname = inet6_getname,
> + .poll = homa_poll,
> + .ioctl = inet6_ioctl,
> + .listen = sock_no_listen,
> + .shutdown = homa_shutdown,
> + .setsockopt = sock_common_setsockopt,
> + .getsockopt = sock_common_getsockopt,
> + .sendmsg = inet_sendmsg,
> + .recvmsg = inet_recvmsg,
> + .mmap = sock_no_mmap,
> + .set_peek_off = sk_set_peek_off,
> +};
> +
> +/* This structure also defines functions that handle various operations
> + * on Homa sockets. However, these functions are lower-level than those
> + * in homa_proto_ops: they are specific to the PF_INET or PF_INET6
> + * protocol family, and in many cases they are invoked by functions in
> + * homa_proto_ops. Most of these functions have Homa-specific implementations.
> + */
> +static struct proto homa_prot = {
> + .name = "HOMA",
> + .owner = THIS_MODULE,
> + .close = homa_close,
> + .connect = ip4_datagram_connect,
> + .disconnect = homa_disconnect,
> + .ioctl = homa_ioctl,
> + .init = homa_socket,
> + .destroy = NULL,
> + .setsockopt = homa_setsockopt,
> + .getsockopt = homa_getsockopt,
> + .sendmsg = homa_sendmsg,
> + .recvmsg = homa_recvmsg,
> + .backlog_rcv = homa_backlog_rcv,
> + .hash = homa_hash,
> + .unhash = homa_unhash,
> + .get_port = homa_get_port,
> + .sysctl_mem = sysctl_homa_mem,
> + .sysctl_wmem = &sysctl_homa_wmem_min,
> + .sysctl_rmem = &sysctl_homa_rmem_min,
> + .obj_size = sizeof(struct homa_sock),
> + .no_autobind = 1,
> +};
> +
> +static struct proto homav6_prot = {
> + .name = "HOMAv6",
> + .owner = THIS_MODULE,
> + .close = homa_close,
> + .connect = ip6_datagram_connect,
> + .disconnect = homa_disconnect,
> + .ioctl = homa_ioctl,
> + .init = homa_socket,
> + .destroy = NULL,
> + .setsockopt = homa_setsockopt,
> + .getsockopt = homa_getsockopt,
> + .sendmsg = homa_sendmsg,
> + .recvmsg = homa_recvmsg,
> + .backlog_rcv = homa_backlog_rcv,
> + .hash = homa_hash,
> + .unhash = homa_unhash,
> + .get_port = homa_get_port,
> + .sysctl_mem = sysctl_homa_mem,
> + .sysctl_wmem = &sysctl_homa_wmem_min,
> + .sysctl_rmem = &sysctl_homa_rmem_min,
> +
> + /* IPv6 data comes *after* Homa's data, and isn't included in
> + * struct homa_sock.
> + */
> + .obj_size = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo),
The implementation of inet6_sk_generic() has changed, so you should set .ipv6_pinfo_offset
here.
> + .no_autobind = 1,
> +};
> +
> +/* Top-level structure describing the Homa protocol. */
> +static struct inet_protosw homa_protosw = {
> + .type = SOCK_DGRAM,
> + .protocol = IPPROTO_HOMA,
> + .prot = &homa_prot,
> + .ops = &homa_proto_ops,
> + .flags = INET_PROTOSW_REUSE,
> +};
> +
> +static struct inet_protosw homav6_protosw = {
> + .type = SOCK_DGRAM,
> + .protocol = IPPROTO_HOMA,
> + .prot = &homav6_prot,
> + .ops = &homav6_proto_ops,
> + .flags = INET_PROTOSW_REUSE,
> +};
> +
> +/* This structure is used by IP to deliver incoming Homa packets to us. */
> +static struct net_protocol homa_protocol = {
> + .handler = homa_softirq,
> + .err_handler = homa_err_handler_v4,
> + .no_policy = 1,
> +};
> +
> +static struct inet6_protocol homav6_protocol = {
> + .handler = homa_softirq,
> + .err_handler = homa_err_handler_v6,
> + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
> +};
> +
> +/* Sizes of the headers for each Homa packet type, in bytes. */
> +static __u16 header_lengths[] = {
> + sizeof32(struct data_header),
> + 0,
> + sizeof32(struct resend_header),
> + sizeof32(struct unknown_header),
> + sizeof32(struct busy_header),
> + 0,
> + sizeof32(struct common_header),
> + sizeof32(struct need_ack_header),
> + sizeof32(struct ack_header)
> +};
> +
> +static DECLARE_COMPLETION(timer_thread_done);
> +
> +/**
> + * homa_load() - invoked when this module is loaded into the Linux kernel
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +static int __init homa_load(void)
> +{
> + int status;
> +
> + pr_notice("Homa module loading\n");
> + pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n",
> + sizeof32(struct data_header),
> + sizeof32(struct seg_header),
> + sizeof32(struct homa_ack),
> + sizeof32(struct homa_peer),
> + sizeof32(struct iphdr),
> + sizeof32(struct flowi),
> + sizeof32(struct ipv6hdr),
> + sizeof32(struct flowi6),
> + sizeof32(struct tcp_sock),
> + sizeof32(struct homa_rpc),
> + sizeof32(struct sk_buff),
> + sizeof32(struct homa_recvmsg_args),
> + sizeof32(union sockaddr_in_union),
> + HOMA_MAX_BPAGES,
> + NR_CPUS,
> + nr_cpu_ids,
> + MAX_NUMNODES);
> + status = proto_register(&homa_prot, 1);
> + if (status != 0) {
> + pr_err("proto_register failed for homa_prot: %d\n", status);
> + goto out;
> + }
> + status = proto_register(&homav6_prot, 1);
> + if (status != 0) {
> + pr_err("proto_register failed for homav6_prot: %d\n", status);
> + goto out;
> + }
> + inet_register_protosw(&homa_protosw);
> + inet6_register_protosw(&homav6_protosw);
It would be better to check the return value of inet6_register_protosw().
> + status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA);
> + if (status != 0) {
> + pr_err("inet_add_protocol failed in %s: %d\n", __func__,
> + status);
> + goto out_cleanup;
> + }
> + status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA);
> + if (status != 0) {
> + pr_err("inet6_add_protocol failed in %s: %d\n", __func__,
> + status);
> + goto out_cleanup;
> + }
> +
> + status = homa_init(homa);
> + if (status)
> + goto out_cleanup;
> +
> + timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer");
> + if (IS_ERR(timer_kthread)) {
> + status = PTR_ERR(timer_kthread);
> + pr_err("couldn't create homa pacer thread: error %d\n",
> + status);
> + timer_kthread = NULL;
> + goto out_cleanup;
> + }
> +
> + return 0;
> +
> +out_cleanup:
> + homa_destroy(homa);
> + inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
> + inet_unregister_protosw(&homa_protosw);
> + inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
> + inet6_unregister_protosw(&homav6_protosw);
> + proto_unregister(&homa_prot);
> + proto_unregister(&homav6_prot);
It seems a bit strange to me that this cleanup path relies on the premise that every reverse
operation can correctly tell whether its corresponding forward operation was actually executed.
Perhaps every one of these functions currently has that capability. It's up to you; I don't insist.
> +out:
> + return status;
> +}
> +
> +/**
> + * homa_unload() - invoked when this module is unloaded from the Linux kernel.
> + */
> +static void __exit homa_unload(void)
> +{
> + pr_notice("Homa module unloading\n");
> + exiting = true;
> +
> + if (timer_kthread)
> + wake_up_process(timer_kthread);
> + wait_for_completion(&timer_thread_done);
> + homa_destroy(homa);
> + inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
> + inet_unregister_protosw(&homa_protosw);
> + inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
> + inet6_unregister_protosw(&homav6_protosw);
> + proto_unregister(&homa_prot);
> + proto_unregister(&homav6_prot);
> +}
> +
> +module_init(homa_load);
> +module_exit(homa_unload);
Perhaps you could add MODULE_ALIAS_NET_PF_PROTO_TYPE so that the kernel automatically loads
the module when an IPPROTO_HOMA socket is created. This is only a functional suggestion; it's up to you.
> +
> +/**
> + * homa_bind() - Implements the bind system call for Homa sockets: associates
> + * a well-known service port with a socket. Unlike other AF_INET6 protocols,
> + * there is no need to invoke this system call for sockets that are only
> + * used as clients.
> + * @sock: Socket on which the system call was invoked.
> + * @addr: Contains the desired port number.
> + * @addr_len: Number of bytes in uaddr.
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
> +{
> + struct homa_sock *hsk = homa_sk(sock->sk);
> + union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr;
> + int port = 0;
> +
> + if (unlikely(addr->sa_family != sock->sk->sk_family))
> + return -EAFNOSUPPORT;
> + if (addr_in->in6.sin6_family == AF_INET6) {
> + if (addr_len < sizeof(struct sockaddr_in6))
> + return -EINVAL;
> + port = ntohs(addr_in->in4.sin_port);
> + } else if (addr_in->in4.sin_family == AF_INET) {
> + if (addr_len < sizeof(struct sockaddr_in))
> + return -EINVAL;
> + port = ntohs(addr_in->in6.sin6_port);
> + }
> + return homa_sock_bind(homa->port_map, hsk, port);
> +}
Is it legal to bind the same socket more than once? For example, binding to port 80 first and
then to port 8080. If not, I think you might need to check inet_num.
> +
> +/**
> + * homa_close() - Invoked when close system call is invoked on a Homa socket.
> + * @sk: Socket being closed
> + * @timeout: ??
> + */
> +void homa_close(struct sock *sk, long timeout)
> +{
> + struct homa_sock *hsk = homa_sk(sk);
> +
> + homa_sock_destroy(hsk);
> + sk_common_release(sk);
> +}
> +
> +/**
> + * homa_shutdown() - Implements the shutdown system call for Homa sockets.
> + * @sock: Socket to shut down.
> + * @how: Ignored: for other sockets, can independently shut down
> + * sending and receiving, but for Homa any shutdown will
> + * shut down everything.
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_shutdown(struct socket *sock, int how)
> +{
> + homa_sock_shutdown(homa_sk(sock->sk));
> + return 0;
> +}
> +
> +/**
> + * homa_disconnect() - Invoked when disconnect system call is invoked on a
> + * Homa socket.
> + * @sk: Socket to disconnect
> + * @flags: ??
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_disconnect(struct sock *sk, int flags)
> +{
> + pr_warn("unimplemented disconnect invoked on Homa socket\n");
> + return -EINVAL;
> +}
> +
> +/**
> + * homa_ioctl() - Implements the ioctl system call for Homa sockets.
> + * @sk: Socket on which the system call was invoked.
> + * @cmd: Identifier for a particular ioctl operation.
> + * @karg: Operation-specific argument; typically the address of a block
> + * of data in user address space.
> + *
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_ioctl(struct sock *sk, int cmd, int *karg)
> +{
> + return -EINVAL;
> +}
> +
> +/**
> + * homa_socket() - Implements the socket(2) system call for sockets.
> + * @sk: Socket on which the system call was invoked. The non-Homa
> + * parts have already been initialized.
> + *
> + * Return: always 0 (success).
> + */
> +int homa_socket(struct sock *sk)
> +{
> + struct homa_sock *hsk = homa_sk(sk);
> +
> + homa_sock_init(hsk, homa)
I noticed that homa_sock_init() performs a memory allocation, so perhaps you should check its
return value here:
-> hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL);
> + return 0;
> +}
> +
> +/**
> + * homa_setsockopt() - Implements the getsockopt system call for Homa sockets.
> + * @sk: Socket on which the system call was invoked.
> + * @level: Level at which the operation should be handled; will always
> + * be IPPROTO_HOMA.
> + * @optname: Identifies a particular setsockopt operation.
> + * @optval: Address in user space of information about the option.
> + * @optlen: Number of bytes of data at @optval.
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
> + unsigned int optlen)
> +{
> + struct homa_sock *hsk = homa_sk(sk);
> + struct homa_set_buf_args args;
> + int ret;
> +
> + if (level != IPPROTO_HOMA || optname != SO_HOMA_SET_BUF ||
> + optlen != sizeof(struct homa_set_buf_args))
> + return -EINVAL;
> +
> + if (copy_from_sockptr(&args, optval, optlen))
> + return -EFAULT;
> +
> + /* Do a trivial test to make sure we can at least write the first
> + * page of the region.
> + */
> + if (copy_to_user((__force void __user *)args.start, &args, sizeof(args)))
> + return -EFAULT;
> +
> + homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF");
> + ret = homa_pool_init(hsk, (__force void __user *)args.start, args.length);
> + homa_sock_unlock(hsk);
> + return ret;
> +}
> +
> +/**
> + * homa_getsockopt() - Implements the getsockopt system call for Homa sockets.
> + * @sk: Socket on which the system call was invoked.
> + * @level: ??
> + * @optname: Identifies a particular setsockopt operation.
> + * @optval: Address in user space where the option's value should be stored.
> + * @option: ??.
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_getsockopt(struct sock *sk, int level, int optname,
> + char __user *optval, int __user *option)
> +{
> + pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n",
> + level, optname);
> + return -EINVAL;
> +}
> +
> +/**
> + * homa_sendmsg() - Send a request or response message on a Homa socket.
> + * @sk: Socket on which the system call was invoked.
> + * @msg: Structure describing the message to send; the msg_control
> + * field points to additional information.
> + * @length: Number of bytes of the message.
> + * Return: 0 on success, otherwise a negative errno.
> + */
> +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
> +{
> + struct homa_sock *hsk = homa_sk(sk);
> + struct homa_sendmsg_args args;
> + int result = 0;
> + struct homa_rpc *rpc = NULL;
> + union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name;
> +
> + if (unlikely(!msg->msg_control_is_user)) {
> + result = -EINVAL;
> + goto error;
> + }
> + if (unlikely(copy_from_user(&args,
> + (__force void __user *)msg->msg_control,
> + sizeof(args)))) {
> + result = -EFAULT;
> + goto error;
> + }
> + if (addr->in6.sin6_family != sk->sk_family) {
> + result = -EAFNOSUPPORT;
> + goto error;
> + }
> + if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
> + (msg->msg_namelen < sizeof(struct sockaddr_in6) &&
> + addr->in6.sin6_family == AF_INET6)) {
> + result = -EINVAL;
> + goto error;
> + }
> +
> + if (!args.id) {
> + /* This is a request message. */
> + rpc = homa_rpc_new_client(hsk, addr);
> + if (IS_ERR(rpc)) {
> + result = PTR_ERR(rpc);
> + rpc = NULL;
> + goto error;
> + }
> + rpc->completion_cookie = args.completion_cookie;
> + result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
> + if (result)
> + goto error;
> + args.id = rpc->id;
> + homa_rpc_unlock(rpc);
> + rpc = NULL;
> +
> + if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
> + &args, sizeof(args)))) {
> + rpc = homa_find_client_rpc(hsk, args.id);
> + result = -EFAULT;
> + goto error;
> + }
> + } else {
> + /* This is a response message. */
> + struct in6_addr canonical_dest;
> +
> + if (args.completion_cookie != 0) {
> + result = -EINVAL;
> + goto error;
> + }
> + canonical_dest = canonical_ipv6_addr(addr);
> +
> + rpc = homa_find_server_rpc(hsk, &canonical_dest,
> + ntohs(addr->in6.sin6_port), args.id);
> + if (!rpc)
> + /* Return without an error if the RPC doesn't exist;
> + * this could be totally valid (e.g. client is
> + * no longer interested in it).
> + */
> + return 0;
> + if (rpc->error) {
> + result = rpc->error;
> + goto error;
> + }
> + if (rpc->state != RPC_IN_SERVICE) {
> + homa_rpc_unlock(rpc);
> + rpc = NULL;
> + result = -EINVAL;
> + goto error;
> + }
> + rpc->state = RPC_OUTGOING;
> +
> + result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
> + if (result && rpc->state != RPC_DEAD)
> + goto error;
> + homa_rpc_unlock(rpc);
> + }
> + return 0;
> +
> +error:
> + if (rpc) {
> + homa_rpc_free(rpc);
> + homa_rpc_unlock(rpc);
> + }
> + return result;
> +}
> +
> +/**
> + * homa_recvmsg() - Receive a message from a Homa socket.
> + * @sk: Socket on which the system call was invoked.
> + * @msg: Controlling information for the receive.
> + * @len: Total bytes of space available in msg->msg_iov; not used.
> + * @flags: Flags from system call; only MSG_DONTWAIT is used.
> + * @addr_len: Store the length of the sender address here
> + * Return: The length of the message on success, otherwise a negative
> + * errno.
> + */
> +int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
> + int *addr_len)
> +{
> + struct homa_sock *hsk = homa_sk(sk);
> + struct homa_recvmsg_args control;
> + struct homa_rpc *rpc;
> + int result;
> +
> + if (unlikely(!msg->msg_control)) {
> + /* This test isn't strictly necessary, but it provides a
> + * hook for testing kernel call times.
> + */
> + return -EINVAL;
> + }
> + if (msg->msg_controllen != sizeof(control)) {
> + result = -EINVAL;
> + goto done;
> + }
> + if (unlikely(copy_from_user(&control,
> + (__force void __user *)msg->msg_control,
> + sizeof(control)))) {
> + result = -EFAULT;
> + goto done;
> + }
> + control.completion_cookie = 0;
> +
> + if (control.num_bpages > HOMA_MAX_BPAGES ||
> + (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) {
> + result = -EINVAL;
> + goto done;
> + }
> + homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages,
> + control.bpage_offsets);
> + control.num_bpages = 0;
> +
> + rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT)
> + ? (control.flags | HOMA_RECVMSG_NONBLOCKING)
> + : control.flags, control.id);
> + if (IS_ERR(rpc)) {
> + /* If we get here, it means there was an error that prevented
> + * us from finding an RPC to return. If there's an error in
> + * the RPC itself we won't get here.
> + */
> + result = PTR_ERR(rpc);
> + goto done;
> + }
> + result = rpc->error ? rpc->error : rpc->msgin.length;
> +
> + /* Collect result information. */
> + control.id = rpc->id;
> + control.completion_cookie = rpc->completion_cookie;
> + if (likely(rpc->msgin.length >= 0)) {
> + control.num_bpages = rpc->msgin.num_bpages;
> + memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets,
> + sizeof(control.bpage_offsets));
> + }
> + if (sk->sk_family == AF_INET6) {
> + struct sockaddr_in6 *in6 = msg->msg_name;
> +
> + in6->sin6_family = AF_INET6;
> + in6->sin6_port = htons(rpc->dport);
> + in6->sin6_addr = rpc->peer->addr;
> + *addr_len = sizeof(*in6);
> + } else {
> + struct sockaddr_in *in4 = msg->msg_name;
> +
> + in4->sin_family = AF_INET;
> + in4->sin_port = htons(rpc->dport);
> + in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
> + *addr_len = sizeof(*in4);
> + }
> +
> + /* This indicates that the application now owns the buffers, so
> + * we won't free them in homa_rpc_free.
> + */
> + rpc->msgin.num_bpages = 0;
> +
> + /* Must release the RPC lock (and potentially free the RPC) before
> + * copying the results back to user space.
> + */
> + if (homa_is_client(rpc->id)) {
> + homa_peer_add_ack(rpc);
> + homa_rpc_free(rpc);
> + } else {
> + if (result < 0)
> + homa_rpc_free(rpc);
> + else
> + rpc->state = RPC_IN_SERVICE;
> + }
> + homa_rpc_unlock(rpc);
> +
> +done:
> + if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
> + &control, sizeof(control)))) {
> + /* Note: in this case the message's buffers will be leaked. */
> + pr_notice("%s couldn't copy back args\n", __func__);
> + result = -EFAULT;
> + }
> +
> + return result;
> +}
> +
> +/**
> + * homa_hash() - Not needed for Homa.
> + * @sk: Socket for the operation
> + * Return: ??
> + */
> +int homa_hash(struct sock *sk)
> +{
> + return 0;
> +}
> +
> +/**
> + * homa_unhash() - Not needed for Homa.
> + * @sk: Socket for the operation
> + */
> +void homa_unhash(struct sock *sk)
> +{
> +}
> +
> +/**
> + * homa_get_port() - It appears that this function is called to assign a
> + * default port for a socket.
> + * @sk: Socket for the operation
> + * @snum: Unclear what this is.
> + * Return: Zero for success, or a negative errno for an error.
> + */
> +int homa_get_port(struct sock *sk, unsigned short snum)
> +{
> + /* Homa always assigns ports immediately when a socket is created,
> + * so there is nothing to do here.
> + */
> + return 0;
> +}
> +
> +/**
> + * homa_softirq() - This function is invoked at SoftIRQ level to handle
> + * incoming packets.
> + * @skb: The incoming packet.
> + * Return: Always 0
> + */
> +int homa_softirq(struct sk_buff *skb)
> +{
> + struct sk_buff *packets, *other_pkts, *next;
> + struct sk_buff **prev_link, **other_link;
> + struct common_header *h;
> + int first_packet = 1;
> + int header_offset;
> + int pull_length;
> +
> + /* skb may actually contain many distinct packets, linked through
> + * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a
> + * pass through the list to process all of the short packets,
> + * leaving the longer packets in the list. Also, perform various
> + * prep/cleanup/error checking functions.
> + */
> + skb->next = skb_shinfo(skb)->frag_list;
> + skb_shinfo(skb)->frag_list = NULL;
> + packets = skb;
> + prev_link = &packets;
> + for (skb = packets; skb; skb = next) {
> + next = skb->next;
> +
> + /* Make the header available at skb->data, even if the packet
> + * is fragmented. One complication: it's possible that the IP
> + * header hasn't yet been removed (this happens for GRO packets
> + * on the frag_list, since they aren't handled explicitly by IP.
> + */
> + header_offset = skb_transport_header(skb) - skb->data;
> + pull_length = HOMA_MAX_HEADER + header_offset;
> + if (pull_length > skb->len)
> + pull_length = skb->len;
> + if (!pskb_may_pull(skb, pull_length))
> + goto discard;
> + if (header_offset)
> + __skb_pull(skb, header_offset);
> +
> + /* Reject packets that are too short or have bogus types. */
> + h = (struct common_header *)skb->data;
> + if (unlikely(skb->len < sizeof(struct common_header) ||
> + h->type < DATA || h->type >= BOGUS ||
> + skb->len < header_lengths[h->type - DATA]))
> + goto discard;
> +
> + if (first_packet)
> + first_packet = 0;
> +
> + /* Process the packet now if it is a control packet or
> + * if it contains an entire short message.
> + */
> + if (h->type != DATA || ntohl(((struct data_header *)h)
> + ->message_length) < 1400) {
> + *prev_link = skb->next;
> + skb->next = NULL;
> + homa_dispatch_pkts(skb, homa);
> + } else {
> + prev_link = &skb->next;
> + }
> + continue;
> +
> +discard:
> + *prev_link = skb->next;
> + kfree_skb(skb);
> + }
> +
> + /* Now process the longer packets. Each iteration of this loop
> + * collects all of the packets for a particular RPC and dispatches
> + * them.
> + */
> + while (packets) {
> + struct in6_addr saddr, saddr2;
> + struct common_header *h2;
> + struct sk_buff *skb2;
> +
> + skb = packets;
> + prev_link = &skb->next;
> + saddr = skb_canonical_ipv6_saddr(skb);
> + other_pkts = NULL;
> + other_link = &other_pkts;
> + h = (struct common_header *)skb->data;
> + for (skb2 = skb->next; skb2; skb2 = next) {
> + next = skb2->next;
> + h2 = (struct common_header *)skb2->data;
> + if (h2->sender_id == h->sender_id) {
> + saddr2 = skb_canonical_ipv6_saddr(skb2);
> + if (ipv6_addr_equal(&saddr, &saddr2)) {
> + *prev_link = skb2;
> + prev_link = &skb2->next;
> + continue;
> + }
> + }
> + *other_link = skb2;
> + other_link = &skb2->next;
> + }
> + *prev_link = NULL;
> + *other_link = NULL;
> + homa_dispatch_pkts(packets, homa);
> + packets = other_pkts;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * homa_backlog_rcv() - Invoked to handle packets saved on a socket's
> + * backlog because it was locked when the packets first arrived.
> + * @sk: Homa socket that owns the packet's destination port.
> + * @skb: The incoming packet. This function takes ownership of the packet
> + * (we'll delete it).
> + *
> + * Return: Always returns 0.
> + */
> +int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> + pr_warn("unimplemented backlog_rcv invoked on Homa socket\n");
> + kfree_skb(skb);
> + return 0;
> +}
> +
> +/**
> + * homa_err_handler_v4() - Invoked by IP to handle an incoming error
> + * packet, such as ICMP UNREACHABLE.
> + * @skb: The incoming packet.
> + * @info: Information about the error that occurred?
> + *
> + * Return: zero, or a negative errno if the error couldn't be handled here.
> + */
> +int homa_err_handler_v4(struct sk_buff *skb, u32 info)
> +{
> + const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
> + const struct iphdr *iph = ip_hdr(skb);
> + int type = icmp_hdr(skb)->type;
> + int code = icmp_hdr(skb)->code;
> +
> + if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) {
> + char *icmp = (char *)icmp_hdr(skb);
> + struct common_header *h;
> +
> + iph = (struct iphdr *)(icmp + sizeof(struct icmphdr));
> + h = (struct common_header *)(icmp + sizeof(struct icmphdr)
> + + iph->ihl * 4);
> + homa_abort_rpcs(homa, &saddr, ntohs(h->dport), -ENOTCONN);
> + } else if (type == ICMP_DEST_UNREACH) {
> + int error;
> +
> + if (code == ICMP_PROT_UNREACH)
> + error = -EPROTONOSUPPORT;
> + else
> + error = -EHOSTUNREACH;
> + homa_abort_rpcs(homa, &saddr, 0, error);
> + } else {
> + pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n",
> + __func__, info, type, code);
> + }
> + return 0;
> +}
> +
> +/**
> + * homa_err_handler_v6() - Invoked by IP to handle an incoming error
> + * packet, such as ICMP UNREACHABLE.
> + * @skb: The incoming packet.
> + * @opt: Not used.
> + * @type: Type of ICMP packet.
> + * @code: Additional information about the error.
> + * @offset: Not used.
> + * @info: Information about the error that occurred?
> + *
> + * Return: zero, or a negative errno if the error couldn't be handled here.
> + */
> +int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
> + u8 type, u8 code, int offset, __be32 info)
> +{
> + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
> +
> + if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) {
> + char *icmp = (char *)icmp_hdr(skb);
> + struct common_header *h;
> +
> + iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr));
> + h = (struct common_header *)(icmp + sizeof(struct icmphdr)
> + + HOMA_IPV6_HEADER_LENGTH);
> + homa_abort_rpcs(homa, &iph->daddr, ntohs(h->dport), -ENOTCONN);
> + } else if (type == ICMPV6_DEST_UNREACH) {
> + int error;
> +
> + if (code == ICMP_PROT_UNREACH)
> + error = -EPROTONOSUPPORT;
> + else
> + error = -EHOSTUNREACH;
> + homa_abort_rpcs(homa, &iph->daddr, 0, error);
> + }
> + return 0;
> +}
> +
> +/**
> + * homa_poll() - Invoked by Linux as part of implementing select, poll,
> + * epoll, etc.
> + * @file: Open file that is participating in a poll, select, etc.
> + * @sock: A Homa socket, associated with @file.
> + * @wait: This table will be registered with the socket, so that it
> + * is notified when the socket's ready state changes.
> + *
> + * Return: A mask of bits such as EPOLLIN, which indicate the current
> + * state of the socket.
> + */
> +__poll_t homa_poll(struct file *file, struct socket *sock,
> + struct poll_table_struct *wait)
> +{
> + struct sock *sk = sock->sk;
> + __u32 mask;
> +
> + /* It seems to be standard practice for poll functions *not* to
> + * acquire the socket lock, so we don't do it here; not sure
> + * why...
> + */
> +
> + sock_poll_wait(file, sock, wait);
> + mask = POLLOUT | POLLWRNORM;
> +
> + if (!list_empty(&homa_sk(sk)->ready_requests) ||
> + !list_empty(&homa_sk(sk)->ready_responses))
> + mask |= POLLIN | POLLRDNORM;
> + return (__force __poll_t)mask;
> +}
> +
> +/**
> + * homa_hrtimer() - This function is invoked by the hrtimer mechanism to
> + * wake up the timer thread. Runs at IRQ level.
> + * @timer: The timer that triggered; not used.
> + *
> + * Return: Always HRTIMER_RESTART.
> + */
> +enum hrtimer_restart homa_hrtimer(struct hrtimer *timer)
> +{
> + wake_up_process(timer_kthread);
> + return HRTIMER_NORESTART;
> +}
> +
> +/**
> + * homa_timer_main() - Top-level function for the timer thread.
> + * @transport: Pointer to struct homa.
> + *
> + * Return: Always 0.
> + */
> +int homa_timer_main(void *transport)
> +{
> + struct homa *homa = (struct homa *)transport;
> + struct hrtimer hrtimer;
> + ktime_t tick_interval;
> + u64 nsec;
> +
> + hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + hrtimer.function = &homa_hrtimer;
> + nsec = 1000000; /* 1 ms */
> + tick_interval = ns_to_ktime(nsec);
> + while (1) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (!exiting) {
> + hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL);
> + schedule();
> + }
> + __set_current_state(TASK_RUNNING);
> + if (exiting)
> + break;
> + homa_timer(homa);
> + }
> + hrtimer_cancel(&hrtimer);
> + kthread_complete_and_exit(&timer_thread_done, 0);
> + return 0;
> +}
> diff --git a/net/homa/homa_utils.c b/net/homa/homa_utils.c
> new file mode 100644
> index 000000000000..905d00c836bd
> --- /dev/null
> +++ b/net/homa/homa_utils.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: BSD-2-Clause
> +
> +/* This file contains miscellaneous utility functions for Homa, such
> + * as initializing and destroying homa structs.
> + */
> +
> +#include "homa_impl.h"
> +#include "homa_peer.h"
> +#include "homa_rpc.h"
> +#include "homa_stub.h"
> +
> +struct completion homa_pacer_kthread_done;
> +
> +/**
> + * homa_init() - Constructor for homa objects.
> + * @homa:   Object to initialize.
> + *
> + * Allocates and initializes the socket table and the peer table, fills in
> + * default configuration values and, as the final step, starts the pacer
> + * thread.
> + *
> + * Return:  0 on success, or a negative errno if there was an error. Even
> + *          if an error occurs, it is safe (and necessary) to call
> + *          homa_destroy at some point.
> + */
> +int homa_init(struct homa *homa)
> +{
> +	int err;
> +
> +	_Static_assert(HOMA_MAX_PRIORITIES >= 8,
> +		       "homa_init assumes at least 8 priority levels");
> +
> +	/* Clear every pointer that homa_destroy() examines before anything
> +	 * can fail, so that homa_destroy() sees well-defined values no
> +	 * matter where initialization stops.
> +	 */
> +	homa->pacer_kthread = NULL;
> +	homa->port_map = NULL;
> +	homa->peers = NULL;
> +
> +	init_completion(&homa_pacer_kthread_done);
> +	atomic64_set(&homa->next_outgoing_id, 2);
> +	atomic64_set(&homa->link_idle_time, sched_clock());
> +	spin_lock_init(&homa->pacer_mutex);
> +	homa->pacer_fifo_fraction = 50;
> +	homa->pacer_fifo_count = 1;
> +	homa->pacer_wake_time = 0;
> +	spin_lock_init(&homa->throttle_lock);
> +	INIT_LIST_HEAD_RCU(&homa->throttled_rpcs);
> +	homa->throttle_add = 0;
> +	homa->throttle_min_bytes = 200;
> +	homa->next_client_port = HOMA_MIN_DEFAULT_PORT;
> +	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
> +	if (!homa->port_map) {
> +		pr_err("%s couldn't create port_map: kmalloc failure\n",
> +		       __func__);
> +		return -ENOMEM;
> +	}
> +	homa_socktab_init(homa->port_map);
> +	homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL);
> +	if (!homa->peers) {
> +		pr_err("%s couldn't create peers: kmalloc failure\n",
> +		       __func__);
> +		return -ENOMEM;
> +	}
> +	err = homa_peertab_init(homa->peers);
> +	if (err) {
> +		pr_err("%s couldn't initialize peer table (errno %d)\n",
> +		       __func__, -err);
> +		return err;
> +	}
> +
> +	/* Wild guesses to initialize configuration values... */
> +	homa->unsched_bytes = 40000;
> +	homa->window_param = 100000;
> +	homa->link_mbps = 25000;
> +	homa->fifo_grant_increment = 10000;
> +	homa->grant_fifo_fraction = 50;
> +	homa->max_overcommit = 8;
> +	homa->max_incoming = 400000;
> +	homa->max_rpcs_per_peer = 1;
> +	homa->resend_ticks = 5;
> +	homa->resend_interval = 5;
> +	homa->timeout_ticks = 100;
> +	homa->timeout_resends = 5;
> +	homa->request_ack_ticks = 2;
> +	homa->reap_limit = 10;
> +	homa->dead_buffs_limit = 5000;
> +	homa->max_dead_buffs = 0;
> +	homa->pacer_exit = false;
> +	homa->max_nic_queue_ns = 5000;
> +	homa->ns_per_mbyte = 0;
> +	homa->max_gso_size = 10000;
> +	homa->gso_force_software = 0;
> +	homa->max_gro_skbs = 20;
> +	homa->gro_policy = HOMA_GRO_NORMAL;
> +	homa->timer_ticks = 0;
> +	homa->flags = 0;
> +	homa->bpage_lease_usecs = 10000;
> +	homa->next_id = 0;
> +	homa_outgoing_sysctl_changed(homa);
> +	homa_incoming_sysctl_changed(homa);
> +
> +	/* Start the pacer thread only after everything above has been set
> +	 * up, so that the new thread can never observe a partially
> +	 * initialized struct homa (in particular pacer_exit).
> +	 */
> +	homa->pacer_kthread = kthread_run(homa_pacer_main, homa,
> +					  "homa_pacer");
> +	if (IS_ERR(homa->pacer_kthread)) {
> +		err = PTR_ERR(homa->pacer_kthread);
> +		homa->pacer_kthread = NULL;
> +		pr_err("couldn't create homa pacer thread: error %d\n", err);
> +		return err;
> +	}
> +	return 0;
> +}
> +
> +/**
> + * homa_destroy() -  Destructor for homa objects.
> + * @homa:      Object to destroy.
> + *
> + * Stops the pacer thread (if one was started) and releases the socket
> + * table and the peer table. Each resource pointer is cleared once the
> + * resource has been released, so calling this function again on the same
> + * object does nothing.
> + */
> +void homa_destroy(struct homa *homa)
> +{
> +	if (homa->pacer_kthread) {
> +		homa_pacer_stop(homa);
> +		wait_for_completion(&homa_pacer_kthread_done);
> +
> +		/* Forget the thread so that a repeated call does not wait
> +		 * a second time on a completion that has already been
> +		 * consumed.
> +		 */
> +		homa->pacer_kthread = NULL;
> +	}
> +
> +	/* The order of the following statements matters! */
> +	if (homa->port_map) {
> +		homa_socktab_destroy(homa->port_map);
> +		kfree(homa->port_map);
> +		homa->port_map = NULL;
> +	}
> +	if (homa->peers) {
> +		homa_peertab_destroy(homa->peers);
> +		kfree(homa->peers);
> +		homa->peers = NULL;
> +	}
> +}
> +
> +/**
> + * homa_spin() - Busy-wait (never sleeping) until a time interval has passed.
> + * @ns:   Length of the delay, in nanoseconds.
> + */
> +void homa_spin(int ns)
> +{
> +	__u64 deadline = sched_clock() + ns;
> +
> +	/* Keep polling the clock until the deadline has been reached. */
> +	for (;;) {
> +		if (sched_clock() >= deadline)
> +			break;
> +	}
> +}
> +
> +/**
> + * homa_throttle_lock_slow() - This function implements the slow path for
> + * acquiring the throttle lock. It is invoked when the lock isn't immediately
> + * available. It waits for the lock using spin_lock_bh() and returns with the
> + * lock held. Note: in this version of the function no statistics about the
> + * waiting time are recorded; the body only acquires the lock.
> + * @homa: Overall data about the Homa protocol implementation.
> + */
> +void homa_throttle_lock_slow(struct homa *homa)
> + __acquires(&homa->throttle_lock)
> +{
> + spin_lock_bh(&homa->throttle_lock);
> +}
Powered by blists - more mailing lists