[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <willemdebruijn.kernel.2e842c8c31670@gmail.com>
Date: Wed, 22 Oct 2025 18:00:28 -0400
From: Willem de Bruijn <willemdebruijn.kernel@...il.com>
To: Ido Schimmel <idosch@...dia.com>,
netdev@...r.kernel.org
Cc: davem@...emloft.net,
kuba@...nel.org,
pabeni@...hat.com,
edumazet@...gle.com,
horms@...nel.org,
dsahern@...nel.org,
petrm@...dia.com,
willemb@...gle.com,
daniel@...earbox.net,
fw@...len.de,
ishaangandhi@...il.com,
rbonica@...iper.net,
tom@...bertland.com,
Ido Schimmel <idosch@...dia.com>
Subject: Re: [PATCH net-next 1/3] ipv4: icmp: Add RFC 5837 support
Ido Schimmel wrote:
> Add the ability to append the incoming IP interface information to
> ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
> required for more meaningful traceroute results in unnumbered networks.
>
> The feature is disabled by default and controlled via a new sysctl
> ("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
> extensions to append to ICMP error messages. Currently, only a single
> value is supported, but the interface and the implementation should be
> able to support more extensions, if needed.
>
> Clone the skb and copy the relevant data portions before modifying the
> skb as the caller of __icmp_send() still owns the skb after the function
> returns. This should be fine since by default ICMP error messages are
> rate limited to 1000 per second and no more than 1 per second per
> specific host.
>
> Trim or pad the packet to 128 bytes before appending the ICMP extension
> structure in order to be compatible with legacy applications that assume
> that the ICMP extension structure always starts at this offset (the
> minimum length specified by RFC 4884).
>
> Reviewed-by: Petr Machata <petrm@...dia.com>
> Signed-off-by: Ido Schimmel <idosch@...dia.com>
> ---
> Documentation/networking/ip-sysctl.rst | 17 +++
> include/linux/icmp.h | 32 +++++
> include/net/netns/ipv4.h | 1 +
> net/ipv4/icmp.c | 190 ++++++++++++++++++++++++-
> net/ipv4/sysctl_net_ipv4.c | 11 ++
> 5 files changed, 250 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index a06cb99d66dc..ece1187ba0f1 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -1796,6 +1796,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
>
> Default: 0 (disabled)
>
> +icmp_errors_extension_mask - UNSIGNED INTEGER
> + Bitmask of ICMP extensions to append to ICMPv4 error messages
> + ("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
> + The original datagram is trimmed / padded to 128 bytes in order to be
> + compatible with applications that do not comply with RFC 4884.
> +
> + Possible extensions are:
> +
> + ==== ==============================================================
> + 0x01 Incoming IP interface information according to RFC 5837.
> + Extension will include the index, IPv4 address (if present),
> + name and MTU of the IP interface that received the datagram
> + which elicited the ICMP error.
> + ==== ==============================================================
> +
> + Default: 0x00 (no extensions)
> +
> igmp_max_memberships - INTEGER
> Change the maximum number of multicast groups we can subscribe to.
> Default: 20
> diff --git a/include/linux/icmp.h b/include/linux/icmp.h
> index 0af4d210ee31..043ec5d9c882 100644
> --- a/include/linux/icmp.h
> +++ b/include/linux/icmp.h
> @@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb,
> struct sock_ee_data_rfc4884 *out,
> int thlen, int off);
>
> +/* RFC 4884 */
> +#define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128
> +#define ICMP_EXT_VERSION_2 2
> +
> +/* ICMP Extension Object Classes */
> +#define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */
> +
> +/* Interface Information Object - RFC 5837 */
> +enum {
> + ICMP_EXT_CTYPE_IIO_ROLE_IIF,
> +};
> +
> +#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6)
> +#define ICMP_EXT_CTYPE_IIO_MTU BIT(0)
> +#define ICMP_EXT_CTYPE_IIO_NAME BIT(1)
> +#define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2)
> +#define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3)
> +
> +struct icmp_ext_iio_name_subobj {
> + u8 len;
> + char name[IFNAMSIZ];
> +};
> +
> +enum {
> + /* RFC 5837 - Incoming IP Interface Role */
> + ICMP_ERR_EXT_IIO_IIF,
> + /* Add new constants above. Used by "icmp_errors_extension_mask"
> + * sysctl.
> + */
> + ICMP_ERR_EXT_COUNT,
> +};
> +
> #endif /* _LINUX_ICMP_H */
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 34eb3aecb3f2..0e96c90e56c6 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -135,6 +135,7 @@ struct netns_ipv4 {
> u8 sysctl_icmp_echo_ignore_broadcasts;
> u8 sysctl_icmp_ignore_bogus_error_responses;
> u8 sysctl_icmp_errors_use_inbound_ifaddr;
> + u8 sysctl_icmp_errors_extension_mask;
> int sysctl_icmp_ratelimit;
> int sysctl_icmp_ratemask;
> int sysctl_icmp_msgs_per_sec;
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 1b7fb5d935ed..44c4deb9d9da 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -582,6 +582,184 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
> return ERR_PTR(err);
> }
>
> +struct icmp_ext_iio_addr4_subobj {
> + __be16 afi;
> + __be16 reserved;
> + __be32 addr4;
> +};
> +
> +static unsigned int icmp_ext_iio_len(void)
> +{
> + return sizeof(struct icmp_extobj_hdr) +
> + /* ifIndex */
> + sizeof(__be32) +
> + /* Interface Address Sub-Object */
> + sizeof(struct icmp_ext_iio_addr4_subobj) +
> + /* Interface Name Sub-Object. Length must be a multiple of 4
> + * bytes.
> + */
> + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
> + /* MTU */
> + sizeof(__be32);
> +}
> +
> +static unsigned int icmp_ext_max_len(u8 ext_objs)
> +{
> + unsigned int ext_max_len;
> +
> + ext_max_len = sizeof(struct icmp_ext_hdr);
> +
> + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
> + ext_max_len += icmp_ext_iio_len();
> +
> + return ext_max_len;
> +}
> +
> +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
> +{
> + struct in_device *in_dev;
> + struct in_ifaddr *ifa;
> +
> + in_dev = __in_dev_get_rcu(dev);
> + if (!in_dev)
> + return 0;
> +
> + /* It is unclear from RFC 5837 which IP address should be chosen, but
> + * it makes sense to choose a global unicast address.
Is it possible for no such address to exist, and in that case should
one of the backup options be considered?
> + */
> + in_dev_for_each_ifa_rcu(ifa, in_dev) {
> + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
> + continue;
> + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
> + ipv4_is_multicast(ifa->ifa_address))
> + continue;
> + return ifa->ifa_address;
> + }
> +
> + return 0;
> +}
> +
> +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
> + int iif)
> +{
> + struct icmp_ext_iio_name_subobj *name_subobj;
> + struct icmp_extobj_hdr *objh;
> + struct net_device *dev;
> + __be32 data;
> +
> + if (!iif)
> + return;
> +
Might be good to add a comment that field order is prescribed by the RFC.
> + objh = skb_put(skb, sizeof(*objh));
> + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
> + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
> +
> + data = htonl(iif);
> + skb_put_data(skb, &data, sizeof(__be32));
> + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
> +
> + rcu_read_lock();
> +
> + dev = dev_get_by_index_rcu(net, iif);
> + if (!dev)
> + goto out;
> +
> + data = icmp_ext_iio_addr4_find(dev);
> + if (data) {
> + struct icmp_ext_iio_addr4_subobj *addr4_subobj;
> +
> + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
> + addr4_subobj->afi = htons(ICMP_AFI_IP);
> + addr4_subobj->addr4 = data;
> + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
> + }
> +
> + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
> + name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
> + netdev_copy_name(dev, name_subobj->name);
> + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
> +
> + data = htonl(READ_ONCE(dev->mtu));
> + skb_put_data(skb, &data, sizeof(__be32));
> + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
> +
> +out:
> + rcu_read_unlock();
> + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
> +}
> +
> +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
> + u8 ext_objs, int iif)
> +{
> + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
> + icmp_ext_iio_iif_append(net, skb, iif);
> +}
> +
> +static struct sk_buff *
> +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
> + unsigned int room, int iif)
> +{
> + unsigned int payload_len, ext_max_len, ext_len;
> + struct icmp_ext_hdr *ext_hdr;
> + struct sk_buff *skb;
> + u8 ext_objs;
> + int nhoff;
> +
> + switch (icmph->type) {
> + case ICMP_DEST_UNREACH:
> + case ICMP_TIME_EXCEEDED:
> + case ICMP_PARAMETERPROB:
> + break;
> + default:
> + return NULL;
> + }
> +
> + ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
> + if (!ext_objs)
> + return NULL;
> +
> + ext_max_len = icmp_ext_max_len(ext_objs);
> + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
> + return NULL;
> +
> + skb = skb_clone(skb_in, GFP_ATOMIC);
> + if (!skb)
> + return NULL;
> +
> + nhoff = skb_network_offset(skb);
> + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
> +
> + if (!pskb_network_may_pull(skb, payload_len))
> + goto free_skb;
> +
> + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
> + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
> + goto free_skb;
> +
> + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
> + goto free_skb;
> +
> + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
> + ext_hdr->version = ICMP_EXT_VERSION_2;
> +
> + icmp_ext_objs_append(net, skb, ext_objs, iif);
> +
> + /* Do not send an empty extension structure. */
> + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
> + if (ext_len == sizeof(*ext_hdr))
> + goto free_skb;
> +
> + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
> + /* The length of the original datagram in 32-bit words (RFC 4884). */
> + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
> +
> + return skb;
> +
> +free_skb:
> + consume_skb(skb);
> + return NULL;
> +}
> +
> /*
> * Send an ICMP message in response to a situation
> *
> @@ -601,6 +779,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
> struct icmp_bxm icmp_param;
> struct rtable *rt = skb_rtable(skb_in);
> bool apply_ratelimit = false;
> + struct sk_buff *ext_skb;
> struct ipcm_cookie ipc;
> struct flowi4 fl4;
> __be32 saddr;
> @@ -770,7 +949,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
> if (room <= (int)sizeof(struct iphdr))
> goto ende;
>
> - icmp_param.data_len = skb_in->len - icmp_param.offset;
> + ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
> + parm->iif);
> + if (ext_skb)
> + icmp_param.skb = ext_skb;
> +
> + icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
> if (icmp_param.data_len > room)
> icmp_param.data_len = room;
> icmp_param.head_len = sizeof(struct icmphdr);
> @@ -785,6 +969,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
> trace_icmp_send(skb_in, type, code);
>
> icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
> +
> + if (ext_skb)
> + consume_skb(ext_skb);
> ende:
> ip_rt_put(rt);
> out_unlock:
> @@ -1502,6 +1689,7 @@ static int __net_init icmp_sk_init(struct net *net)
> net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
> net->ipv4.sysctl_icmp_ratemask = 0x1818;
> net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
> + net->ipv4.sysctl_icmp_errors_extension_mask = 0;
> net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
> net->ipv4.sysctl_icmp_msgs_burst = 50;
>
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 24dbc603cc44..0c7c8f9041cb 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31;
> static int tcp_plb_max_cong_thresh = 256;
> static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
> static int tcp_ecn_mode_max = 2;
> +static u32 icmp_errors_extension_mask_all =
> + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
>
> /* obsolete */
> static int sysctl_tcp_low_latency __read_mostly;
> @@ -674,6 +676,15 @@ static struct ctl_table ipv4_net_table[] = {
> .extra1 = SYSCTL_ZERO,
> .extra2 = SYSCTL_ONE
> },
> + {
> + .procname = "icmp_errors_extension_mask",
> + .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
> + .maxlen = sizeof(u8),
> + .mode = 0644,
> + .proc_handler = proc_dou8vec_minmax,
> + .extra1 = SYSCTL_ZERO,
> + .extra2 = &icmp_errors_extension_mask_all,
> + },
> {
> .procname = "icmp_ratelimit",
> .data = &init_net.ipv4.sysctl_icmp_ratelimit,
> --
> 2.51.0
>
Powered by blists - more mailing lists