[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251022065349.434123-2-idosch@nvidia.com>
Date: Wed, 22 Oct 2025 09:53:47 +0300
From: Ido Schimmel <idosch@...dia.com>
To: <netdev@...r.kernel.org>
CC: <davem@...emloft.net>, <kuba@...nel.org>, <pabeni@...hat.com>,
<edumazet@...gle.com>, <horms@...nel.org>, <dsahern@...nel.org>,
<petrm@...dia.com>, <willemb@...gle.com>, <daniel@...earbox.net>,
<fw@...len.de>, <ishaangandhi@...il.com>, <rbonica@...iper.net>,
<tom@...bertland.com>, Ido Schimmel <idosch@...dia.com>
Subject: [PATCH net-next 1/3] ipv4: icmp: Add RFC 5837 support
Add the ability to append the incoming IP interface information to
ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.
The feature is disabled by default and controlled via a new sysctl
("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.
Clone the skb and copy the relevant data portions before modifying the
skb as the caller of __icmp_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.
Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).
Reviewed-by: Petr Machata <petrm@...dia.com>
Signed-off-by: Ido Schimmel <idosch@...dia.com>
---
Documentation/networking/ip-sysctl.rst | 17 +++
include/linux/icmp.h | 32 +++++
include/net/netns/ipv4.h | 1 +
net/ipv4/icmp.c | 190 ++++++++++++++++++++++++-
net/ipv4/sysctl_net_ipv4.c | 11 ++
5 files changed, 250 insertions(+), 1 deletion(-)
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a06cb99d66dc..ece1187ba0f1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1796,6 +1796,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
Default: 0 (disabled)
+icmp_errors_extension_mask - UNSIGNED INTEGER
+ Bitmask of ICMP extensions to append to ICMPv4 error messages
+ ("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
+ The original datagram is trimmed / padded to 128 bytes in order to be
+ compatible with applications that do not comply with RFC 4884.
+
+ Possible extensions are:
+
+ ==== ==============================================================
+ 0x01 Incoming IP interface information according to RFC 5837.
+ Extension will include the index, IPv4 address (if present),
+ name and MTU of the IP interface that received the datagram
+ which elicited the ICMP error.
+ ==== ==============================================================
+
+ Default: 0x00 (no extensions)
+
igmp_max_memberships - INTEGER
Change the maximum number of multicast groups we can subscribe to.
Default: 20
diff --git a/include/linux/icmp.h b/include/linux/icmp.h
index 0af4d210ee31..043ec5d9c882 100644
--- a/include/linux/icmp.h
+++ b/include/linux/icmp.h
@@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb,
struct sock_ee_data_rfc4884 *out,
int thlen, int off);
+/* RFC 4884 */
+#define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128
+#define ICMP_EXT_VERSION_2 2
+
+/* ICMP Extension Object Classes */
+#define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */
+
+/* Interface Information Object - RFC 5837 */
+enum {
+ ICMP_EXT_CTYPE_IIO_ROLE_IIF,
+};
+
+#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6)
+#define ICMP_EXT_CTYPE_IIO_MTU BIT(0)
+#define ICMP_EXT_CTYPE_IIO_NAME BIT(1)
+#define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2)
+#define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3)
+
+struct icmp_ext_iio_name_subobj {
+ u8 len;
+ char name[IFNAMSIZ];
+};
+
+enum {
+ /* RFC 5837 - Incoming IP Interface Role */
+ ICMP_ERR_EXT_IIO_IIF,
+ /* Add new constants above. Used by "icmp_errors_extension_mask"
+ * sysctl.
+ */
+ ICMP_ERR_EXT_COUNT,
+};
+
#endif /* _LINUX_ICMP_H */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 34eb3aecb3f2..0e96c90e56c6 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,7 @@ struct netns_ipv4 {
u8 sysctl_icmp_echo_ignore_broadcasts;
u8 sysctl_icmp_ignore_bogus_error_responses;
u8 sysctl_icmp_errors_use_inbound_ifaddr;
+ u8 sysctl_icmp_errors_extension_mask;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_msgs_per_sec;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1b7fb5d935ed..44c4deb9d9da 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,6 +582,184 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
return ERR_PTR(err);
}
+struct icmp_ext_iio_addr4_subobj {
+ __be16 afi;
+ __be16 reserved;
+ __be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp_ext_iio_addr4_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return 0;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+ ipv4_is_multicast(ifa->ifa_address))
+ continue;
+ return ifa->ifa_address;
+ }
+
+ return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ data = icmp_ext_iio_addr4_find(dev);
+ if (data) {
+ struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+ addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+ addr4_subobj->afi = htons(ICMP_AFI_IP);
+ addr4_subobj->addr4 = data;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+ unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return NULL;
+ }
+
+ ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 32-bit words (RFC 4884). */
+ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
+
/*
* Send an ICMP message in response to a situation
*
@@ -601,6 +779,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct ipcm_cookie ipc;
struct flowi4 fl4;
__be32 saddr;
@@ -770,7 +949,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- icmp_param.data_len = skb_in->len - icmp_param.offset;
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ parm->iif);
+ if (ext_skb)
+ icmp_param.skb = ext_skb;
+
+ icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
@@ -785,6 +969,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+ if (ext_skb)
+ consume_skb(ext_skb);
ende:
ip_rt_put(rt);
out_unlock:
@@ -1502,6 +1689,7 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_errors_extension_mask = 0;
net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
net->ipv4.sysctl_icmp_msgs_burst = 50;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 24dbc603cc44..0c7c8f9041cb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -674,6 +676,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
+ {
+ .procname = "icmp_errors_extension_mask",
+ .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmp_errors_extension_mask_all,
+ },
{
.procname = "icmp_ratelimit",
.data = &init_net.ipv4.sysctl_icmp_ratelimit,
--
2.51.0
Powered by blists - more mailing lists