lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89iL0y+0axq5RtshyLrZcJ8cJBqJ=OzCH3BW8qUjfqkdG7Q@mail.gmail.com>
Date: Mon, 27 Oct 2025 01:35:44 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Ido Schimmel <idosch@...dia.com>
Cc: netdev@...r.kernel.org, davem@...emloft.net, kuba@...nel.org, 
	pabeni@...hat.com, horms@...nel.org, dsahern@...nel.org, petrm@...dia.com, 
	willemb@...gle.com, daniel@...earbox.net, fw@...len.de, 
	ishaangandhi@...il.com, rbonica@...iper.net, tom@...bertland.com
Subject: Re: [PATCH net-next v2 1/3] ipv4: icmp: Add RFC 5837 support

On Mon, Oct 27, 2025 at 1:24 AM Ido Schimmel <idosch@...dia.com> wrote:
>
> Add the ability to append the incoming IP interface information to
> ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
> required for more meaningful traceroute results in unnumbered networks.
>
> The feature is disabled by default and controlled via a new sysctl
> ("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
> extensions to append to ICMP error messages. Currently, only a single
> value is supported, but the interface and the implementation should be
> able to support more extensions, if needed.
>
> Clone the skb and copy the relevant data portions before modifying the
> skb as the caller of __icmp_send() still owns the skb after the function
> returns. This should be fine since by default ICMP error messages are
> rate limited to 1000 per second and no more than 1 per second per
> specific host.
>
> Trim or pad the packet to 128 bytes before appending the ICMP extension
> structure in order to be compatible with legacy applications that assume
> that the ICMP extension structure always starts at this offset (the
> minimum length specified by RFC 4884).
>
> Reviewed-by: Petr Machata <petrm@...dia.com>
> Reviewed-by: David Ahern <dsahern@...nel.org>
> Reviewed-by: Willem de Bruijn <willemb@...gle.com>
> Signed-off-by: Ido Schimmel <idosch@...dia.com>
> ---
>
> Notes:
>     v2:
>     * Add a comment about field ordering.
>
>  Documentation/networking/ip-sysctl.rst |  17 +++
>  include/linux/icmp.h                   |  32 +++++
>  include/net/netns/ipv4.h               |   1 +
>  net/ipv4/icmp.c                        | 191 ++++++++++++++++++++++++-
>  net/ipv4/sysctl_net_ipv4.c             |  11 ++
>  5 files changed, 251 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index a06cb99d66dc..ece1187ba0f1 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -1796,6 +1796,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
>
>         Default: 0 (disabled)
>
> +icmp_errors_extension_mask - UNSIGNED INTEGER
> +       Bitmask of ICMP extensions to append to ICMPv4 error messages
> +       ("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
> +       The original datagram is trimmed / padded to 128 bytes in order to be
> +       compatible with applications that do not comply with RFC 4884.
> +
> +       Possible extensions are:
> +
> +       ==== ==============================================================
> +       0x01 Incoming IP interface information according to RFC 5837.
> +            Extension will include the index, IPv4 address (if present),
> +            name and MTU of the IP interface that received the datagram
> +            which elicited the ICMP error.
> +       ==== ==============================================================
> +
> +       Default: 0x00 (no extensions)
> +
>  igmp_max_memberships - INTEGER
>         Change the maximum number of multicast groups we can subscribe to.
>         Default: 20
> diff --git a/include/linux/icmp.h b/include/linux/icmp.h
> index 0af4d210ee31..043ec5d9c882 100644
> --- a/include/linux/icmp.h
> +++ b/include/linux/icmp.h
> @@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb,
>                            struct sock_ee_data_rfc4884 *out,
>                            int thlen, int off);
>
> +/* RFC 4884 */
> +#define ICMP_EXT_ORIG_DGRAM_MIN_LEN    128
> +#define ICMP_EXT_VERSION_2             2
> +
> +/* ICMP Extension Object Classes */
> +#define ICMP_EXT_OBJ_CLASS_IIO         2       /* RFC 5837 */
> +
> +/* Interface Information Object - RFC 5837 */
> +enum {
> +       ICMP_EXT_CTYPE_IIO_ROLE_IIF,
> +};
> +
> +#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE)  ((ROLE) << 6)
> +#define ICMP_EXT_CTYPE_IIO_MTU         BIT(0)
> +#define ICMP_EXT_CTYPE_IIO_NAME                BIT(1)
> +#define ICMP_EXT_CTYPE_IIO_IPADDR      BIT(2)
> +#define ICMP_EXT_CTYPE_IIO_IFINDEX     BIT(3)
> +
> +struct icmp_ext_iio_name_subobj {
> +       u8 len;
> +       char name[IFNAMSIZ];
> +};
> +
> +enum {
> +       /* RFC 5837 - Incoming IP Interface Role */
> +       ICMP_ERR_EXT_IIO_IIF,
> +       /* Add new constants above. Used by "icmp_errors_extension_mask"
> +        * sysctl.
> +        */
> +       ICMP_ERR_EXT_COUNT,
> +};
> +
>  #endif /* _LINUX_ICMP_H */
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 34eb3aecb3f2..0e96c90e56c6 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -135,6 +135,7 @@ struct netns_ipv4 {
>         u8 sysctl_icmp_echo_ignore_broadcasts;
>         u8 sysctl_icmp_ignore_bogus_error_responses;
>         u8 sysctl_icmp_errors_use_inbound_ifaddr;
> +       u8 sysctl_icmp_errors_extension_mask;
>         int sysctl_icmp_ratelimit;
>         int sysctl_icmp_ratemask;
>         int sysctl_icmp_msgs_per_sec;
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 1b7fb5d935ed..4abbec2f47ef 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -582,6 +582,185 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
>         return ERR_PTR(err);
>  }
>
> +struct icmp_ext_iio_addr4_subobj {
> +       __be16 afi;
> +       __be16 reserved;
> +       __be32 addr4;
> +};
> +
> +static unsigned int icmp_ext_iio_len(void)
> +{
> +       return sizeof(struct icmp_extobj_hdr) +
> +               /* ifIndex */
> +               sizeof(__be32) +
> +               /* Interface Address Sub-Object */
> +               sizeof(struct icmp_ext_iio_addr4_subobj) +
> +               /* Interface Name Sub-Object. Length must be a multiple of 4
> +                * bytes.
> +                */
> +               ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
> +               /* MTU */
> +               sizeof(__be32);
> +}
> +
> +static unsigned int icmp_ext_max_len(u8 ext_objs)
> +{
> +       unsigned int ext_max_len;
> +
> +       ext_max_len = sizeof(struct icmp_ext_hdr);
> +
> +       if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
> +               ext_max_len += icmp_ext_iio_len();
> +
> +       return ext_max_len;
> +}
> +
> +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
> +{
> +       struct in_device *in_dev;
> +       struct in_ifaddr *ifa;
> +
> +       in_dev = __in_dev_get_rcu(dev);
> +       if (!in_dev)
> +               return 0;
> +
> +       /* It is unclear from RFC 5837 which IP address should be chosen, but
> +        * it makes sense to choose a global unicast address.
> +        */
> +       in_dev_for_each_ifa_rcu(ifa, in_dev) {
> +               if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
> +                       continue;
> +               if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
> +                   ipv4_is_multicast(ifa->ifa_address))
> +                       continue;
> +               return ifa->ifa_address;
> +       }
> +
> +       return 0;
> +}
> +
> +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
> +                                   int iif)
> +{
> +       struct icmp_ext_iio_name_subobj *name_subobj;
> +       struct icmp_extobj_hdr *objh;
> +       struct net_device *dev;
> +       __be32 data;
> +
> +       if (!iif)
> +               return;
> +
> +       /* Add the fields in the order specified by RFC 5837. */
> +       objh = skb_put(skb, sizeof(*objh));
> +       objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
> +       objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
> +
> +       data = htonl(iif);
> +       skb_put_data(skb, &data, sizeof(__be32));
> +       objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
> +
> +       rcu_read_lock();
> +
> +       dev = dev_get_by_index_rcu(net, iif);
> +       if (!dev)
> +               goto out;
> +
> +       data = icmp_ext_iio_addr4_find(dev);
> +       if (data) {
> +               struct icmp_ext_iio_addr4_subobj *addr4_subobj;
> +
> +               addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
> +               addr4_subobj->afi = htons(ICMP_AFI_IP);
> +               addr4_subobj->addr4 = data;
> +               objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
> +       }
> +
> +       name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
> +       name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
> +       netdev_copy_name(dev, name_subobj->name);
> +       objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
> +
> +       data = htonl(READ_ONCE(dev->mtu));
> +       skb_put_data(skb, &data, sizeof(__be32));
> +       objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
> +
> +out:
> +       rcu_read_unlock();
> +       objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
> +}
> +
> +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
> +                                u8 ext_objs, int iif)
> +{
> +       if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
> +               icmp_ext_iio_iif_append(net, skb, iif);
> +}
> +
> +static struct sk_buff *
> +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
> +               unsigned int room, int iif)
> +{
> +       unsigned int payload_len, ext_max_len, ext_len;
> +       struct icmp_ext_hdr *ext_hdr;
> +       struct sk_buff *skb;
> +       u8 ext_objs;
> +       int nhoff;
> +
> +       switch (icmph->type) {
> +       case ICMP_DEST_UNREACH:
> +       case ICMP_TIME_EXCEEDED:
> +       case ICMP_PARAMETERPROB:
> +               break;
> +       default:
> +               return NULL;
> +       }
> +
> +       ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
> +       if (!ext_objs)
> +               return NULL;
> +
> +       ext_max_len = icmp_ext_max_len(ext_objs);
> +       if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
> +               return NULL;
> +
> +       skb = skb_clone(skb_in, GFP_ATOMIC);
> +       if (!skb)
> +               return NULL;
> +
> +       nhoff = skb_network_offset(skb);
> +       payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
> +
> +       if (!pskb_network_may_pull(skb, payload_len))
> +               goto free_skb;
> +
> +       if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
> +           __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
> +               goto free_skb;
> +
> +       if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
> +               goto free_skb;
> +
> +       ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
> +       ext_hdr->version = ICMP_EXT_VERSION_2;
> +
> +       icmp_ext_objs_append(net, skb, ext_objs, iif);
> +
> +       /* Do not send an empty extension structure. */
> +       ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
> +       if (ext_len == sizeof(*ext_hdr))
> +               goto free_skb;
> +
> +       ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
> +       /* The length of the original datagram in 32-bit words (RFC 4884). */
> +       icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
> +
> +       return skb;
> +
> +free_skb:
> +       consume_skb(skb);
> +       return NULL;
> +}
> +
>  /*
>   *     Send an ICMP message in response to a situation
>   *
> @@ -601,6 +780,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
>         struct icmp_bxm icmp_param;
>         struct rtable *rt = skb_rtable(skb_in);
>         bool apply_ratelimit = false;
> +       struct sk_buff *ext_skb;
>         struct ipcm_cookie ipc;
>         struct flowi4 fl4;
>         __be32 saddr;
> @@ -770,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
>         if (room <= (int)sizeof(struct iphdr))
>                 goto ende;
>
> -       icmp_param.data_len = skb_in->len - icmp_param.offset;
> +       ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
> +                                 parm->iif);
> +       if (ext_skb)
> +               icmp_param.skb = ext_skb;
> +
> +       icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
>         if (icmp_param.data_len > room)
>                 icmp_param.data_len = room;
>         icmp_param.head_len = sizeof(struct icmphdr);
> @@ -785,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
>         trace_icmp_send(skb_in, type, code);
>
>         icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
> +
> +       if (ext_skb)
> +               consume_skb(ext_skb);

nit : if (ext_skb) is not needed, consume_skb(NULL) is ok.

No need to resend.

Reviewed-by: Eric Dumazet <edumazet@...gle.com>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ