lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAA3_GnpJ-RV=1xaf4DKRd30RgH0HoAxXJA1tx0cvBCX05OcYuA@mail.gmail.com>
Date: Wed, 28 Jan 2026 19:46:44 +0900
From: 戸田晃太 <kota.toda@...-cybersecurity.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: pabeni@...hat.com, netdev@...r.kernel.org, linux-kernel@...r.kernel.org, 
	小池悠生 <yuki.koike@...-cybersecurity.com>, 
	戸田晃太 <kota.toda@...-cybersecurity.com>
Subject: Re: [PATCH net] bonding: Fix header_ops type confusion

Here is the patch with the barriers added, based on v6.12.67.

However, as Yuki said, we are wondering if this would be considered an
acceptable change
from the perspective of the maintainers (or in terms of Linux kernel
culture). This is because
the patch adds `READ_ONCE` to several locations outside of bonding subsystem.
Please let me know if you have any concerns regarding this point.

>  Also, please clarify what happens with stacks of two or more bonding devices ?

To clarify, currently the `header_ops` of the bottom-most
interface are used regardless of the number of bonding layers.
This patch changes it so that `&bond->bond_header_ops` is used
as the bond device's `header_ops`, regardless of the stack depth.

```
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f17a170d1..5ecc64e38 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1616,14 +1616,70 @@ static void bond_compute_features(struct bonding *bond)
     netdev_change_features(bond_dev);
 }

+static int bond_hard_header(struct sk_buff *skb, struct net_device *dev,
+        unsigned short type, const void *daddr,
+        const void *saddr, unsigned int len)
+{
+    struct bonding *bond = netdev_priv(dev);
+    struct net_device *slave_dev;
+
+    slave_dev = bond->header_slave_dev;
+
+    return dev_hard_header(skb, slave_dev, type, daddr, saddr, len);
+}
+
+static void bond_header_cache_update(struct hh_cache *hh, const
+struct net_device *dev,
+        const unsigned char *haddr)
+{
+    const struct bonding *bond = netdev_priv(dev);
+    void (*cache_update)(struct hh_cache *hh,
+        const struct net_device *dev,
+        const unsigned char *haddr);
+    struct net_device *slave_dev;
+
+    slave_dev = bond->header_slave_dev;
+
+    if (!slave_dev->header_ops || !(cache_update =
READ_ONCE(slave_dev->header_ops->cache_update)))
+        return;
+
+    cache_update(hh, slave_dev, haddr);
+}
+
+
 static void bond_setup_by_slave(struct net_device *bond_dev,
                 struct net_device *slave_dev)
 {
+    struct bonding *bond = netdev_priv(bond_dev);
     bool was_up = !!(bond_dev->flags & IFF_UP);

     dev_close(bond_dev);

-    bond_dev->header_ops        = slave_dev->header_ops;
+    /* Some functions are given dev as an argument
+    * while others not. When dev is not given, we cannot
+    * find out what is the slave device through struct bonding
+    * (the private data of bond_dev). Therefore, we need a raw
+    * header_ops variable instead of its pointer to const header_ops
+    * and assign slave's functions directly.
+    * For the other case, we set the wrapper functions that pass
+    * slave_dev to the wrapped functions.
+    */
+    bond->bond_header_ops.create = bond_hard_header;
+    bond->bond_header_ops.cache_update = bond_header_cache_update;
+    if (slave_dev->header_ops) {
+        WRITE_ONCE(bond->bond_header_ops.parse, slave_dev->header_ops->parse);
+        WRITE_ONCE(bond->bond_header_ops.cache, slave_dev->header_ops->cache);
+        WRITE_ONCE(bond->bond_header_ops.validate,
slave_dev->header_ops->validate);
+        WRITE_ONCE(bond->bond_header_ops.parse_protocol,
slave_dev->header_ops->parse_protocol);
+    } else {
+        WRITE_ONCE(bond->bond_header_ops.parse, NULL);
+        WRITE_ONCE(bond->bond_header_ops.cache, NULL);
+        WRITE_ONCE(bond->bond_header_ops.validate, NULL);
+        WRITE_ONCE(bond->bond_header_ops.parse_protocol, NULL);
+    }
+
+    bond->header_slave_dev      = slave_dev;
+    bond_dev->header_ops        = &bond->bond_header_ops;

     bond_dev->type            = slave_dev->type;
     bond_dev->hard_header_len   = slave_dev->hard_header_len;
@@ -2682,6 +2738,14 @@ static int bond_release_and_destroy(struct
net_device *bond_dev,
     struct bonding *bond = netdev_priv(bond_dev);
     int ret;

+    /* If slave_dev is the earliest registered one, we must clear
+    * the variables related to header_ops to avoid dangling pointer.
+    */
+    if (bond->header_slave_dev == slave_dev) {
+        bond->header_slave_dev = NULL;
+        bond_dev->header_ops = NULL;
+    }
+
     ret = __bond_release_one(bond_dev, slave_dev, false, true);
     if (ret == 0 && !bond_has_slaves(bond) &&
         bond_dev->reg_state != NETREG_UNREGISTERING) {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 77a99c8ab..0d2c1c852 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3150,35 +3150,41 @@ static inline int dev_hard_header(struct
sk_buff *skb, struct net_device *dev,
                   const void *daddr, const void *saddr,
                   unsigned int len)
 {
-    if (!dev->header_ops || !dev->header_ops->create)
+    int (*create)(struct sk_buff *skb, struct net_device *dev,
+        unsigned short type, const void *daddr,
+        const void *saddr, unsigned int len);
+    if (!dev->header_ops || !(create = READ_ONCE(dev->header_ops->create)))
         return 0;

-    return dev->header_ops->create(skb, dev, type, daddr, saddr, len);
+    return create(skb, dev, type, daddr, saddr, len);
 }

 static inline int dev_parse_header(const struct sk_buff *skb,
                    unsigned char *haddr)
 {
+    int (*parse)(const struct sk_buff *skb, unsigned char *haddr);
     const struct net_device *dev = skb->dev;

-    if (!dev->header_ops || !dev->header_ops->parse)
+    if (!dev->header_ops || !(parse = READ_ONCE(dev->header_ops->parse)))
         return 0;
-    return dev->header_ops->parse(skb, haddr);
+    return parse(skb, haddr);
 }

 static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
 {
+    __be16    (*parse_protocol)(const struct sk_buff *skb);
     const struct net_device *dev = skb->dev;

-    if (!dev->header_ops || !dev->header_ops->parse_protocol)
+    if (!dev->header_ops || !(parse_protocol =
READ_ONCE(dev->header_ops->parse_protocol)))
         return 0;
-    return dev->header_ops->parse_protocol(skb);
+    return parse_protocol(skb);
 }

 /* ll_header must have at least hard_header_len allocated */
 static inline bool dev_validate_header(const struct net_device *dev,
                        char *ll_header, int len)
 {
+    bool    (*validate)(const char *ll_header, unsigned int len);
     if (likely(len >= dev->hard_header_len))
         return true;
     if (len < dev->min_header_len)
@@ -3189,15 +3195,15 @@ static inline bool dev_validate_header(const
struct net_device *dev,
         return true;
     }

-    if (dev->header_ops && dev->header_ops->validate)
-        return dev->header_ops->validate(ll_header, len);
+    if (dev->header_ops && (validate = READ_ONCE(dev->header_ops->validate)))
+        return validate(ll_header, len);

     return false;
 }

 static inline bool dev_has_header(const struct net_device *dev)
 {
-    return dev->header_ops && dev->header_ops->create;
+    return dev->header_ops && READ_ONCE(dev->header_ops->create);
 }

 /*
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 95f67b308..c37800e17 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -215,6 +215,11 @@ struct bond_ipsec {
  */
 struct bonding {
     struct   net_device *dev; /* first - useful for panic debug */
+    struct   net_device *header_slave_dev;  /* slave net_device for
header_ops */
+    /* maintained as a non-const variable
+    * because bond's header_ops should change depending on slaves.
+    */
+    struct   header_ops bond_header_ops;
     struct   slave __rcu *curr_active_slave;
     struct   slave __rcu *current_arp_slave;
     struct   slave __rcu *primary_slave;
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 76d2cd2e2..dec638763 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -522,7 +522,7 @@ wpan_dev_hard_header(struct sk_buff *skb, struct
net_device *dev,
 {
     struct wpan_dev *wpan_dev = dev->ieee802154_ptr;

-    return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len);
+    return READ_ONCE(wpan_dev->header_ops->create)(skb, dev, daddr,
saddr, len);
 }
 #endif

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96786016d..ff948e35e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1270,7 +1270,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
         = NULL;

     if (neigh->dev->header_ops)
-        update = neigh->dev->header_ops->cache_update;
+        update = READ_ONCE(neigh->dev->header_ops->cache_update);

     if (update) {
         hh = &neigh->hh;
@@ -1540,7 +1540,7 @@ static void neigh_hh_init(struct neighbour *n)
      * hh_cache entry.
      */
     if (!hh->hh_len)
-        dev->header_ops->cache(n, hh, prot);
+        READ_ONCE(dev->header_ops->cache)(n, hh, prot);

     write_unlock_bh(&n->lock);
 }
@@ -1556,7 +1556,7 @@ int neigh_resolve_output(struct neighbour
*neigh, struct sk_buff *skb)
         struct net_device *dev = neigh->dev;
         unsigned int seq;

-        if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
+        if (READ_ONCE(dev->header_ops->cache) && !READ_ONCE(neigh->hh.hh_len))
             neigh_hh_init(neigh);

         do {
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7822b2144..421bea6eb 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -278,7 +278,7 @@ static int arp_constructor(struct neighbour *neigh)
             memcpy(neigh->ha, dev->broadcast, dev->addr_len);
         }

-        if (dev->header_ops->cache)
+        if (READ_ONCE(dev->header_ops->cache))
             neigh->ops = &arp_hh_ops;
         else
             neigh->ops = &arp_generic_ops;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d961e6c2d..d81f509ec 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -361,7 +361,7 @@ static int ndisc_constructor(struct neighbour *neigh)
             neigh->nud_state = NUD_NOARP;
             memcpy(neigh->ha, dev->broadcast, dev->addr_len);
         }
-        if (dev->header_ops->cache)
+        if (READ_ONCE(dev->header_ops->cache))
             neigh->ops = &ndisc_hh_ops;
         else
             neigh->ops = &ndisc_generic_ops;
```

2026年1月19日(月) 18:30 Eric Dumazet <edumazet@...gle.com>:
>
> On Mon, Jan 19, 2026 at 6:36 AM 戸田晃太 <kota.toda@...-cybersecurity.com> wrote:
> >
> > Thanks for your quick response.
> >
> > The following information is based on Linux kernel version 6.12.65,
> > the latest release in the 6.12 tree.
> > The kernel config is identical to that of the kernelCTF instance
> > (available at: https://storage.googleapis.com/kernelctf-build/releases/lts-6.12.65/.config)
> >
> >
> > This type confusion occurs in several locations, including,
> > for example, `ipgre_header` (`header_ops->create`),
> > where the private data of the network device is incorrectly cast as
> > `struct ip_tunnel *`.
> >
> > ```
> > static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
> >       unsigned short type,
> >       const void *daddr, const void *saddr, unsigned int len)
> > {
> >   struct ip_tunnel *t = netdev_priv(dev);
> >   struct iphdr *iph;
> >   struct gre_base_hdr *greh;
> > ...
> > ```
> >
> > When a bond interface is given to this function,
> > it should not reference the private data as `struct ip_tunnel *`,
> > because the bond interface uses the private data as `struct bonding *`.
> > (quickly confirmed by seeing drivers/net/bonding/bond_netlink.c:909)
> >
> > ```
> > struct rtnl_link_ops bond_link_ops __read_mostly = {
> >     .kind            = "bond",
> >     .priv_size        = sizeof(struct bonding),
> > ...
> > ```
> >
> > The stack trace below is the backtrace of all stack frame during a
> > call to `ipgre_header`.
> >
> > ```
> > ipgre_header at net/ipv4/ip_gre.c:890
> > dev_hard_header at ./include/linux/netdevice.h:3156
> > packet_snd at net/packet/af_packet.c:3082
> > packet_sendmsg at net/packet/af_packet.c:3162
> > sock_sendmsg_nosec at net/socket.c:729
> > __sock_sendmsg at net/socket.c:744
> > __sys_sendto at net/socket.c:2213
> > __do_sys_sendto at net/socket.c:2225
> > __se_sys_sendto at net/socket.c:2221
> > __x64_sys_sendto at net/socket.c:2221
> > do_syscall_x64 at arch/x86/entry/common.c:47
> > do_syscall_64 at arch/x86/entry/common.c:78
> > entry_SYSCALL_64 at arch/x86/entry/entry_64.S:121
> > ```
> >
> > This causes memory corruption during subsequent operations.
> >
> > The following stack trace shows a General Protection Fault triggered
> > when sending a packet
> > to a bonding interface that has an IPv4 GRE interface as a slave.
> >
> > ```
> > [    1.712329] Oops: general protection fault, probably for
> > non-canonical address 0xdead0000cafebabe: 0000 [#1] SMP NOPTI
> > [    1.712972] CPU: 0 UID: 1000 PID: 205 Comm: exp Not tainted 6.12.65 #1
> > [    1.713344] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > BIOS Arch Linux 1.17.0-2-2 04/01/2014
> > [    1.713890] RIP: 0010:skb_release_data+0x8a/0x1c0
> > [    1.714162] Code: c0 00 00 00 49 03 86 c8 00 00 00 0f b6 10 f6 c2
> > 01 74 48 48 8b 70 28 48 85 f6 74 3f 41 0f b6 5d 00 83 e3 10 40 f6 c6
> > 01 75 24 <48> 8b 06 ba 01 00 00 00 4c 89 f7 48 8b 00 ff d0 0f 1f 00 41
> > 8b6
> > [    1.715276] RSP: 0018:ffffc900007cfcc0 EFLAGS: 00010246
> > [    1.715583] RAX: ffff888106fe12c0 RBX: 0000000000000010 RCX: 0000000000000000
> > [    1.716036] RDX: 0000000000000017 RSI: dead0000cafebabe RDI: ffff8881059c4a00
> > [    1.716504] RBP: ffffc900007cfe10 R08: 0000000000000010 R09: 0000000000000000
> > [    1.716955] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
> > [    1.717429] R13: ffff888106fe12c0 R14: ffff8881059c4a00 R15: ffff888106e57000
> > [    1.717866] FS:  0000000038e54380(0000) GS:ffff88813bc00000(0000)
> > knlGS:0000000000000000
> > [    1.718350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [    1.718703] CR2: 00000000004bf480 CR3: 00000001009ec001 CR4: 0000000000772ef0
> > [    1.719109] PKRU: 55555554
> > [    1.719297] Call Trace:
> > [    1.719461]  <TASK>
> > [    1.719611]  sk_skb_reason_drop+0x58/0x120
> > [    1.719891]  packet_sendmsg+0xbcb/0x18f0
> > [    1.720166]  ? pcpu_alloc_area+0x186/0x260
> > [    1.720421]  __sys_sendto+0x1e2/0x1f0
> > [    1.720691]  __x64_sys_sendto+0x24/0x30
> > [    1.720948]  do_syscall_64+0x58/0x120
> > [    1.721174]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [    1.721509] RIP: 0033:0x42860d
> > [    1.721713] Code: c3 ff ff ff ff 64 89 02 eb b9 0f 1f 00 f3 0f 1e
> > fa 80 3d 5d 4a 09 00 00 41 89 ca 74 20 45 31 c9 45 31 c0 b8 2c 00 00
> > 00 0f 05 <48> 3d 00 f0 ff ff 77 6b c3 66 2e 0f 1f 84 00 00 00 00 00 55
> > 489
> > [    1.722837] RSP: 002b:00007fff597e95e8 EFLAGS: 00000246 ORIG_RAX:
> > 000000000000002c
> > [    1.723315] RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 000000000042860d
> > [    1.723721] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000310
> > [    1.724103] RBP: 00007fff597e9880 R08: 0000000000000000 R09: 0000000000000000
> > [    1.724565] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff597e99f8
> > [    1.725010] R13: 00007fff597e9a08 R14: 00000000004b7828 R15: 0000000000000001
> > [    1.725441]  </TASK>
> > [    1.725594] Modules linked in:
> > [    1.725790] ---[ end trace 0000000000000000 ]---
> > [    1.726057] RIP: 0010:skb_release_data+0x8a/0x1c0
> > [    1.726339] Code: c0 00 00 00 49 03 86 c8 00 00 00 0f b6 10 f6 c2
> > 01 74 48 48 8b 70 28 48 85 f6 74 3f 41 0f b6 5d 00 83 e3 10 40 f6 c6
> > 01 75 24 <48> 8b 06 ba 01 00 00 00 4c 89 f7 48 8b 00 ff d0 0f 1f 00 41
> > 8b6
> > [    1.727285] RSP: 0018:ffffc900007cfcc0 EFLAGS: 00010246
> > [    1.727623] RAX: ffff888106fe12c0 RBX: 0000000000000010 RCX: 0000000000000000
> > [    1.728052] RDX: 0000000000000017 RSI: dead0000cafebabe RDI: ffff8881059c4a00
> > [    1.728467] RBP: ffffc900007cfe10 R08: 0000000000000010 R09: 0000000000000000
> > [    1.728908] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
> > [    1.729323] R13: ffff888106fe12c0 R14: ffff8881059c4a00 R15: ffff888106e57000
> > [    1.729744] FS:  0000000038e54380(0000) GS:ffff88813bc00000(0000)
> > knlGS:0000000000000000
> > [    1.730236] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [    1.730597] CR2: 00000000004bf480 CR3: 00000001009ec001 CR4: 0000000000772ef0
> > [    1.730988] PKRU: 55555554
> > ```
> >
>
> OK thanks.
>
> I will repeat my original feedback : I do not see any barriers in the
> patch you sent.
>
> Assuming bond_setup_by_slave() can be called multiple times during one
> master lifetime, I do not think your patch is enough.
>
> Also, please clarify what happens with stacks of two or more bonding devices ?

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ