[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAFu7c2XDYR-SMiuqMxtqGhfM00nw8-h0Tr2uewN8+D1BptsayA@mail.gmail.com>
Date: Wed, 25 Sep 2013 14:29:40 -0700
From: Merlin Davis <me.the.wizard@...il.com>
To: Ben Hutchings <bhutchings@...arflare.com>, netdev@...r.kernel.org
Subject: Re: Kernel Panic Sending Frames Using dev_queue_xmit()
> You are more likely to get help if you include the code in your email
> rather than linking to it.
Understood. Thank you for the protocol tip. Here is the code
(test_send.c) of the module causing the kernel panic when I send SKBs
through dev_queue_xmit(). Below it are my notes (NOTES.txt) on the
behavior, testing, etc.
test_send.c
========
#include <linux/err.h>
#include <linux/if_ether.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <net/net_namespace.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Merlin Davis <me.the.wizard@...il.com>");
static char* iface = "eth0";
module_param(iface, charp, 0);
MODULE_PARM_DESC(iface, "Name of output interface");
static unsigned long n = 0;
module_param(n, ulong, 0);
MODULE_PARM_DESC(
n, "Number of packets to send (zero for continuous)");
static const unsigned int MAX_PENDING = 10u;
static struct net_device* sendDev = NULL;
static struct task_struct* sendThread = NULL;
static unsigned long nSent = 0uL;
static atomic_t nPending;
static wait_queue_head_t sendWq;
// Real ICMP echo response frame captured from eth0
static char FRAME_DATA[] =
{
0x0a, 0x00, 0x27, 0x00, 0x00, 0x00, // dest MAC addr
0x08, 0x00, 0x27, 0x98, 0xa0, 0xea, // src MAC addr
0x08, 0x00, // IPv4
0x45, // v4, header length 20
0x00, // DCSP, ECN
0x00, 0x54, // total IPv4 length 84
0xdc, 0x5a, // ID
0x00, 0x00, // flags, fragment offset
0x40, // TTL 64
0x01, // protocol ICMP
0xac, 0xf2, // IPv4 header checksum
0xc0, 0xa8, 0x38, 0x0a, // src IP addr
0xc0, 0xa8, 0x38, 0x01, // dest IP addr
0x00, // echo reply
0x00, // code 0
0xf8, 0x17, // ICMP header checksum
0x07, 0x80, // echo ID
0x00, 0x0a, // echo sequence number
// echo data
0x94, 0x89, 0x16, 0x52, 0x00, 0x00, 0x00, 0x00,
0x8d, 0xaf, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37
};
static void testSend_exit(void);
static int testSendThreadfn(void* data);
static void testSend_skb_destruct(struct sk_buff* skb);
static int __init testSend_init(void)
{
struct net* netNs = NULL;
int err = 0;
atomic_set(&nPending, 0);
init_waitqueue_head(&sendWq);
if (!iface)
{
printk(KERN_ERR
"test_send: Non-null device name must be "
"specified via iface=<name>\n");
err = -EINVAL;
goto err;
}
rcu_read_lock();
for_each_net_rcu(netNs)
{
sendDev = dev_get_by_name(netNs, iface);
if (sendDev)
{
break;
}
}
netNs = NULL;
rcu_read_unlock();
if (!sendDev)
{
printk(KERN_ERR "Invalid device name '%s'\n", iface);
err = -EINVAL;
goto err;
}
sendThread = kthread_run(testSendThreadfn, 0, "test_send");
if (IS_ERR(sendThread))
{
err = PTR_ERR(sendThread);
sendThread = NULL;
goto err;
}
// keep reference so we can kthread_stop() even after
// thread returns
get_task_struct(sendThread);
return 0;
err:
testSend_exit();
return err;
}
module_init(testSend_init);
static void testSend_exit(void)
{
if (sendThread)
{
kthread_stop(sendThread);
// cleanup extra reference kept for the kthread_stop()
put_task_struct(sendThread);
sendThread = NULL;
}
while (atomic_read(&nPending) > 0)
{
wait_event_interruptible(sendWq,
atomic_read(&nPending) <= 0);
}
if (sendDev)
{
dev_put(sendDev);
sendDev = NULL;
}
}
static void __exit _testSend_exit(void)
{
testSend_exit();
printk(KERN_INFO "test_send: Sent %lu frames\n", nSent);
}
module_exit(_testSend_exit);
static int testSendThreadfn(void* data)
{
int reservedSpace = max((int)LL_RESERVED_SPACE(sendDev),
(int)sizeof(struct ethhdr));
int buffLen = reservedSpace - sizeof(struct ethhdr) +
sizeof(FRAME_DATA) +
sendDev->needed_tailroom;
while (!kthread_should_stop() && (n == 0 || nSent < n))
{
struct sk_buff* skb = NULL;
int err = 0;
while (atomic_read(&nPending) >= MAX_PENDING)
{
wait_event_interruptible(
sendWq, atomic_read(&nPending) < MAX_PENDING);
if (kthread_should_stop())
{
goto out;
}
}
skb = alloc_skb(buffLen, GFP_KERNEL);
if (!skb)
{
printk(KERN_ERR "test_send: Couldn't allocate SKB\n");
schedule();
continue;
}
skb_reserve(skb, reservedSpace);
skb_reset_network_header(skb);
skb_put(skb, sizeof(FRAME_DATA) - sizeof(struct ethhdr));
skb_push(skb, sizeof(struct ethhdr));
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
err = skb_store_bits(skb,
0,
FRAME_DATA,
sizeof(FRAME_DATA));
if (err)
{
printk(KERN_ERR
"test_send: Error %d storing to SKB\n", err);
schedule();
continue;
}
skb->dev = sendDev;
skb->protocol = htons(ETH_P_IP);
skb->destructor = testSend_skb_destruct;
skb_shinfo(skb)->destructor_arg = NULL;
// NOTE: Without this, the destructor is called twice and
// there is an almost immediate kernel panic. With it,
// a kernel panic still occurs but takes a while to
// manifest.
skb_get(skb);
atomic_inc(&nPending);
err = net_xmit_eval(dev_queue_xmit(skb));
if (err)
{
printk(KERN_ERR
"test_send: Error %d sending frame\n", err);
schedule();
continue;
}
++nSent;
//printk(KERN_ERR "test_send: SENT SKB\n");
}
out:
return 0;
}
static void testSend_skb_destruct(struct sk_buff* skb)
{
//printk(KERN_ERR "test_send: DESTRUCTING SKB\n");
atomic_dec(&nPending);
wake_up_all(&sendWq);
}
NOTES.txt
========
* Running on a VirtualBox VM with 64-bit Ubuntu 12.10 and the
3.5.0-40-generic kernel (and matching kernel headers for
module compilation). The VM has been allocated 4 CPU cores
and 2GB of memory (it has no swap partition). The module had
been tested on a previous kernel version (also 3.5.0-x) and
exhibited the same symptoms. The network adapter set in
VirtualBox is the default "Intel PRO/1000 MT Desktop
(82540EM)", and uses the e1000 driver
(drivers/net/ethernet/intel/e1000). However, other adapters
in VirtuaBox's list of choices that use different drivers have
also been tested with the same results.
* I have also tested this code on bare metal with a network card
using Intel's igb driver and on a VM with a compiled 2.6.39
kernel using all default configuration parameters, with the
same results.
* The idea of the module is to resend/replay frames previously
captured on the same network interface. The frames are
byte-for-byte identical to one originally captured. The
captured frame was an ICMP echo reply captured during ping
execution between the host machine and the VM, on the same
interface being tested (eth0 set to static 192.168.56.10/24 on
a VirtualBox host-only network); the packet was manually
inspected to verify it is a valid Ethernet/IPv4/ICMP/echo
reply message with correct checksums. This small module is a
stripped down test case created after running into the problem
in a larger project (one which DOES send packets associated
with a socket).
* The frames are sent without a socket, in a manner similar to
the way they are created and sent for IPv4 ARP. In fact, the
frames are constructed and sent in a way that is almost
identical to the way net/ipv4/arp.c does it. The exception is
that this module uses complete frame data including the
Ethernet header, unlike ARP which adds the hardware header
using dev_hard_header(). This module has been tested by
changing the code to use dev_hard_header() in an attempt to
diagnose the problem, but this does not change the behavior.
* A kernel panic occurs after sending one or more frames using
this module. The number of frames sent before the panic
occurs seems to vary, but it is inevitable that one occurs.
The panic does not always occur in the same place in kernel
code, but usually happens somewhere in __kfree_skb()—perhaps
in skb_release_head_state() as "? skb_release_head_state+..."
is the last line of the kernel stack trace. Sometimes,
though, it happens in the qdisk code for transmission or the
e1000 driver itself instead. It is ALMOST always a "BUG:
unable to handle kernel NULL pointer dereference", but rarely
I have seen a general protection fault instead.
* The first implementation sent SKBs endlessly and without
pause. This caused a panic much quicker. The code was then
changed to limit the number of outstanding SKBs pending in the
transmit code, similar to the way a socket's limited amount of
buffer space would (but done using a simple counter). This
results in the same sort of panic, but after a longer period
of time since the sned rate is naturally lower. Counting the
number of outstanding SKBs also resulted in the discovery of
an SKB reference counting problem.
* Calling dev_queue_xmit() with an SKB is supposed to turn
ownership of the SKB over to the transmit code, according to
the API documentation in net/core/dev.c. This would seem to
imply the reference count should be 1, and that reference is
turned over to the callee. However, when the reference count
is only 1, the transmit code destructs each SKB twice, meaning
somewhere it must be decrementing the reference count and
assuming it is still positive, then later incrementing and
decrementing it again. This causes an immediate kernel panic
in either the code to free the SKB or in a later memory
allocation operation.
* When an SKB's reference count is incremented to 2 just before
calling dev_queue_xmit(), the destructor seems to be called
just once (this can be tested by uncommenting a couple of
printk's in the code and loading the module with parameter
"n=x" to send x frames before terminating the send thread,
then checking the messages using dmesg, grep, wc, etc.). With
this reference count fix the kernel panic still occurs, but
only after the relatively long and varying period of execution
mentioned above (generally after hundreds of thousands of
frames). On the VM I am testing on, the panic usually happens
after somewhere between 15 seconds and 2 minutes.
* My best guess is that SOMETHING is overwriting memory with
null bytes or random data or soemthing. Is there something
wrong with the way the SKBs are being allocated, prepared,
turned over to the kernel for transmission, cleaned up, etc.?
If so, how is it different from the ARP implementation in a
way that causes a panic? Or is this a defect in the kernel
code itself, in which case how can Linux machines stay stable
and running with significant uptimes? Or is it just that this
is a very infrequent race condition of some sort and ARP
transmission is slow/infrequent enough that the mean time
before failure is very, very long?
* I have attempted both kernel debugging and panic message
capture, but despite following all the instructions I can find
online for doing this with a VirtualBox VM, I cannot seem to
get the debugger to resume execution properly when debugging,
the emulated serial console to work, or a crash dump to appear
in the VM after restart. So I seem to be stuck with code
inspection and good ol' printk-style debugging, unfortunately.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists