pcap-linux: support new tpacket frame header format From: Patrick McHardy The tpacket_hdr is not clean for 64 bit kernel/32 bit userspace and is not extendable because the struct sockaddr_ll following it is expected at a fixed offset. Linux 2.6.27-rc supports a new tpacket frame header that removes these two limitations. Convert the mmap ring support to support both formats and probe for availability of the new version. --- pcap-int.h | 2 + pcap-linux.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 134 insertions(+), 20 deletions(-) diff --git a/pcap-int.h b/pcap-int.h index 4de319c..97238fd 100644 --- a/pcap-int.h +++ b/pcap-int.h @@ -132,6 +132,8 @@ struct pcap_md { int lo_ifindex; /* interface index of the loopback device */ u_int packets_read; /* count of packets read with recvfrom() */ bpf_u_int32 oldmode; /* mode to restore when turning monitor mode off */ + u_int tp_version; /* version of tpacket_hdr for mmaped ring */ + u_int tp_hdrlen; /* hdrlen of tpacket_hdr for mmaped ring */ #endif /* linux */ #ifdef HAVE_DAG_API diff --git a/pcap-linux.c b/pcap-linux.c index fcc665a..52d9bd8 100644 --- a/pcap-linux.c +++ b/pcap-linux.c @@ -177,6 +177,11 @@ static const char rcsid[] _U_ = * uses many ring related structs and macros */ # ifdef TPACKET_HDRLEN # define HAVE_PACKET_RING +# ifdef TPACKET2_HDRLEN +# define HAVE_TPACKET2 +# else +# define TPACKET_V1 0 +# endif /* TPACKET2_HDRLEN */ # endif /* TPACKET_HDRLEN */ #endif /* PF_PACKET */ @@ -240,11 +245,18 @@ static int pcap_setfilter_linux(pcap_t *, struct bpf_program *); static int pcap_setdirection_linux(pcap_t *, pcap_direction_t); static void pcap_cleanup_linux(pcap_t *); +union thdr { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; +}; + #ifdef HAVE_PACKET_RING -#define RING_GET_FRAME(h) (((struct tpacket_hdr**)h->buffer)[h->offset]) +#define RING_GET_FRAME(h) (((union thdr **)h->buffer)[h->offset]) static void destroy_ring(pcap_t *handle); static int 
create_ring(pcap_t *handle); +static int prepare_tpacket_socket(pcap_t *handle); static void pcap_cleanup_linux_mmap(pcap_t *); static int pcap_read_linux_mmap(pcap_t *, int, pcap_handler , u_char *); static int pcap_setfilter_linux_mmap(pcap_t *, struct bpf_program *); @@ -1897,6 +1909,9 @@ activate_mmap(pcap_t *handle) /* by default request 2M for the ring buffer */ handle->opt.buffer_size = 2*1024*1024; } + ret = prepare_tpacket_socket(handle); + if (ret == 0) + return ret; ret = create_ring(handle); if (ret == 0) return ret; @@ -1918,6 +1933,41 @@ activate_mmap(pcap_t *handle) } #ifdef HAVE_PACKET_RING +static int +prepare_tpacket_socket(pcap_t *handle) +{ + socklen_t len; + int val; + + handle->md.tp_version = TPACKET_V1; + handle->md.tp_hdrlen = sizeof(struct tpacket_hdr); + +#ifdef HAVE_TPACKET2 + /* Probe whether kernel supports TPACKET_V2 */ + val = TPACKET_V2; + len = sizeof(val); + if (getsockopt(handle->fd, SOL_PACKET, PACKET_HDRLEN, &val, &len) < 0) { + if (errno == ENOPROTOOPT) + return 1; + snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, + "can't get TPACKET_V2 header len on socket %d: %d-%s", + handle->fd, errno, pcap_strerror(errno)); + return 0; + } + handle->md.tp_hdrlen = val; + + val = TPACKET_V2; + if (setsockopt(handle->fd, SOL_PACKET, PACKET_VERSION, &val, + sizeof(val)) < 0) { + snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, + "can't activate TPACKET_V2 on socket %d: %d-%s", + handle->fd, errno, pcap_strerror(errno)); + return 0; + } + handle->md.tp_version = TPACKET_V2; +#endif /* HAVE_TPACKET2 */ + return 1; +} static void compute_ring_block(int frame_size, unsigned *block_size, unsigned *frames_per_block) @@ -1944,7 +1994,9 @@ create_ring(pcap_t *handle) * (and a lot of memory will be unused). 
* The snap len should be carefully chosen to achive best * performance */ - req.tp_frame_size = TPACKET_ALIGN(handle->snapshot+TPACKET_HDRLEN); + req.tp_frame_size = TPACKET_ALIGN(handle->snapshot + + TPACKET_ALIGN(handle->md.tp_hdrlen) + + sizeof(struct sockaddr_ll)); req.tp_frame_nr = handle->opt.buffer_size/req.tp_frame_size; compute_ring_block(req.tp_frame_size, &req.tp_block_size, &frames_per_block); req.tp_block_nr = req.tp_frame_nr / frames_per_block; @@ -1983,7 +2035,7 @@ retry: /* allocate a ring for each frame header pointer*/ handle->cc = req.tp_frame_nr; - handle->buffer = malloc(handle->cc * sizeof(struct tpacket_hdr*)); + handle->buffer = malloc(handle->cc * sizeof(union thdr *)); if (!handle->buffer) { destroy_ring(handle); return 0; @@ -1992,9 +2044,9 @@ retry: /* fill the header ring with proper frame ptr*/ handle->offset = 0; for (i=0; i<req.tp_block_nr; ++i) { - u_char *base = &handle->bp[i*req.tp_block_size]; + void *base = &handle->bp[i*req.tp_block_size]; for (j=0; j<frames_per_block; ++j, ++handle->offset) { - RING_GET_FRAME(handle) = (struct tpacket_hdr*) base; + RING_GET_FRAME(handle) = base; base += req.tp_frame_size; } } @@ -2055,6 +2107,29 @@ pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf) return 0; } +static inline union thdr * +pcap_get_ring_frame(pcap_t *handle, int status) +{ + union thdr h; + + h.raw = RING_GET_FRAME(handle); + switch (handle->md.tp_version) { + case TPACKET_V1: + if (status != (h.h1->tp_status ? TP_STATUS_USER : + TP_STATUS_KERNEL)) + return NULL; + break; +#ifdef HAVE_TPACKET2 + case TPACKET_V2: + if (status != (h.h2->tp_status ?
TP_STATUS_USER : + TP_STATUS_KERNEL)) + return NULL; + break; +#endif + } + return h.raw; +} + static int pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user) @@ -2062,7 +2137,8 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, int pkts = 0; /* wait for frames availability.*/ - if ((handle->md.timeout >= 0) && !(RING_GET_FRAME(handle)->tp_status)) { + if ((handle->md.timeout >= 0) && + !pcap_get_ring_frame(handle, TP_STATUS_USER)) { struct pollfd pollinfo; int ret; @@ -2094,16 +2170,41 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, struct sockaddr_ll *sll; struct pcap_pkthdr pcaphdr; unsigned char *bp; - struct tpacket_hdr* thdr = RING_GET_FRAME(handle); - if (thdr->tp_status == TP_STATUS_KERNEL) + union thdr h; + unsigned int tp_len; + unsigned int tp_mac; + unsigned int tp_snaplen; + unsigned int tp_sec; + unsigned int tp_usec; + + h.raw = pcap_get_ring_frame(handle, TP_STATUS_USER); + if (!h.raw) break; + switch (handle->md.tp_version) { + case TPACKET_V1: + tp_len = h.h1->tp_len; + tp_mac = h.h1->tp_mac; + tp_snaplen = h.h1->tp_snaplen; + tp_sec = h.h1->tp_sec; + tp_usec = h.h1->tp_usec; + break; +#ifdef HAVE_TPACKET2 + case TPACKET_V2: + tp_len = h.h2->tp_len; + tp_mac = h.h2->tp_mac; + tp_snaplen = h.h2->tp_snaplen; + tp_sec = h.h2->tp_sec; + tp_usec = h.h2->tp_nsec / 1000; + break; +#endif + } /* perform sanity check on internal offset. 
*/ - if (thdr->tp_mac+thdr->tp_snaplen > handle->bufsize) { + if (tp_mac + tp_snaplen > handle->bufsize) { snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "corrupted frame on kernel ring mac " "offset %d + caplen %d > frame len %d", - thdr->tp_mac, thdr->tp_snaplen, handle->bufsize); + tp_mac, tp_snaplen, handle->bufsize); return -1; } @@ -2116,25 +2217,25 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, * Note: alternatively it could be possible to stop applying * the filter when the ring became empty, but it can possibly * happen a lot later... */ - bp = (unsigned char*)thdr + thdr->tp_mac; + bp = (unsigned char*)h.raw + tp_mac; run_bpf = (!handle->md.use_bpf) || ((handle->md.use_bpf>1) && handle->md.use_bpf--); if (run_bpf && handle->fcode.bf_insns && (bpf_filter(handle->fcode.bf_insns, bp, - thdr->tp_len, thdr->tp_snaplen) == 0)) + tp_len, tp_snaplen) == 0)) goto skip; /* check direction and interface index */ - sll = (void*)thdr + TPACKET_ALIGN(sizeof(*thdr)); + sll = (void *)h.raw + TPACKET_ALIGN(handle->md.tp_hdrlen); if ((sll->sll_ifindex == handle->md.lo_ifindex) && (sll->sll_pkttype == PACKET_OUTGOING)) goto skip; /* get required packet info from ring header */ - pcaphdr.ts.tv_sec = thdr->tp_sec; - pcaphdr.ts.tv_usec = thdr->tp_usec; - pcaphdr.caplen = thdr->tp_snaplen; - pcaphdr.len = thdr->tp_len; + pcaphdr.ts.tv_sec = tp_sec; + pcaphdr.ts.tv_usec = tp_usec; + pcaphdr.caplen = tp_snaplen; + pcaphdr.len = tp_len; /* if required build in place the sll header*/ if (handle->md.cooked) { @@ -2156,7 +2257,9 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, * don't step on the header when we construct * the sll header. 
*/ - if (bp < (u_char *)thdr + TPACKET_HDRLEN) { + if (bp < (u_char *)h.raw + + TPACKET_ALIGN(handle->md.tp_hdrlen) + + sizeof(struct sockaddr_ll)) { snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "cooked-mode frame doesn't have room for sll header"); return -1; @@ -2185,7 +2288,16 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, skip: /* next packet */ - thdr->tp_status = TP_STATUS_KERNEL; + switch (handle->md.tp_version) { + case TPACKET_V1: + h.h1->tp_status = TP_STATUS_KERNEL; + break; +#ifdef HAVE_TPACKET2 + case TPACKET_V2: + h.h2->tp_status = TP_STATUS_KERNEL; + break; +#endif + } if (++handle->offset >= handle->cc) handle->offset = 0; @@ -2219,7 +2331,7 @@ pcap_setfilter_linux_mmap(pcap_t *handle, struct bpf_program *filter) for (n=0; n < handle->cc; ++n) { if (--handle->offset < 0) handle->offset = handle->cc - 1; - if (RING_GET_FRAME(handle)->tp_status != TP_STATUS_KERNEL) + if (!pcap_get_ring_frame(handle, TP_STATUS_KERNEL)) break; }