Message-ID: <20080818233243.GC7785@verge.net.au>
Date: Tue, 19 Aug 2008 09:32:45 +1000
From: Simon Horman <horms@...ge.net.au>
To: Sven Wegener <sven.wegener@...aler.net>
Cc: netdev@...r.kernel.org, lvs-devel@...r.kernel.org,
wensong@...ux-vs.org, ja@....bg
Subject: Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> We can't access the cache entry outside of our critical read-locked region,
> because someone may free that entry. Also getting an entry under read lock,
> then locking for write and trying to delete that entry looks fishy, but should
> be no problem here, because we're only comparing a pointer. Also there is no
> need for our own rwlock, there is already one in the service structure for use
> in the schedulers.
Hi Sven,
This looks good to me. Just a few minor comments inline.
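
For the archives, a minimal sketch of the race being fixed (simplified,
not the code as it stands):

        /* Before: ip_vs_lblcr_get() takes and drops tbl->lock
         * internally, so 'en' is already unprotected by the time
         * it is returned to the caller. */
        en = ip_vs_lblcr_get(tbl, iph->daddr);
        /* ... the expiry timer can free 'en' right here ... */
        dest = ip_vs_dest_set_min(&en->set);    /* use after free */

        /* After: the caller holds the service's sched_lock across
         * both the lookup and every use of 'en'; the table's
         * private rwlock is gone. */
        read_lock(&svc->sched_lock);
        en = ip_vs_lblcr_get(tbl, iph->daddr);
        if (en) {
                en->lastuse = jiffies;
                dest = ip_vs_dest_set_min(&en->set);
                /* ... */
        }
        read_unlock(&svc->sched_lock);
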
> Signed-off-by: Sven Wegener <sven.wegener@...aler.net>
> ---
> net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
> 1 files changed, 114 insertions(+), 115 deletions(-)
>
> diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> index f1c8450..96bfdc2 100644
> --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> return NULL;
> }
>
> - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> + e = kmalloc(sizeof(*e), GFP_ATOMIC);
I think I prefer using struct ip_vs_dest_list rather than *e.
Ditto for *tbl below.
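
That is, keeping the explicit type:

        e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);

rather than:

        e = kmalloc(sizeof(*e), GFP_ATOMIC);

Both allocate the same size, so this is purely a readability
preference.
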
> if (e == NULL) {
> IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
> return NULL;
> @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> e->dest = dest;
>
> /* link it to the list */
> - write_lock(&set->lock);
> e->next = set->list;
> set->list = e;
> atomic_inc(&set->size);
> - write_unlock(&set->lock);
>
> set->lastmod = jiffies;
> return e;
> @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> {
> struct ip_vs_dest_list *e, **ep;
>
> - write_lock(&set->lock);
> for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
> if (e->dest == dest) {
> /* HIT */
> @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> }
> ep = &e->next;
> }
> - write_unlock(&set->lock);
> }
>
> static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
> @@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> if (set == NULL)
> return NULL;
>
> - read_lock(&set->lock);
> /* select the first destination server, whose weight > 0 */
> for (e=set->list; e!=NULL; e=e->next) {
> least = e->dest;
> @@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> goto nextstage;
> }
> }
> - read_unlock(&set->lock);
> return NULL;
>
> /* find the destination with the weighted least load */
> @@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> loh = doh;
> }
> }
> - read_unlock(&set->lock);
>
> IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
> "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> if (set == NULL)
> return NULL;
>
> - read_lock(&set->lock);
> /* select the first destination server, whose weight > 0 */
> for (e=set->list; e!=NULL; e=e->next) {
> most = e->dest;
> @@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> goto nextstage;
> }
> }
> - read_unlock(&set->lock);
> return NULL;
>
> /* find the destination with the weighted most load */
> @@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> moh = doh;
> }
> }
> - read_unlock(&set->lock);
>
> IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
> "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
> * IPVS lblcr hash table
> */
> struct ip_vs_lblcr_table {
> - rwlock_t lock; /* lock for this table */
> struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
> atomic_t entries; /* number of entries */
> int max_size; /* maximum size of entries */
> @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
>
> static struct ctl_table_header * sysctl_header;
>
> -/*
> - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
> - * IP address to a server.
> - */
> -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
> -{
> - struct ip_vs_lblcr_entry *en;
> -
> - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
> - if (en == NULL) {
> - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> - return NULL;
> - }
> -
> - INIT_LIST_HEAD(&en->list);
> - en->addr = daddr;
> -
> - /* initilize its dest set */
> - atomic_set(&(en->set.size), 0);
> - en->set.list = NULL;
> - rwlock_init(&en->set.lock);
> -
> - return en;
> -}
> -
> -
> static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
> {
> list_del(&en->list);
> @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
> * Hash an entry in the ip_vs_lblcr_table.
> * returns bool success.
> */
> -static int
> +static void
> ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
> {
> - unsigned hash;
> -
> - if (!list_empty(&en->list)) {
> - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
> - "called from %p\n", __builtin_return_address(0));
> - return 0;
> - }
> + unsigned hash = ip_vs_lblcr_hashkey(en->addr);
>
> - /*
> - * Hash by destination IP address
> - */
> - hash = ip_vs_lblcr_hashkey(en->addr);
> -
> - write_lock(&tbl->lock);
> list_add(&en->list, &tbl->bucket[hash]);
> atomic_inc(&tbl->entries);
> - write_unlock(&tbl->lock);
> -
> - return 1;
> }
>
>
> /*
> - * Get ip_vs_lblcr_entry associated with supplied parameters.
> + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
> + * read lock.
> */
> static inline struct ip_vs_lblcr_entry *
> ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
> {
> - unsigned hash;
> + unsigned hash = ip_vs_lblcr_hashkey(addr);
> struct ip_vs_lblcr_entry *en;
>
> - hash = ip_vs_lblcr_hashkey(addr);
> + list_for_each_entry(en, &tbl->bucket[hash], list)
> + if (en->addr == addr)
> + return en;
> +
> + return NULL;
> +}
>
> - read_lock(&tbl->lock);
>
> - list_for_each_entry(en, &tbl->bucket[hash], list) {
> - if (en->addr == addr) {
> - /* HIT */
> - read_unlock(&tbl->lock);
> - return en;
> +/*
> + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
> + * IP address to a server. Called under write lock.
> + */
> +static inline struct ip_vs_lblcr_entry *
> +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
> + struct ip_vs_dest *dest)
> +{
> + struct ip_vs_lblcr_entry *en;
> +
> + en = ip_vs_lblcr_get(tbl, daddr);
> + if (!en) {
> + en = kmalloc(sizeof(*en), GFP_ATOMIC);
> + if (!en) {
> + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> + return NULL;
> }
> +
> + en->addr = daddr;
> + en->lastuse = jiffies;
> +
> + /* initialize its dest set */
> + atomic_set(&(en->set.size), 0);
> + en->set.list = NULL;
> + rwlock_init(&en->set.lock);
> +
> + ip_vs_lblcr_hash(tbl, en);
> }
>
> - read_unlock(&tbl->lock);
> + write_lock(&en->set.lock);
> + ip_vs_dest_set_insert(&en->set, dest);
> + write_unlock(&en->set.lock);
>
> - return NULL;
> + return en;
> }
>
>
> @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
> int i;
> struct ip_vs_lblcr_entry *en, *nxt;
>
> + /* No locking required, only called during cleanup. */
> for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> - write_lock(&tbl->lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
> ip_vs_lblcr_free(en);
> - atomic_dec(&tbl->entries);
> }
> - write_unlock(&tbl->lock);
> }
> }
>
>
> -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
> {
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> unsigned long now = jiffies;
> int i, j;
> struct ip_vs_lblcr_entry *en, *nxt;
> @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>
> - write_lock(&tbl->lock);
> + write_lock(&svc->sched_lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
> if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
> now))
> @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> ip_vs_lblcr_free(en);
> atomic_dec(&tbl->entries);
> }
> - write_unlock(&tbl->lock);
> + write_unlock(&svc->sched_lock);
> }
> tbl->rover = j;
> }
> @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> */
> static void ip_vs_lblcr_check_expire(unsigned long data)
> {
> - struct ip_vs_lblcr_table *tbl;
> + struct ip_vs_service *svc = (struct ip_vs_service *) data;
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> unsigned long now = jiffies;
> int goal;
> int i, j;
> struct ip_vs_lblcr_entry *en, *nxt;
>
> - tbl = (struct ip_vs_lblcr_table *)data;
> -
> if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
> /* do full expiration check */
> - ip_vs_lblcr_full_check(tbl);
> + ip_vs_lblcr_full_check(svc);
> tbl->counter = 1;
> goto out;
> }
> @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
> for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>
> - write_lock(&tbl->lock);
> + write_lock(&svc->sched_lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
> if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
> continue;
> @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
> atomic_dec(&tbl->entries);
> goal--;
> }
> - write_unlock(&tbl->lock);
> + write_unlock(&svc->sched_lock);
> if (goal <= 0)
> break;
> }
> @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> /*
> * Allocate the ip_vs_lblcr_table for this service
> */
> - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
> + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
> if (tbl == NULL) {
> IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
> return -ENOMEM;
> }
> svc->sched_data = tbl;
> IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
> - "current service\n",
> - sizeof(struct ip_vs_lblcr_table));
> + "current service\n", sizeof(*tbl));
>
> /*
> * Initialize the hash buckets
> @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> INIT_LIST_HEAD(&tbl->bucket[i]);
> }
> - rwlock_init(&tbl->lock);
> tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
> tbl->rover = 0;
> tbl->counter = 1;
> @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> * Hook periodic timer for garbage collection
> */
> setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
> - (unsigned long)tbl);
> - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
> - add_timer(&tbl->periodic_timer);
> + (unsigned long)svc);
> + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
>
> return 0;
> }
> @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
> ip_vs_lblcr_flush(tbl);
>
> /* release the table itself */
> - kfree(svc->sched_data);
> + kfree(tbl);
> IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
> - sizeof(struct ip_vs_lblcr_table));
> + sizeof(*tbl));
>
> return 0;
> }
> @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
> static struct ip_vs_dest *
> ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
> {
> - struct ip_vs_dest *dest;
> - struct ip_vs_lblcr_table *tbl;
> - struct ip_vs_lblcr_entry *en;
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> struct iphdr *iph = ip_hdr(skb);
> + struct ip_vs_dest *dest = NULL;
> + struct ip_vs_lblcr_entry *en;
>
> IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
>
> - tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
> + /* First look in our cache */
> + read_lock(&svc->sched_lock);
> en = ip_vs_lblcr_get(tbl, iph->daddr);
> - if (en == NULL) {
> - dest = __ip_vs_lblcr_schedule(svc, iph);
> - if (dest == NULL) {
> - IP_VS_DBG(1, "no destination available\n");
> - return NULL;
> - }
> - en = ip_vs_lblcr_new(iph->daddr);
> - if (en == NULL) {
> - return NULL;
> - }
> - ip_vs_dest_set_insert(&en->set, dest);
> - ip_vs_lblcr_hash(tbl, en);
> - } else {
> + if (en) {
> + /* We only hold a read lock, but this is atomic */
> + en->lastuse = jiffies;
> +
> + /* Get the least loaded destination */
> + read_lock(&en->set.lock);
> dest = ip_vs_dest_set_min(&en->set);
> - if (!dest || is_overloaded(dest, svc)) {
> - dest = __ip_vs_lblcr_schedule(svc, iph);
> - if (dest == NULL) {
> - IP_VS_DBG(1, "no destination available\n");
> - return NULL;
> - }
> - ip_vs_dest_set_insert(&en->set, dest);
> - }
> + read_unlock(&en->set.lock);
> +
> + /* More than one destination + enough time passed by, cleanup */
> if (atomic_read(&en->set.size) > 1 &&
> - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
> + time_after(jiffies, en->set.lastmod +
> + sysctl_ip_vs_lblcr_expiration)) {
> struct ip_vs_dest *m;
> +
> + write_lock(&en->set.lock);
> m = ip_vs_dest_set_max(&en->set);
> if (m)
> ip_vs_dest_set_erase(&en->set, m);
> + write_unlock(&en->set.lock);
> + }
> +
> + /* If the destination is not overloaded, use it */
> + if (dest && !is_overloaded(dest, svc)) {
> + read_unlock(&svc->sched_lock);
> + goto out;
> + }
> +
> + /* The cache entry is invalid, time to schedule */
> + dest = __ip_vs_lblcr_schedule(svc, iph);
> + if (!dest) {
> + IP_VS_DBG(1, "no destination available\n");
> + read_unlock(&svc->sched_lock);
> + return NULL;
> }
> +
> + /* Update our cache entry */
> + write_lock(&en->set.lock);
> + ip_vs_dest_set_insert(&en->set, dest);
> + write_unlock(&en->set.lock);
> + }
> + read_unlock(&svc->sched_lock);
> +
> + if (dest)
> + goto out;
> +
> + /* No cache entry, time to schedule */
> + dest = __ip_vs_lblcr_schedule(svc, iph);
> + if (!dest) {
> + IP_VS_DBG(1, "no destination available\n");
> + return NULL;
> }
> - en->lastuse = jiffies;
>
> + /* If we fail to create a cache entry, we'll just use the valid dest */
> + write_lock(&svc->sched_lock);
> + ip_vs_lblcr_new(tbl, iph->daddr, dest);
> + write_unlock(&svc->sched_lock);
> +
> +out:
> IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
> "--> server %u.%u.%u.%u:%d\n",
> - NIPQUAD(en->addr),
> + NIPQUAD(iph->addr),
Minor problem: this should be iph->daddr.
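
i.e.

        NIPQUAD(iph->daddr),

struct iphdr has no 'addr' member, so this hunk won't compile as
posted.
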
> NIPQUAD(dest->addr),
> ntohs(dest->port));
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html