#include #include #include #include #include #include #include #include #include #include #define _NETHASH_BUFSZ_ 16 #define MODULE_NAME "nethash" #define _DEBUG_IT_ #ifdef _DEBUG_IT_ #define nprintk(x...) printk(KERN_ALERT x); #else /* !_DEBUG_IT_ */ #define nprintk(x) #endif /* _DEBUG_IT_ */ static struct proc_dir_entry *nh_proc_dir; enum nh_type { NH_ENTRY, NH_HEAD }; struct nh_item { /* the list head must be first followed by nh_type */ struct list_head nh_list; enum nh_type nh_type; }; struct nh_entry { /* the list head must be first followed by nh_type */ struct list_head nh_list; enum nh_type nh_type; unsigned long data; struct rcu_head rcu_head; }; struct nh_head { /* the list head must be first followed by nh_type */ struct list_head list; enum nh_type nh_type; spinlock_t lock; }; struct nh { unsigned long nentries; struct nh_head* hash; struct rcu_head rcu_head; }; static struct nh * __nh; static DEFINE_SEQLOCK(nethash_seq); static DEFINE_MUTEX(nethash_resize_mutex); extern void * system_hash_alloc(unsigned long sz); extern void system_hash_free(void * v, unsigned long sz); /* XXX nh_dump() is for for debug only * it must be called under rcu_read_lock() */ static int nh_dump(struct nh * nh, const char * debug_str); static struct nh * get_nh(void); static unsigned long nh_hashval(unsigned long data) { return data; } static void nh_entry_free(struct rcu_head * head) { struct nh_entry * entry = container_of(head, struct nh_entry, rcu_head); nprintk("%s: freeing entry with data %lu\n", __FUNCTION__, entry->data); kfree(entry); } static void nh_free(struct rcu_head * head) { struct nh * nh = container_of(head, struct nh, rcu_head); unsigned long nentries = nh->nentries; unsigned long size = sizeof (struct nh_head) * nentries; nprintk("%s: freeing nh of size %lu\n", __FUNCTION__, size); system_hash_free((void *)nh->hash, size); nprintk("%s: freeing nh\n", __FUNCTION__); kfree(nh); } /* called with head's spin_lock held */ static int __nh_insert(struct nh_entry * entry, struct nh_head * head, unsigned long nentries) { struct list_head * prev; /* insert entry after prev */ struct list_head * list; nprintk("%s: linking entry with data %lu\n", __FUNCTION__, entry->data); /* find the appropriate spot to place entry */ prev = &head->list; if ( nh_hashval(entry->data) & nentries ) { list_for_each_rcu(list, &head->list) { struct nh_entry * tmp; struct nh_item * item = (struct nh_item *)list; if (item->nh_type != NH_ENTRY) continue; tmp = list_entry(list, struct nh_entry, nh_list); prev = &tmp->nh_list; if (nh_hashval(tmp->data) & nentries) { /* put entry after nh */ break; } } } list_add_rcu(&entry->nh_list, prev); return 0; } /* called with head's lock held */ static void __nh_sort_chain(struct nh_head * head, unsigned long nentries) { struct list_head tmp, *list = &head->list; struct nh_entry* entry; INIT_LIST_HEAD(&tmp); list_splice_init(list, &tmp); while ( !list_empty(&tmp) ) { struct list_head * first = tmp.next; list_del(first); entry = (struct nh_entry*)first; __nh_insert(entry, head, nentries); } } static void nh_fixup_grow(struct rcu_head * head) { struct nh * nh; struct nh * oh = container_of(head, struct nh, rcu_head); unsigned long i, oentries = oh->nentries; rcu_read_lock(); nh = get_nh(); /* this is an rcu_callback - only the "new" nh is * visible elsewhere now, so make sure to access * the hashtable using the proper (new) locks,... */ for (i = 0; i < oentries; i++) { struct nh_head * nhead1 = &nh->hash[i]; struct nh_head * nhead2 = &nh->hash[i + oentries]; struct list_head *olist, *tail1, *tail2; /* grab locks in order */ spin_lock(&nhead1->lock); spin_lock(&nhead2->lock); olist = &oh->hash[i].list; list_del(olist); /* don't need rcu variant here */ tail1 = nhead2->list.prev; tail2 = nhead1->list.prev; nhead1->list.prev = tail1; nhead2->list.prev = tail2; tail1->next = &nhead1->list; tail2->next = &nhead2->list; __nh_sort_chain(nhead1, oentries << 1); __nh_sort_chain(nhead2, oentries << 1); spin_unlock(&nhead2->lock); spin_unlock(&nhead1->lock); } rcu_read_unlock(); nh_free(head); } static void nh_fixup_shrink(struct rcu_head * head) { struct nh * nh; struct nh * oh = container_of(head, struct nh, rcu_head); unsigned long i, nentries; rcu_read_lock(); nh = get_nh(); nentries = nh->nentries; /* this is an rcu_callback - only the "new" nh is * visible elsewhere now, so make sure to access * the hashtable using the proper (new) locks,... */ for (i = 0; i < nh->nentries; i++) { struct nh_head * nhead = &nh->hash[i]; struct list_head * olist1 = &oh->hash[i].list; struct list_head * olist2 = &oh->hash[i + nentries].list; spin_lock(&nhead->lock); list_del(olist1); /* don't need RCU variants here */ list_del(olist2); __nh_sort_chain(nhead, nentries); spin_unlock(&nhead->lock); } rcu_read_unlock(); nh_free(head); } /* called under rcu_read_lock */ static struct nh * get_nh(void) { unsigned long seq; struct nh * nh = NULL; do { seq = read_seqbegin(&nethash_seq); nh = __nh; } while (read_seqretry(&nethash_seq, seq)); return nh; } /* use returned value only under rcu_read_lock */ static struct nh * nh_replace(struct nh * new) { struct nh * old = NULL; write_seqlock(&nethash_seq); old = __nh; __nh = new; write_sequnlock(&nethash_seq); return old; } static void nh_destroy(void) { unsigned long i; struct nh * nh; rcu_read_lock(); nh = nh_replace(NULL); if (!nh) goto out_unlock; for (i = 0; i < nh->nentries; i++) { struct nh_head * head = &nh->hash[i]; struct list_head *list; spin_lock(&head->lock); list_for_each_rcu(list, &head->list) { struct nh_item * item = (struct nh_item *)list; struct nh_entry *entry; if (item->nh_type != NH_ENTRY) continue; entry = list_entry(list, struct nh_entry, nh_list); list_del_rcu(&entry->nh_list); nprintk("%s: delinking entry with data %lu\n", __FUNCTION__, entry->data) call_rcu(&entry->rcu_head, nh_entry_free); } spin_unlock(&head->lock); } call_rcu(&nh->rcu_head, nh_free); out_unlock: rcu_read_unlock(); } static void nh_init(struct nh* nh, struct nh_head * hash, unsigned long nentries) { unsigned long i; for (i = 0; i < nentries; i++) { INIT_LIST_HEAD(&hash[i].list); hash[i].nh_type = NH_HEAD; spin_lock_init(&hash[i].lock); } nh->hash = hash; nh->nentries = nentries; } static struct nh * nh_alloc(unsigned long nentries) { struct nh * nh; unsigned long sz; struct nh_head * hash; /* nentries must be a power of 2 */ nentries = roundup_pow_of_two(nentries); sz = nentries * sizeof(struct nh_head); nprintk("%s: creating hash of %lu entries, size %lu bytes\n", __FUNCTION__, nentries, sz); nh = kmalloc(sizeof(struct nh), GFP_KERNEL); if (!nh) { nprintk("%s: failed to allocate hash of size %lu\n", __FUNCTION__, sz); return NULL; } hash = system_hash_alloc(sz); if (!hash) { nprintk("%s: failed to allocate hash of size %lu\n", __FUNCTION__, sz); kfree(nh); return NULL; } nh_init(nh, hash, nentries); return nh; } static int nh_create(unsigned long nentries) { struct nh * nh; nh_destroy(); nh = nh_alloc(nentries); if (!nh) return -ENOMEM; nh_replace(nh); return 0; } static unsigned long nh_bucket(unsigned long hashval, unsigned long nentries) { /* nentries is a power of 2 */ return (hashval & (nentries - 1)); } static int nh_add(unsigned long data) { struct nh_entry * entry; struct nh * nh = NULL; int ret = 0; entry = kmalloc(sizeof(struct nh_entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->data = data; entry->nh_type = NH_ENTRY; rcu_read_lock(); nh = get_nh(); if (nh) { struct nh_head *hash = nh->hash; unsigned long nentries = nh->nentries; unsigned long hashval = nh_hashval(data); unsigned long bucket = nh_bucket(hashval, nentries); struct nh_head * head = &hash[bucket]; spin_lock(&head->lock); ret = __nh_insert(entry, head, nentries); spin_unlock(&head->lock); } rcu_read_unlock(); return ret; } static void nh_migrate_grow(struct nh * old, struct nh * new) { unsigned long i, oentries = old->nentries; /* this is called prior to calling replace_nh(), so * only the "old" nh is visible elsewhere - only * access the hashtable under the appropriate "old" * locks */ for ( i = 0; i < oentries; i++ ) { struct nh_head * ohead = &old->hash[i]; struct list_head * olist = &ohead->list; struct list_head * nlist1 = &new->hash[i].list; struct list_head * nlist2 = &new->hash[i + oentries].list; struct nh_entry *entry; struct list_head * list, * prev = NULL; nprintk("%s: migrating bucket %lu\n", __FUNCTION__, i); spin_lock(&ohead->lock); list_add_rcu(nlist1, olist); /* find where to split chain */ list_for_each_rcu(list, olist) { struct nh_item * item = (struct nh_item *)list; if (item->nh_type != NH_ENTRY) { prev = list; continue; } entry = list_entry(list, struct nh_entry, nh_list); if (nh_hashval(entry->data) & oentries) { break; } prev = list; } BUG_ON(prev == NULL); list_add_rcu(nlist2, prev); spin_unlock(&ohead->lock); } } static void nh_migrate_shrink(struct nh * old, struct nh * new) { unsigned long i, oentries = old->nentries; /* this is called prior to calling replace_nh(), so * only the "old" nh is visible elsewhere - only * access the hashtable under the appropriate "old" * locks */ for ( i = 0; i < oentries >> 1; i++ ) { struct list_head * nlist = &new->hash[i].list; struct nh_head * ohead1 = &old->hash[i]; struct nh_head * ohead2 = &old->hash[i + (oentries>>1)]; struct list_head * olist1, * olist2, tmp; nprintk("%s: migrating bucket %lu\n", __FUNCTION__, i); /* acquire locks in order */ spin_lock(&ohead1->lock); spin_lock(&ohead2->lock); olist1 = &ohead1->list; olist2 = &ohead2->list; INIT_LIST_HEAD(&tmp); list_add_tail_rcu(&tmp, olist2); list_splice_init(&tmp, olist1->prev); list_add_tail_rcu(nlist, olist1); spin_unlock(&ohead2->lock); spin_unlock(&ohead1->lock); } } unsigned long nh_nentries_grow(void) { unsigned long nentries = 0; /* we're called under rcu_read_lock */ struct nh * nh = get_nh(); if (nh) nentries = nh->nentries << 1; return nentries; } unsigned long nh_nentries_shrink(void) { unsigned long nentries = 0; /* we're called under rcu_read_lock */ struct nh * nh = get_nh(); if (nh) nentries = nh->nentries >> 1; return nentries; } enum { NH_GROW_OPS, NH_SHRINK_OPS, NH_MAX_OPS }; /* all of the nh_resize_ops must be called under the * nethash_resize_mutex and rcu_read_lock */ struct nh_resize_ops { unsigned long (*nh_nentries_op) (void); void (*nh_migrate_op) (struct nh *old, struct nh *new); void (*nh_fixup_op) (struct rcu_head *head); } nh_resize_ops [NH_MAX_OPS] = { { nh_nentries_grow, nh_migrate_grow, nh_fixup_grow, }, { nh_nentries_shrink, nh_migrate_shrink, nh_fixup_shrink, }, }; static int nh_resize(long dir) { struct nh *nh, *new_nh; unsigned long new_nentries; int err = 0; struct nh_resize_ops * nh_ops; if (dir > 0) nh_ops = &nh_resize_ops[NH_GROW_OPS]; else if (dir < 0) nh_ops = &nh_resize_ops[NH_SHRINK_OPS]; else return -EINVAL; mutex_lock(&nethash_resize_mutex); rcu_read_lock(); nh = get_nh(); if (!nh) goto out_unlock; new_nentries = nh_ops->nh_nentries_op(); nprintk("%s: resizing nh to %lu buckets\n", __FUNCTION__, new_nentries); new_nh = nh_alloc(new_nentries); if (!new_nh) { err = -ENOMEM; goto out_unlock; } nh_ops->nh_migrate_op(nh, new_nh); nh_dump(nh, "old nh [post migration]"); nh_dump(new_nh, "new nh [newborn]"); nh = nh_replace(new_nh); /* now callers of get_nh() will get the updated * version, but references to the old nh can still * be in use. It must be possible for users to see * all elements of the table using either the old * or new nh */ call_rcu(&nh->rcu_head, nh_ops->nh_fixup_op); rcu_read_unlock(); synchronize_net(); /* by holding the nethash_resize_mutex until after * synchronize_net() we are sure that there are at * most 2 nh structures that we need to be * concerned with */ mutex_unlock(&nethash_resize_mutex); rcu_read_lock(); nh_dump(new_nh, "new nh [done]"); rcu_read_unlock(); return err; out_unlock: rcu_read_unlock(); mutex_unlock(&nethash_resize_mutex); return err; } static int nh_getalong(const char __user *buf, size_t count, long *res) { char kbuf[_NETHASH_BUFSZ_+1]; if (count > _NETHASH_BUFSZ_) return -EINVAL; if (copy_from_user(&kbuf, buf, count)) return -EFAULT; kbuf[_NETHASH_BUFSZ_] = '\0'; if (sscanf(kbuf, "%ld", res) != 1) return -EINVAL; return 0; } ssize_t nh_create_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { long res; int ret; if ((ret = nh_getalong(buf, count, &res)) != 0) return ret; if(res <= 0) return -EINVAL; if ((ret = nh_create((unsigned long)res)) != 0) return ret; return count; } static struct file_operations nh_create_fops = { .write = nh_create_write, }; ssize_t nh_add_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { long res; int ret; if ((ret = nh_getalong(buf, count, &res)) != 0) return ret; if(res <= 0) return -EINVAL; if ((ret = nh_add((unsigned long)res)) != 0) return ret; return count; } static struct file_operations nh_add_fops = { .write = nh_add_write, }; ssize_t nh_resize_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { long res; int ret; if ((ret = nh_getalong(buf, count, &res)) != 0) return ret; if ((ret = nh_resize(res)) != 0) return ret; return count; } static struct file_operations nh_resize_fops = { .write = nh_resize_write, }; static void * d_start (struct seq_file *m, loff_t *pos) { struct nh * nh; rcu_read_lock(); nh = get_nh(); if (nh) { struct nh_head *hash = nh->hash; unsigned long nentries = nh->nentries; if (nentries > *pos) { seq_printf(m, "%lu: ", (unsigned long)*pos); return (&hash[*pos]); } } return NULL; } static void * d_next (struct seq_file *m, void *v, loff_t *pos) { struct nh * nh; (*pos)++; /* rcu_read_lock is still held */ nh = get_nh(); if (nh) { struct nh_head *hash = nh->hash; unsigned long nentries = nh->nentries; if (nentries > *pos) { seq_printf(m, "%lu: ", (unsigned long)*pos); return (&hash[*pos]); } } return NULL; } static void d_stop (struct seq_file *m, void *v) { rcu_read_unlock(); } static int d_show (struct seq_file *m, void *v) { struct nh_head *head = v; struct nh_entry *entry; struct list_head * list; rcu_read_lock(); list_for_each_rcu(list, &head->list) { struct nh_item * item = (struct nh_item *)list; switch (item->nh_type) { case NH_ENTRY: entry = list_entry(list, struct nh_entry, nh_list); seq_printf(m, "%lu", entry->data); seq_printf(m, "%s", " -> "); break; case NH_HEAD: seq_printf(m, "%s", " .. "); break; default: seq_printf(m, "%s", "bad nh_type!\n"); } } rcu_read_unlock(); seq_printf(m, "%s", "\n"); return 0; } struct seq_operations nh_dump_op = { .start = d_start, .next = d_next, .stop = d_stop, .show = d_show, }; static int nh_dump_open(struct inode *inode, struct file *file) { return seq_open(file, &nh_dump_op); } static struct file_operations proc_nh_dump_operations = { .open = nh_dump_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; /* XXX nh_dump() is for for debug only * it must be called under rcu_read_lock() */ static int nh_dump(struct nh * nh, const char * debug_str) { unsigned long i; struct nh_head * head; struct nh_entry *entry; struct list_head * list; nprintk("%s: %s\n", __FUNCTION__, debug_str); for (i = 0; i < nh->nentries; i++ ) { head = &nh->hash[i]; nprintk("[%lu]: (%p)", i, head); list_for_each_rcu(list, &head->list) { struct nh_item * item = (struct nh_item *)list; nprintk(" %p ", list); switch (item->nh_type) { case NH_ENTRY: entry = list_entry(list, struct nh_entry, nh_list); nprintk("%lu -> ", entry->data); break; case NH_HEAD: nprintk(" .. "); break; default: nprintk("bad nh_type!\n"); } } nprintk("\n"); } return 0; } static int nethash_module_init(void) { struct proc_dir_entry *entry; nh_proc_dir = proc_mkdir(MODULE_NAME, NULL); if (!nh_proc_dir) return -ENODEV; /* first the entry to create the hashtable */ entry = create_proc_entry("create", 0644, nh_proc_dir); if (!entry) return -ENODEV; entry->proc_fops = &nh_create_fops; /* and the entry to add an element to the hashtable */ entry = create_proc_entry("add", 0644, nh_proc_dir); if (!entry) return -ENODEV; entry->proc_fops = &nh_add_fops; /* the entry to dump the hashtable */ entry = create_proc_entry("dump", 0444, nh_proc_dir); if (!entry) return -ENODEV; entry->proc_fops = &proc_nh_dump_operations; /* and to resize the hashtable */ entry = create_proc_entry("resize", 0644, nh_proc_dir); if (!entry) return -ENODEV; entry->proc_fops = &nh_resize_fops; return 0; } static void nethash_module_exit(void) { nh_destroy(); rcu_barrier(); /* don't unload until rcu callbacks are done * XXX is this really safe? */ if (nh_proc_dir) { remove_proc_entry("resize", nh_proc_dir); remove_proc_entry("dump", nh_proc_dir); remove_proc_entry("add", nh_proc_dir); remove_proc_entry("create", nh_proc_dir); } remove_proc_entry(MODULE_NAME, NULL); printk("%s removed\n", MODULE_NAME); } module_init(nethash_module_init); module_exit(nethash_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("akepner@sgi.com");