NOTE(review): the header names on the context "#include" lines of this patch were lost before review; restore them from the target tree before applying.
--- tree_a/include/uapi/linux/if_bonding.h	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/include/uapi/linux/if_bonding.h	2015-05-18 14:49:16.916558030 -0400
@@ -70,6 +70,7 @@
 #define BOND_MODE_8023AD        4
 #define BOND_MODE_TLB           5
 #define BOND_MODE_ALB		6 /* TLB + RLB (receive load balancing) */
+#define BOND_MODE_BATMAN	7 /* TCP balancing weighted by per-slave congestion counters */
 
 /* each slave's link has 4 states */
 #define BOND_LINK_UP    0           /* link is up and running */
--- tree_a/include/net/bonding.h	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/include/net/bonding.h	2015-05-18 16:54:46.452900354 -0400
@@ -15,6 +15,7 @@
 #ifndef _NET_BONDING_H
 #define _NET_BONDING_H
 
+#include <linux/sched.h>	/* struct task_struct */
 #include
 #include
 #include
@@ -241,6 +242,7 @@ struct bonding {
 	struct	 dentry *debug_dir;
 #endif /* CONFIG_DEBUG_FS */
 	struct rtnl_link_stats64 bond_stats;
+	struct task_struct *battimer_task;	/* counter decay kthread; only started in batman mode */
 };
 
 #define bond_slave_get_rcu(dev) \
@@ -650,6 +652,26 @@ static inline int bond_get_targets_ip(__
 	return -1;
 }
 
+/*
+ * One TCP segment recorded by the batman transmit path.  The first entry of
+ * each bucket is embedded in the table itself; a jiffiestamp of 0 marks an
+ * entry as unused.
+ */
+struct batnode {
+	struct batnode *next_batnode;	/* next entry of the same bucket */
+	char slvname[IFNAMSIZ];		/* slave the segment was sent on */
+	unsigned long jiffiestamp;	/* send time in jiffies, 0 = unused */
+	u32 tcp_seq_num;		/* TCP sequence number of the segment */
+	u16 port_id;			/* max(source port, dest port) */
+};
+
+/* Congestion counter of one slave, looked up by interface name. */
+struct slave_congestion_node {
+	struct slave_congestion_node *next_slave;
+	char this_slave[IFNAMSIZ];	/* interface name of the slave */
+	u32 congestion_counter;		/* transmit weight of the slave */
+};
+
 /* exported from bond_main.c */
 extern int bond_net_id;
 extern const struct bond_parm_tbl bond_lacp_tbl[];
--- tree_a/drivers/net/bonding/bond_main.c	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/drivers/net/bonding/bond_main.c	2015-05-18 20:17:46.914627467 -0400
@@ -71,6 +71,8 @@
 #include
 #include
 #include
+#include <linux/kthread.h>
+#include <linux/delay.h>
 #include
 #include
 #include
@@ -135,7 +137,7 @@ module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
 		       "1 for active-backup, 2 for balance-xor, "
 		       "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
-		       "6 for balance-alb");
+		       "6 for balance-alb, 7 for batman");
 module_param(primary, charp, 0);
 MODULE_PARM_DESC(primary, "Primary network device to use");
 module_param(primary_reselect, charp, 0);
@@ -205,6 +207,80 @@ static int bond_mode = BOND_MODE_ROUNDRO
 static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
 static int lacp_fast;
 
+/*
+ * Held by the RX hook, the TX path and the decay thread while they touch
+ * battable[] or the slave congestion list.
+ */
+static DEFINE_SPINLOCK(batlock);
+
+/* Number of buckets in battable[]; an entry lives in bucket port_id % BATTABLE_SIZE. */
+#define BATTABLE_SIZE 256
+
+/*
+ * TCP segments recorded at transmit time.  The first entry of each bucket is
+ * embedded in the array (jiffiestamp == 0 means unused); further entries are
+ * kmalloc()ed and chained through next_batnode.
+ */
+static struct batnode battable[BATTABLE_SIZE];
+
+/* Head of the singly linked list holding one congestion counter per slave. */
+static struct slave_congestion_node *slavelist_head_ptr;
+
+/**
+ * get_slave_congestion_node - find or create the counter node of a slave
+ * @slvname: interface name identifying the slave
+ *
+ * Searches the list for a node whose this_slave equals @slvname.  If there is
+ * none, a node with congestion_counter == 1 is allocated with GFP_ATOMIC and
+ * appended to the list.
+ *
+ * Return: the node, or NULL if a new node was needed and the allocation
+ * failed.
+ */
+static struct slave_congestion_node *get_slave_congestion_node(const char *slvname)
+{
+	struct slave_congestion_node **link = &slavelist_head_ptr;
+	struct slave_congestion_node *node;
+
+	for (node = *link; node; node = *link) {
+		if (strcmp(node->this_slave, slvname) == 0)
+			return node;
+		link = &node->next_slave;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	node->next_slave = NULL;
+	strlcpy(node->this_slave, slvname, IFNAMSIZ);
+	node->congestion_counter = 1;
+	*link = node;	/* link in only once the node is fully initialised */
+
+	return node;
+}
+
+/* Raise the counter of slave @the_slave; does nothing if its node cannot be allocated. */
+static void slave_congestion_increment(const char *the_slave)
+{
+	struct slave_congestion_node *node = get_slave_congestion_node(the_slave);
+
+	if (node)
+		node->congestion_counter++;
+}
+
+/*
+ * Lower the counter of slave @the_slave, but never below 1: the counter is
+ * unsigned, and the other decrement sites in this driver keep it >= 1 too.
+ */
+static void __maybe_unused slave_congestion_decrement(const char *the_slave)
+{
+	struct slave_congestion_node *node = get_slave_congestion_node(the_slave);
+
+	if (node && node->congestion_counter > 1)
+		node->congestion_counter--;
+}
+
 /*-------------------------- Forward declarations ---------------------------*/
 
 static int bond_init(struct net_device *bond_dev);
@@ -225,9 
+229,10 @@ const char *bond_mode_name(int mode)
 		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
 		[BOND_MODE_TLB] = "transmit load balancing",
 		[BOND_MODE_ALB] = "adaptive load balancing",
+		[BOND_MODE_BATMAN] = "load balancing (Batman)",
 	};
 
-	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
+	if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_BATMAN)
 		return "unknown";
 
 	return names[mode];
@@ -856,7 +861,8 @@ void bond_change_active_slave(struct bon
 	 */
 	if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
 	    ((bond_uses_primary(bond) && new_active) ||
-	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
+	     BOND_MODE(bond) == BOND_MODE_ROUNDROBIN ||
+	     BOND_MODE(bond) == BOND_MODE_BATMAN)) {
 		bond->igmp_retrans = bond->params.resend_igmp;
 		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
 	}
@@ -1082,6 +1088,191 @@ static bool bond_should_deliver_exact_ma
 	return false;
 }
 
+/**
+ * batman_normalize_congestion - age the per-slave congestion counters
+ * @bond: bond whose slaves are visited
+ * @congestion: caller's running sum of the counters, kept in step with them
+ *
+ * If the jiffies value sampled on entry is a multiple of 1000 (and differs
+ * from the one recorded by the previous run), every counter above 1 is
+ * lowered by one.  Otherwise, once more than 60 seconds have passed since the
+ * previous run, all counters are reset to 1 and *@congestion is set to the
+ * number of counters reset.  The callers in this file hold batlock.
+ */
+static void batman_normalize_congestion(struct bonding *bond, int *congestion)
+{
+	static unsigned long jiffiestamp;	/* jiffies value of the previous run */
+	unsigned long now = jiffies;
+	struct slave_congestion_node *node;
+	struct slave *slave;
+	struct list_head *iter;
+
+	if (now != jiffiestamp && now % 1000 == 0) {
+		printk(KERN_DEBUG "batman: congestion normalization has occurred.\n");
+		bond_for_each_slave_rcu(bond, slave, iter) {
+			node = get_slave_congestion_node(slave->dev->name);
+			/* defensive: skip a slave whose node is unavailable */
+			if (node && node->congestion_counter > 1) {
+				(*congestion)--;
+				node->congestion_counter--;
+			}
+		}
+		jiffiestamp = now;
+	} else if ((now - jiffiestamp) / HZ > 60) {
+		printk(KERN_DEBUG "batman: failsafe reset invoked: %lu, %lu, %lu\n",
+		       now, jiffiestamp, (now - jiffiestamp) / HZ);
+		*congestion = 0;
+		bond_for_each_slave_rcu(bond, slave, iter) {
+			node = get_slave_congestion_node(slave->dev->name);
+			if (!node)
+				continue;
+			node->congestion_counter = 1;
+			(*congestion)++;
+		}
+		jiffiestamp = now;
+	}
+}
+
+/**
+ * batman_timer_invoke - kthread body that periodically halves the counters
+ * @bond_: the struct bonding served by this thread
+ *
+ * About every 10 seconds, each slave counter of at least 2 is halved.
+ *
+ * Return: 0 once kthread_should_stop() reports true.
+ */
+static int batman_timer_invoke(void *bond_)
+{
+	struct bonding *bond = bond_;
+	struct slave_congestion_node *node;
+	struct slave *slave;
+	struct list_head *iter;
+	unsigned long batflags;
+	unsigned long sleep_start;
+
+	while (!kthread_should_stop()) {
+		sleep_start = jiffies;
+
+		/*
+		 * Sleep interruptibly so that kthread_stop() can wake the
+		 * thread instead of waiting out the whole period.
+		 */
+		schedule_timeout_interruptible(msecs_to_jiffies(10 * 1000));
+		if (kthread_should_stop())
+			break;
+
+		/* woken early: skip this round */
+		if (jiffies - sleep_start < 5 * HZ)
+			continue;
+
+		rcu_read_lock();
+		spin_lock_irqsave(&batlock, batflags);
+		bond_for_each_slave_rcu(bond, slave, iter) {
+			node = get_slave_congestion_node(slave->dev->name);
+			if (node && node->congestion_counter >= 2)
+				node->congestion_counter /= 2;
+		}
+		spin_unlock_irqrestore(&batlock, batflags);
+		rcu_read_unlock();
+	}
+
+	return 0;
+}
+
+/* True when TCP sequence number @a lies before @b, allowing for 32-bit wrap. */
+static inline bool batman_seq_before(u32 a, u32 b)
+{
+	return (s32)(a - b) < 0;
+}
+
+/**
+ * batman_rx_prune - drop table entries made obsolete by a received frame
+ * @bond: bond the frame arrived on
+ * @skb: the received frame
+ *
+ * For an IPv4 TCP frame, walks the battable[] bucket selected by
+ * max(source port, dest port) and removes every entry that is older than 60
+ * seconds, or that has the same port_id and a sequence number before the
+ * frame's acknowledgement number.  Then runs the counter normalisation.
+ * Frames of any other kind are ignored.
+ */
+static void batman_rx_prune(struct bonding *bond, struct sk_buff *skb)
+{
+	struct batnode *prev = NULL;
+	struct batnode *cur;
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	unsigned long batflags;
+	unsigned int ihl;
+	int ignored = 0;
+	u16 port_id;
+	u32 ack_seq;
+	bool has_ack;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return;
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+	if (iph->protocol != IPPROTO_TCP || ihl < sizeof(*iph))
+		return;
+	/* only the first fragment carries the TCP header */
+	if (iph->frag_off & htons(IP_OFFSET))
+		return;
+	if (!pskb_may_pull(skb, ihl + sizeof(*th)))
+		return;
+
+	/*
+	 * Find the TCP header from the IP header length: the transport
+	 * header offset is not guaranteed to be set this early on the
+	 * receive path, so tcp_hdr() cannot be relied on here.
+	 */
+	iph = ip_hdr(skb);	/* pskb_may_pull() may have moved the data */
+	th = (const struct tcphdr *)((const u8 *)iph + ihl);
+	port_id = max(ntohs(th->source), ntohs(th->dest));
+	ack_seq = ntohl(th->ack_seq);
+	has_ack = th->ack;	/* ack_seq is meaningless without the ACK flag */
+
+	spin_lock_irqsave(&batlock, batflags);
+
+	cur = &battable[port_id % BATTABLE_SIZE];
+	while (cur && cur->jiffiestamp) {
+		bool acked = has_ack && cur->port_id == port_id &&
+			     batman_seq_before(cur->tcp_seq_num, ack_seq);
+		bool stale = (jiffies - cur->jiffiestamp) / HZ > 60;
+
+		if (!acked && !stale) {
+			prev = cur;
+			cur = cur->next_batnode;
+		} else if (prev) {
+			/* chained entry: unlink and free it */
+			prev->next_batnode = cur->next_batnode;
+			kfree(cur);
+			cur = prev->next_batnode;
+		} else if (cur->next_batnode) {
+			/*
+			 * The bucket head is embedded in battable[]: copy
+			 * its successor over it, free the successor and
+			 * examine the head again.
+			 */
+			struct batnode *next = cur->next_batnode;
+
+			*cur = *next;
+			kfree(next);
+		} else {
+			/* lone bucket head: just mark it unused */
+			cur->jiffiestamp = 0;
+			break;
+		}
+	}
+
+	batman_normalize_congestion(bond, &ignored);
+
+	spin_unlock_irqrestore(&batlock, batflags);
+}
+
 static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
@@ -1100,7 +1291,11 @@ static rx_handler_result_t bond_handle_f
 	slave = bond_slave_get_rcu(skb->dev);
 	bond = slave->bond;
 
+	/* batman mode only: drop table entries this frame acknowledges */
+	if (BOND_MODE(bond) == BOND_MODE_BATMAN)
+		batman_rx_prune(bond, skb);
+
 	recv_probe = ACCESS_ONCE(bond->recv_probe);
 	if (recv_probe) {
 		ret = recv_probe(skb, bond, slave);
 		if (ret == RX_HANDLER_CONSUMED) {
@@ -3485,7 +3680,6 @@ static int bond_set_mac_address(struct n
 	if (BOND_MODE(bond) == BOND_MODE_ALB)
 		return bond_alb_set_mac_address(bond_dev, addr);
 
-
 	netdev_dbg(bond_dev, "bond=%p\n", bond);
 
 	/* If fail_over_mac is enabled, do nothing and return success.
@@ -3538,6 +3666,236 @@ unwind:
 	return res;
 }
 
+static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int slave_id);
+
+/*
+ * Return the slave holding the largest congestion counter (the first one in
+ * list order on a tie), never @exclude, or NULL when no slave qualifies.
+ * If @total is non-NULL it receives the sum of all visited counters.
+ */
+static struct slave *batman_max_counter_slave(struct bonding *bond,
+					      struct slave *exclude, int *total)
+{
+	struct slave_congestion_node *node;
+	struct slave *slave, *best = NULL;
+	struct list_head *iter;
+	u32 best_count = 0;
+	int sum = 0;
+
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		node = get_slave_congestion_node(slave->dev->name);
+		if (!node)
+			continue;
+		sum += node->congestion_counter;
+		if (slave == exclude)
+			continue;
+		if (!best || node->congestion_counter > best_count) {
+			best = slave;
+			best_count = node->congestion_counter;
+		}
+	}
+
+	if (total)
+		*total = sum;
+	return best;
+}
+
+/*
+ * Choose a slave weighted by its congestion counter: jiffies % @total picks a
+ * position in the cumulative counter range and the slave whose share covers
+ * it is returned.  Falls back to the last slave visited; NULL only when the
+ * bond has no slaves.
+ */
+static struct slave *batman_weighted_slave(struct bonding *bond, int total)
+{
+	struct slave_congestion_node *node;
+	struct slave *slave, *last = NULL;
+	struct list_head *iter;
+	u32 target = total > 0 ? jiffies % total : 0;	/* no division by zero */
+	u32 walked = 0;
+
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		node = get_slave_congestion_node(slave->dev->name);
+		if (node)
+			walked += node->congestion_counter;
+		if (walked > target)
+			return slave;
+		last = slave;
+	}
+
+	return last;
+}
+
+/**
+ * batman_record_segment - note an outgoing TCP segment in battable[]
+ * @bond: bond transmitting the segment
+ * @slave: slave chosen for the segment so far
+ * @port_id: max(source port, dest port) of the segment
+ * @seqnum: TCP sequence number of the segment
+ *
+ * Walks the bucket of @port_id.  An entry with the same port_id and sequence
+ * number means the segment is a retransmission: if it first went out on a
+ * different slave, that slave's counter is lowered (not below 1) and
+ * @slave's is raised; if it went out on @slave itself, the other slave with
+ * the largest counter is used instead when there is one.  Matched entries and
+ * entries older than 60 seconds are removed along the way, and the segment is
+ * finally stored in the unused entry the walk ends on.
+ *
+ * Return: the slave the segment should be sent on.
+ */
+static struct slave *batman_record_segment(struct bonding *bond,
+					   struct slave *slave,
+					   u16 port_id, u32 seqnum)
+{
+	struct batnode *cur = &battable[port_id % BATTABLE_SIZE];
+	struct batnode *prev = NULL;
+	unsigned long now;
+
+	while (cur->jiffiestamp) {
+		if (cur->port_id == port_id && cur->tcp_seq_num == seqnum) {
+			if (strcmp(cur->slvname, slave->dev->name) != 0) {
+				struct slave_congestion_node *old_node =
+					get_slave_congestion_node(cur->slvname);
+
+				if (old_node && old_node->congestion_counter > 1)
+					old_node->congestion_counter--;
+				slave_congestion_increment(slave->dev->name);
+			} else {
+				/* never retransmit on the same slave if another exists */
+				struct slave *other =
+					batman_max_counter_slave(bond, slave, NULL);
+
+				if (other)
+					slave = other;
+			}
+			cur->port_id = 0;	/* mark for removal below */
+		}
+
+		if (!cur->port_id || (jiffies - cur->jiffiestamp) / HZ > 60) {
+			if (!prev) {
+				struct batnode *next = cur->next_batnode;
+
+				if (!next) {
+					/* lone bucket head: reuse it for this segment */
+					cur->jiffiestamp = 0;
+					break;
+				}
+				/*
+				 * The head is embedded in battable[]: copy the
+				 * successor over it, free the successor and
+				 * examine the head again.
+				 */
+				*cur = *next;
+				kfree(next);
+				continue;
+			}
+			prev->next_batnode = cur->next_batnode;
+			kfree(cur);
+			cur = prev;
+		}
+
+		if (!cur->next_batnode) {
+			/* end of chain: append the unused entry to be filled below */
+			struct batnode *tail = kmalloc(sizeof(*tail), GFP_ATOMIC);
+
+			if (!tail)
+				return slave;	/* cannot record; still transmit */
+			tail->jiffiestamp = 0;
+			tail->next_batnode = NULL;
+			cur->next_batnode = tail;
+		}
+		prev = cur;
+		cur = cur->next_batnode;
+	}
+
+	strlcpy(cur->slvname, slave->dev->name, IFNAMSIZ);
+	now = jiffies;
+	cur->jiffiestamp = now ? now : 1;	/* 0 is reserved for "unused" */
+	cur->tcp_seq_num = seqnum;
+	cur->port_id = port_id;
+
+	return slave;
+}
+
+/* Log the congestion counter of every known slave. */
+static void batman_dump_congestion(void)
+{
+	struct slave_congestion_node *node;
+
+	for (node = slavelist_head_ptr; node; node = node->next_slave)
+		printk(KERN_DEBUG "slave %p: %s has congestion %u\n",
+		       node, node->this_slave, node->congestion_counter);
+}
+
+/**
+ * bond_xmit_batman - transmit in BOND_MODE_BATMAN
+ * @skb: frame to transmit
+ * @bond_dev: bonding net_device
+ *
+ * IGMP goes out on curr_active_slave so that join/membership reports keep
+ * using one consistent interface.  Anything else that is not TCP over IPv4
+ * goes out on the slave with the largest congestion counter.  TCP segments
+ * are spread over the slaves weighted by their counters and recorded in
+ * battable[] so that a later retransmission can be detected.
+ *
+ * Return: always NETDEV_TX_OK.
+ */
+static int bond_xmit_batman(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	/* only look at the IP header once the frame is known to be IPv4 */
+	struct iphdr *iph = skb->protocol == htons(ETH_P_IP) ? ip_hdr(skb) : NULL;
+	struct slave *slave;
+	unsigned long flags;
+	int congestion = 0;
+
+	if (iph && iph->protocol == IPPROTO_IGMP) {
+		slave = rcu_dereference(bond->curr_active_slave);
+		if (slave)
+			bond_dev_queue_xmit(bond, skb, slave->dev);
+		else
+			bond_xmit_slave_id(bond, skb, 0);
+		return NETDEV_TX_OK;
+	}
+
+	spin_lock_irqsave(&batlock, flags);
+
+	/* default choice, used as-is for everything that is not TCP */
+	slave = batman_max_counter_slave(bond, NULL, &congestion);
+	if (!slave)
+		slave = rcu_dereference(bond->curr_active_slave);
+
+	if (iph && iph->protocol == IPPROTO_TCP) {
+		struct tcphdr *tcp_header = tcp_hdr(skb);
+		u16 port_id = max(ntohs(tcp_header->source),
+				  ntohs(tcp_header->dest));
+		u32 seqnum = ntohl(tcp_header->seq);
+
+		batman_normalize_congestion(bond, &congestion);
+
+		slave = batman_weighted_slave(bond, congestion);
+		if (!slave) {
+			printk(KERN_DEBUG "HOLY 13TH AMENDMENT, BATMAN!\n");
+			goto unlock;
+		}
+
+		slave = batman_record_segment(bond, slave, port_id, seqnum);
+
+		if (jiffies % 100 == 0)
+			batman_dump_congestion();
+	}
+
+unlock:
+	spin_unlock_irqrestore(&batlock, flags);
+
+	if (slave)
+		bond_dev_queue_xmit(bond, skb, slave->dev);
+	else
+		bond_xmit_slave_id(bond, skb, 0);
+
+	return NETDEV_TX_OK;
+}
+
 /**
  * bond_xmit_slave_id - transmit skb through slave with slave_id
  * @bond: bonding device that is transmitting
@@ -3896,6 +4254,8 @@ static netdev_tx_t __bond_start_xmit(str
 	switch (BOND_MODE(bond)) {
 	case BOND_MODE_ROUNDROBIN:
 		return bond_xmit_roundrobin(skb, dev);
+	case BOND_MODE_BATMAN:
+		return bond_xmit_batman(skb, dev);
 	case BOND_MODE_ACTIVEBACKUP:
 		return bond_xmit_activebackup(skb, dev);
 	case BOND_MODE_8023AD:
@@ -4020,6 +4380,14 @@ static void bond_destructor(struct net_d
 	if (bond->wq)
 		destroy_workqueue(bond->wq);
+
+	/*
+	 * bond is bond_dev's private area, so the decay thread has to be
+	 * stopped before free_netdev() releases that memory.
+	 */
+	if (!IS_ERR_OR_NULL(bond->battimer_task))
+		kthread_stop(bond->battimer_task);
+
 	free_netdev(bond_dev);
 }
 
 void bond_setup(struct net_device *bond_dev)
@@ -4068,6 +4436,18 @@ void bond_setup(struct net_device *bond_
 	bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM);
 	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
 	bond_dev->features |= bond_dev->hw_features;
+
+	if (bond_mode == BOND_MODE_BATMAN) {
+		printk(KERN_INFO "%s: I'm Batman.\n", bond_dev->name);
+		bond->battimer_task = kthread_run(batman_timer_invoke, bond,
+						  "%s_batman", bond_dev->name);
+		if (IS_ERR(bond->battimer_task)) {
+			pr_warn("%s: cannot start batman thread (%ld)\n",
+				bond_dev->name, PTR_ERR(bond->battimer_task));
+			/* leave nothing for bond_destructor() to stop */
+			bond->battimer_task = NULL;
+		}
+	}
 }
 
 /* Destroy a bonding device.
@@ -4615,6 +4995,9 @@ err_link:
 
 static void __exit bonding_exit(void)
 {
+	int i;
+	struct slave_congestion_node *next_slave;
+
 	unregister_netdevice_notifier(&bond_netdev_notifier);
 
 	bond_destroy_debugfs();
@@ -4622,6 +5005,29 @@ static void __exit bonding_exit(void)
 	bond_netlink_fini();
 	unregister_pernet_subsys(&bond_net_ops);
 
+	/* Free the chained (kmalloc()ed) entries of every bucket; the heads are static. */
+	for (i = 0; i < BATTABLE_SIZE; i++) {
+		struct batnode *next_batnode = battable[i].next_batnode;
+
+		while (next_batnode) {
+			struct batnode *next_next_batnode = next_batnode->next_batnode;
+
+			kfree(next_batnode);
+			next_batnode = next_next_batnode;
+		}
+	}
+	memset(battable, 0, sizeof(battable));
+
+	/* Free the per-slave congestion list. */
+	next_slave = slavelist_head_ptr;
+	while (next_slave) {
+		struct slave_congestion_node *next_next_slave = next_slave->next_slave;
+
+		kfree(next_slave);
+		next_slave = next_next_slave;
+	}
+	slavelist_head_ptr = NULL;
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	/* Make sure we don't have an imbalance on our netpoll blocking */
 	WARN_ON(atomic_read(&netpoll_block_tx));
--- tree_a/drivers/net/bonding/bond_options.c	2015-05-06 16:04:23.000000000 -0400
+++ tree_b/drivers/net/bonding/bond_options.c	2015-05-08 23:33:24.850987473 -0400
@@ -80,6 +80,7 @@ static const struct bond_opt_value bond_
 	{ "802.3ad",       BOND_MODE_8023AD,      0},
 	{ "balance-tlb",   BOND_MODE_TLB,         0},
 	{ "balance-alb",   BOND_MODE_ALB,         0},
+	{ "batman",        BOND_MODE_BATMAN,      0},
 	{ NULL,            -1,                    0},
 };
 