netdev - AW: HSR/PRP sequence counter issue with Cisco Redbox

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e20bb1bd30e9465ea36d26b274b8b2b6@EXCH-SVR2013.eberle.local>
Date:   Wed, 17 Feb 2021 13:14:14 +0000
From:   "Wenzel, Marco" <Marco.Wenzel@...berle.de>
To:     George McCollister <george.mccollister@...il.com>
CC:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: AW: HSR/PRP sequence counter issue with Cisco Redbox

On Mon, Feb 15, 2021 at 5:49 PM George McCollister <george.mccollister@...il.com> wrote:
> 
> On Mon, Feb 15, 2021 at 6:30 AM Wenzel, Marco <Marco.Wenzel@a-
> eberle.de> wrote:
> >
> > > On Wed, Jan 27, 2021 at 6:32 AM Wenzel, Marco <Marco.Wenzel@a-
> > > eberle.de> wrote:
> > > >
> > > > Hi,
> > > >
> > > > we have figured out an issue with the current PRP driver when
> > > > trying to
> > > communicate with Cisco IE 2000 industrial Ethernet switches in
> > > Redbox mode. The Cisco always resets the HSR/PRP sequence counter to
> > > "1" at low traffic (<= 1 frame in 400 ms). It can be reproduced by a
> > > simple ICMP echo request with 1 s interval between a Linux box
> > > running with PRP and a VDAN behind the Cisco Redbox. The Linux box
> > > then always receives frames with sequence counter "1" and drops
> > > them. The behavior is not configurable at the Cisco Redbox.
> > > >
> > > > I fixed it by ignoring sequence counters with value "1" at the
> > > > sequence
> > > counter check in hsr_register_frame_out ():
> > > >
> > > > diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index
> > > > 5c97de459905..630c238e81f0 100644
> > > > --- a/net/hsr/hsr_framereg.c
> > > > +++ b/net/hsr/hsr_framereg.c
> > > > @@ -411,7 +411,7 @@ void hsr_register_frame_in(struct hsr_node
> > > > *node, struct hsr_port *port,  int hsr_register_frame_out(struct
> > > > hsr_port *port,
> > > struct hsr_node *node,
> > > >                            u16 sequence_nr)  {
> > > > -       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port-
> >type]))
> > > > +       if (seq_nr_before_or_eq(sequence_nr,
> > > > + node->seq_out[port->type]) && (sequence_nr != 1))
> > > >                 return 1;
> > > >
> > > >         node->seq_out[port->type] = sequence_nr;
> > > >
> > > >
> > > > Do you think this could be a solution? Should this patch be
> > > > officially applied
> > > in order to avoid other users running into these communication issues?
> > >
> > > This isn't the correct way to solve the problem. IEC 62439-3 defines
> > > EntryForgetTime as "Time after which an entry is removed from the
> > > duplicate table" with a value of 400ms and states devices should
> > > usually be configured to keep entries in the table for a much
> > > shorter time. hsr_framereg.c needs to be reworked to handle this
> according to the specification.
> >
> > Sorry for the delay but I did not have the time to take a closer look at the
> problem until now.
> >
> > My suggestion for the EntryForgetTime feature would be the following: A
> time_out element will be added to the hsr_node structure, which always
> stores the current time when entering hsr_register_frame_out(). If the last
> stored time is older than EntryForgetTime (400 ms) the sequence number
> check will be ignored.
> >
> > diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index
> > 5c97de459905..a97bffbd2581 100644
> > --- a/net/hsr/hsr_framereg.c
> > +++ b/net/hsr/hsr_framereg.c
> > @@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct
> hsr_priv *hsr,
> >          * as initialization. (0 could trigger an spurious ring error warning).
> >          */
> >         now = jiffies;
> > -       for (i = 0; i < HSR_PT_PORTS; i++)
> > +       for (i = 0; i < HSR_PT_PORTS; i++) {
> >                 new_node->time_in[i] = now;
> > +               new_node->time_out[i] = now;
> > +       }
> >         for (i = 0; i < HSR_PT_PORTS; i++)
> >                 new_node->seq_out[i] = seq_out;
> >
> > @@ -411,9 +413,12 @@ void hsr_register_frame_in(struct hsr_node
> *node,
> > struct hsr_port *port,  int hsr_register_frame_out(struct hsr_port *port,
> struct hsr_node *node,
> >                            u16 sequence_nr)  {
> > -       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
> > +       if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])
> &&
> > +                time_is_after_jiffies(node->time_out[port->type] +
> > + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) {
> >                 return 1;
> > +       }
> >
> > +       node->time_out[port->type] = jiffies;
> >         node->seq_out[port->type] = sequence_nr;
> >         return 0;
> >  }
> > diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index
> > 86b43f539f2c..d9628e7a5f05 100644
> > --- a/net/hsr/hsr_framereg.h
> > +++ b/net/hsr/hsr_framereg.h
> > @@ -75,6 +75,7 @@ struct hsr_node {
> >         enum hsr_port_type      addr_B_port;
> >         unsigned long           time_in[HSR_PT_PORTS];
> >         bool                    time_in_stale[HSR_PT_PORTS];
> > +       unsigned long           time_out[HSR_PT_PORTS];
> >         /* if the node is a SAN */
> >         bool                    san_a;
> >         bool                    san_b;
> > diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index
> > 7dc92ce5a134..f79ca55d6986 100644
> > --- a/net/hsr/hsr_main.h
> > +++ b/net/hsr/hsr_main.h
> > @@ -21,6 +21,7 @@
> >  #define HSR_LIFE_CHECK_INTERVAL                 2000 /* ms */
> >  #define HSR_NODE_FORGET_TIME           60000 /* ms */
> >  #define HSR_ANNOUNCE_INTERVAL            100 /* ms */
> > +#define HSR_ENTRY_FORGET_TIME            400 /* ms */
> >
> >  /* By how much may slave1 and slave2 timestamps of latest received
> frame from
> >   * each node differ before we notify of communication problem?
> >
> >
> > This approach works fine with the Cisco IE 2000 and I think it implements
> the correct way to handle sequence numbers as defined in IEC 62439-3.
> 
> Looks good to me. Can you send an official patch? If so I'll try it out. Even if I
> can't replicate the Cisco situation I can try it with my setups and make sure it
> doesn't break anything.

I have not had much experience with kernel patching until now, and I hope that this patch is now correct:


From 8836f1df35a884327da37885ff3ad8bfc5eb933c Mon Sep 17 00:00:00 2001
From: Marco Wenzel <marco.wenzel@...berle.de>
Date: Wed, 17 Feb 2021 13:53:31 +0100
Subject: [PATCH] net: hsr: add support for EntryForgetTime

In IEC 62439-3 EntryForgetTime is defined with a value of 400 ms. When a
node does not send any frame within this time, the sequence number check
can be ignored. This solves communication issues with Cisco IE 2000
in Redbox mode.

Signed-off-by: Marco Wenzel <marco.wenzel@...berle.de>
---
 net/hsr/hsr_framereg.c | 9 +++++++--
 net/hsr/hsr_framereg.h | 1 +
 net/hsr/hsr_main.h     | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 5c97de459905..805f974923b9 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -164,8 +164,10 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
 	 * as initialization. (0 could trigger an spurious ring error warning).
 	 */
 	now = jiffies;
-	for (i = 0; i < HSR_PT_PORTS; i++)
+	for (i = 0; i < HSR_PT_PORTS; i++) {
 		new_node->time_in[i] = now;
+		new_node->time_out[i] = now;
+	}
 	for (i = 0; i < HSR_PT_PORTS; i++)
 		new_node->seq_out[i] = seq_out;
 
@@ -411,9 +413,12 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
 int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
 			   u16 sequence_nr)
 {
-	if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
+	if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
+	    time_is_after_jiffies(node->time_out[port->type] +
+	    msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)))
 		return 1;
 
+	node->time_out[port->type] = jiffies;
 	node->seq_out[port->type] = sequence_nr;
 	return 0;
 }
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 86b43f539f2c..7a120ce3e3db 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -75,6 +75,7 @@ struct hsr_node {
 	enum hsr_port_type	addr_B_port;
 	unsigned long		time_in[HSR_PT_PORTS];
 	bool			time_in_stale[HSR_PT_PORTS];
+	unsigned long	  time_out[HSR_PT_PORTS];
 	/* if the node is a SAN */
 	bool			san_a;
 	bool			san_b;
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 7dc92ce5a134..f79ca55d6986 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -21,6 +21,7 @@
 #define HSR_LIFE_CHECK_INTERVAL		 2000 /* ms */
 #define HSR_NODE_FORGET_TIME		60000 /* ms */
 #define HSR_ANNOUNCE_INTERVAL		  100 /* ms */
+#define HSR_ENTRY_FORGET_TIME		  400 /* ms */
 
 /* By how much may slave1 and slave2 timestamps of latest received frame from
  * each node differ before we notify of communication problem?
-- 
2.29.2


Regards,
Marco Wenzel

> 
> Regards,
> George McCollister
> 
> >
> > Regards,
> > Marco Wenzel
> >
> > > >
> > > > Thanks
> > > > Marco Wenzel
> > >
> > > Regards,
> > > George McCollister