[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <f95707a3-ff53-3a6c-5fa1-1b0fd44d8f38@mellanox.com>
Date: Thu, 16 May 2019 12:02:49 +0000
From: Aya Levin <ayal@...lanox.com>
To: Jiri Pirko <jiri@...nulli.us>
CC: "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
Eran Ben Elisha <eranbe@...lanox.com>,
Saeed Mahameed <saeedm@...lanox.com>,
Jiri Pirko <jiri@...lanox.com>
Subject: Re: [PATCH net-next RFC] Dump SW SQ context as part of tx reporter
On 5/16/2019 2:53 PM, Jiri Pirko wrote:
> Thu, May 16, 2019 at 10:49:54AM CEST, ayal@...lanox.com wrote:
>>
>>
>> On 5/14/2019 3:07 PM, Jiri Pirko wrote:
>>> Sun, May 12, 2019 at 10:37:35AM CEST, ayal@...lanox.com wrote:
>>>>
>>>>
>>>> On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>>>>> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@...lanox.com wrote:
>>>>>>
>>>>>>
>>>>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@...lanox.com wrote:
>>>>>>>> TX reporter reports an error in two scenarios:
>>>>>>>> - TX timeout on a specific tx queue
>>>>>>>> - TX completion error on a specific send queue
>>>>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>>>>> reported, before any recovery procedure was launched. With this
>>>>>>>> approach, no maintenance is needed as the driver fetches the actual data
>>>>>>>> according to the layout with which the SQ was compiled. By providing
>>>>>>>> a SW context, one can easily debug an error on a given SQ.
>>>>>>>>
>>>>>>>> In order to offline translate the raw memory into a human readable
>>>>>>>> format, the user can use out-of-kernel scripts which receive the
>>>>>>>> following as input:
>>>>>>>> - Object raw memory
>>>>>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>>>>>> - Object name
>>>>>>>>
>>>>>>>> An example of such script output can be seen below.
>>>>>>>> Note: the script is not offered as part of this patch as it does not
>>>>>>>> belong to the kernel; I just described it in order to convey the general
>>>>>>>> idea of how/what can be fetched from a SW dump via devlink health.
>>>>>>>>
>>>>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>>>>> mlx5e_txqsq: sqn: 6336
>>>>>>>> memory:
>>>>>>>> 00 00 00 00 00 00 00 00
>>>>>>>> 01 00 00 00 00 00 00 00
>>>>>>>> 00 00 00 00 00 00 00 00
>>>>>>>> 45 f4 88 cb 09 00 00 00
>>>>>>>> 00 00 00 00 00 00 00 00
>>>>>>>> 00 00 00 00 00 00 00 00
>>>>>>>> c0 ff ff ff 1f 00 00 00
>>>>>>>> f8 18 1e 89 81 88 ff ff
>>>>>>>> ...
>>>>>>>>
>>>>>>>> script output below, with struct members names and actual values:
>>>>>>>>
>>>>>>>> struct mlx5e_txqsq {
>>>>>>>> short unsigned int cc 0x5 ;
>>>>>>>> unsigned int dma_fifo_cc 0x5 ;
>>>>>>>> struct net_dim {
>>>>>>>> unsigned char state 0x1 ;
>>>>>>>> struct net_dim_stats {
>>>>>>>> int ppms 0x0 ;
>>>>>>>> int bpms 0x0 ;
>>>>>>>> int epms 0x0 ;
>>>>>>>> } prev_stats;
>>>>>>>> struct net_dim_sample {
>>>>>>>> long long int time 0x90766ef9d ;
>>>>>>>> unsigned int pkt_ctr 0x0 ;
>>>>>>>> unsigned int byte_ctr 0x0 ;
>>>>>>>> short unsigned int event_ctr 0x0 ;
>>>>>>>> } start_sample;
>>>>>>>> struct work_struct {
>>>>>>>> struct {
>>>>>>>> long int counter 0x1fffffffc0 ;
>>>>>>>> } data;
>>>>>>>> struct list_head {
>>>>>>>> struct list_head * next 0xffff8881b08998f8 ;
>>>>>>>> struct list_head * prev 0xffff8881b08998f8 ;
>>>>>>>> } entry;
>>>>>>>> void (*func)(struct work_struct *) 0xffffffffa02d0e30 ;
>>>>>>>> } work;
>>>>>>>> unsigned char profile_ix 0x60 ;
>>>>>>>> unsigned char mode 0x72 ;
>>>>>>>> unsigned char tune_state 0x35 ;
>>>>>>>> unsigned char steps_right 0xa0 ;
>>>>>>>> unsigned char steps_left 0xff ;
>>>>>>>> unsigned char tired 0xff ;
>>>>>>>> } dim;
>>>>>>>> short unsigned int pc 0x0 ;
>>>>>>>> unsigned int dma_fifo_pc 0x0 ;
>>>>>>>> struct mlx5e_cq {
>>>>>>>> struct mlx5_cqwq {
>>>>>>>> struct mlx5_frag_buf_ctrl {
>>>>>>>> struct mlx5_buf_list * frags 0x500000005 ;
>>>>>>>> unsigned int sz_m1 0x0 ;
>>>>>>>> short unsigned int frag_sz_m1 0x0 ;
>>>>>>>> short unsigned int strides_offset 0x0 ;
>>>>>>>> unsigned char log_sz 0x0 ;
>>>>>>>> unsigned char log_stride 0x0 ;
>>>>>>>> unsigned char log_frag_strides 0x0 ;
>>>>>>>> } fbc;
>>>>>>>> __be32 * db 0x0 ;
>>>>>>>> unsigned int cc 0x0 ;
>>>>>>>> } wq;
>>>>>>>> short unsigned int event_ctr 0x0 ;
>>>>>>>> struct napi_struct * napi 0x0 ;
>>>>>>>> struct mlx5_core_cq {
>>>>>>>> unsigned int cqn 0x0 ;
>>>>>>>> int cqe_sz 0x0 ;
>>>>>>>> __be32 * set_ci_db 0xffff8881b1aa4988 ;
>>>>>>>> __be32 * arm_db 0x3f000003ff ;
>>>>>>>> struct mlx5_uars_page * uar 0x6060a ;
>>>>>>>> struct refcount_struct {
>>>>>>>> struct {
>>>>>>>> int counter 0xa1814500 ;
>>>>>>>> } refs;
>>>>>>>> } refcount;
>>>>>>>> struct completion {
>>>>>>>> unsigned int done 0x5 ;
>>>>>>>> struct wait_queue_head {
>>>>>>>> struct spinlock {
>>>>>>>> union {
>>>>>>>> struct raw_spinlock {
>>>>>>>> struct qspinlock {
>>>>>>>> union {
>>>>>>>> struct {
>>>>>>>> int counter 0x5 ;
>>>>>>>> } val;
>>>>>>>> struct {
>>>>>>>> unsigned char locked 0x5 ;
>>>>>>>> unsigned char pending 0x0 ;
>>>>>>>> } ;
>>>>>>>> struct {
>>>>>>>> short unsigned int locked_pending 0x5 ;
>>>>>>>> short unsigned int tail 0x0 ;
>>>>>>>> } ;
>>>>>>>> } ;
>>>>>>>> } raw_lock;
>>>>>>>> } rlock;
>>>>>>>> } ;
>>>>>>>> } lock;
>>>>>>>> struct list_head {
>>>>>>>> struct list_head * next 0xffff8881b089bb88 ;
>>>>>>>> struct list_head * prev 0x4000000c0a ;
>>>>>>>> } head;
>>>>>>>> } wait;
>>>>>>>> } free;
>>>>>>>> unsigned int vector 0xa1814500 ;
>>>>>>>> unsigned int irqn 0xffff8881 ;
>>>>>>>> void (*comp)(struct mlx5_core_cq *) 0xffff8881a1814504 ;
>>>>>>>> void (*event)(struct mlx5_core_cq *, enum mlx5_event) 0xffff8881a2cdea08 ;
>>>>>>>> unsigned int cons_index 0x1 ;
>>>>>>>> unsigned int arm_sn 0x0 ;
>>>>>>>> struct mlx5_rsc_debug * dbg 0x0 ;
>>>>>>>> int pid 0x0 ;
>>>>>>>> struct {
>>>>>>>> struct list_head {
>>>>>>>> struct list_head * next 0xffffffff ;
>>>>>>>> struct list_head * prev 0xffffffffffffffff ;
>>>>>>>> } list;
>>>>>>>> void (*comp)(struct mlx5_core_cq *) 0xffffffffa0356940 ;
>>>>>>>> void * priv 0x0 ;
>>>>>>>> } tasklet_ctx;
>>>>>>>> int reset_notify_added 0x0 ;
>>>>>>>> struct list_head {
>>>>>>>> struct list_head * next 0xffffffffa0300700 ;
>>>>>>>> struct list_head * prev 0xd ;
>>>>>>>> } reset_notify;
>>>>>>>> struct mlx5_eq_comp * eq 0x0 ;
>>>>>>>> short unsigned int uid 0x9a70 ;
>>>>>>>> } mcq;
>>>>>>>> struct mlx5e_channel * channel 0xffff8881b0899a70 ;
>>>>>>>> struct mlx5_core_dev * mdev 0x4800000001 ;
>>>>>>>> struct mlx5_wq_ctrl {
>>>>>>>> struct mlx5_core_dev * mdev 0xffffffffa02d5350 ;
>>>>>>>> struct mlx5_frag_buf {
>>>>>>>> struct mlx5_buf_list * frags 0xffffffffa02d5460 ;
>>>>>>>> int npages 0x0 ;
>>>>>>>> int size 0x5 ;
>>>>>>>> unsigned char page_shift 0x8 ;
>>>>>>>> } buf;
>>>>>>>> struct mlx5_db {
>>>>>>>> __be32 * db 0x1c6 ;
>>>>>>>> union {
>>>>>>>> struct mlx5_db_pgdir * pgdir 0x0 ;
>>>>>>>> struct mlx5_ib_user_db_page * user_page 0x0 ;
>>>>>>>> } u;
>>>>>>>> long long unsigned int dma 0xffff8881b0899ab0 ;
>>>>>>>> int index 0x0 ;
>>>>>>>> } db;
>>>>>>>> } wq_ctrl;
>>>>>>>> } cq;
>>>>>>>> struct mlx5_wq_cyc {
>>>>>>>> struct mlx5_frag_buf_ctrl {
>>>>>>>> struct mlx5_buf_list * frags 0xffff8881a7600160 ;
>>>>>>>> unsigned int sz_m1 0xa7600160 ;
>>>>>>>> short unsigned int frag_sz_m1 0x8881 ;
>>>>>>>> short unsigned int strides_offset 0xffff ;
>>>>>>>> unsigned char log_sz 0x88 ;
>>>>>>>> unsigned char log_stride 0x49 ;
>>>>>>>> unsigned char log_frag_strides 0xaa ;
>>>>>>>> } fbc;
>>>>>>>> __be32 * db 0x1000000000010 ;
>>>>>>>> short unsigned int sz 0xc ;
>>>>>>>> short unsigned int wqe_ctr 0x0 ;
>>>>>>>> short unsigned int cur_sz 0x0 ;
>>>>>>>> } wq;
>>>>>>>> unsigned int dma_fifo_mask 0xa1814500 ;
>>>>>>>> struct mlx5e_sq_stats * stats 0xffff8881a33a0348 ;
>>>>>>>> struct {
>>>>>>>> struct mlx5e_sq_dma * dma_fifo 0x1a1814500 ;
>>>>>>>> struct mlx5e_tx_wqe_info * wqe_info 0x14 ;
>>>>>>>> } db;
>>>>>>>> void * uar_map 0x0 ;
>>>>>>>> struct netdev_queue * txq 0x0 ;
>>>>>>>> unsigned int sqn 0x18c0 ;
>>>>>>>> unsigned char min_inline_mode 0x0 ;
>>>>>>>> struct device * pdev 0x0 ;
>>>>>>>> unsigned int mkey_be 0x0 ;
>>>>>>>> long unsigned int state 0x0 ;
>>>>>>>> struct hwtstamp_config * tstamp 0x0 ;
>>>>>>>> struct mlx5_clock * clock 0xffff8881b1aa6f88 ;
>>>>>>>> struct mlx5_wq_ctrl {
>>>>>>>> struct mlx5_core_dev * mdev 0x3f000003ff ;
>>>>>>>> struct mlx5_frag_buf {
>>>>>>>> struct mlx5_buf_list * frags 0x6060a ;
>>>>>>>> int npages 0xa1814604 ;
>>>>>>>> int size 0xffff8881 ;
>>>>>>>> unsigned char page_shift 0x0 ;
>>>>>>>> } buf;
>>>>>>>> struct mlx5_db {
>>>>>>>> __be32 * db 0xfff ;
>>>>>>>> union {
>>>>>>>> struct mlx5_db_pgdir * pgdir 0x0 ;
>>>>>>>> struct mlx5_ib_user_db_page * user_page 0x0 ;
>>>>>>>> } u;
>>>>>>>> long long unsigned int dma 0xffff888188440000 ;
>>>>>>>> int index 0x8b074000 ;
>>>>>>>> } db;
>>>>>>>> } wq_ctrl;
>>>>>>>> struct mlx5e_channel * channel 0xffffc9000010d800 ;
>>>>>>>> int txq_ix 0xa0020180 ;
>>>>>>>> unsigned int rate_limit 0xffff8881 ;
>>>>>>>> struct work_struct {
>>>>>>>> struct {
>>>>>>>> long int counter 0x1000018c0 ;
>>>>>>>> } data;
>>>>>>>> struct list_head {
>>>>>>>> struct list_head * next 0xffff8881c32b68e8 ;
>>>>>>>> struct list_head * prev 0x800 ;
>>>>>>>> } entry;
>>>>>>>> void (*func)(struct work_struct *) 0x9 ;
>>>>>>>> } recover_work;
>>>>>>>> } ;
>>>>>>>
>>>>>>> I don't get it. You are dumping live kernel memory? There are already
>>>>>>> facilities to do that in place. Why replicate it?
>>>>>> I am dumping the driver's memory under a lock so I can ensure its
>>>>>> consistency (as opposed to /dev/mem).
>>>>>> vmcore cannot be taken from a live kernel (without crashing).
>>>>>> I need the memory's snapshot right after the error from the driver's
>>>>>> context.
>>>>>
>>>>> Got it. However, this sounds like a generic problem not specific to
>>>>> nic drivers. How do other subsystems resolve this (if they do at all)?
>>>>>
>>>>>
>>>> Correct, this is a suggested debugging solution for a generic problem:
>>>> enabling the use of a run-time memory snapshot for kernel modules (at a
>>>> given error event). My research shows that other subsystems deal with
>>>> errors either by panicking (too much) or by debug/log prints (too little).
>>>> This solution (a) is low in maintenance, (b) is consistent in memory, (c) has
>>>> a small performance impact, and (d) uses an existing infrastructure between the
>>>> kernel module and the user space.
>>>
>>> I'm still not convinced that dumping kernel memory over devlink health dump
>>> is a good idea :/
>>>
>>>
>>>> It might be ported to other subsystems using their own user-space vs.
>>>> kernel tools. Regardless of how the memory output was generated to the
>>>> user, the parsing script can work on it.
>>>
>>> Could you share the script? How is it going to be distributed?
>> I thought that the script should be available on the Mellanox website.
>
> :(
Do you think it belongs under kernel/scripts?
>
>
>> The script is still pending review but I will be happy to share it when
>> it's ready.
>>>
>>>
>>>>
>>>>>
>>>>>> Which other tools do you mean?
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Signed-off-by: Aya Levin <ayal@...lanox.com>
>>>>>>>> ---
>>>>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 100 +++++++++++++++++++++
>>>>>>>> 1 file changed, 100 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> @@ -9,6 +9,7 @@
>>>>>>>>
>>>>>>>> struct mlx5e_tx_err_ctx {
>>>>>>>> int (*recover)(struct mlx5e_txqsq *sq);
>>>>>>>> + int (*dump)(struct mlx5e_txqsq *sq);
>>>>>>>> struct mlx5e_txqsq *sq;
>>>>>>>> };
>>>>>>>>
>>>>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>>>>>> return err;
>>>>>>>> }
>>>>>>>>
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>>>>> + struct mlx5e_txqsq *sq,
>>>>>>>> + struct devlink_fmsg *fmsg)
>>>>>>>> +{
>>>>>>>> + u64 *ptr = (u64 *)sq;
>>>>>>>> + int copy, err;
>>>>>>>> + int i = 0;
>>>>>>>> +
>>>>>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>>> + return 0;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> +
>>>>>>>> + while (i < sizeof(struct mlx5e_txqsq)) {
>>>>>>>> + copy = sizeof(u64);
>>>>>>>> +
>>>>>>>> + if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>>>>> + copy = sizeof(struct mlx5e_txqsq) - i;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> + ptr++;
>>>>>>>> + i += copy;
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_obj_nest_end(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + return err;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>>> +
>>>>>>>> + return err;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>>>>> + struct devlink_fmsg *fmsg)
>>>>>>>> +{
>>>>>>>> + int i, err = 0;
>>>>>>>> +
>>>>>>>> + mutex_lock(&priv->state_lock);
>>>>>>>> +
>>>>>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>>> + goto unlock;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>>>>> + if (err)
>>>>>>>> + goto unlock;
>>>>>>>> +
>>>>>>>> + for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>>>>> + i++) {
>>>>>>>> + err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + goto unlock;
>>>>>>>> +
>>>>>>>> + err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>>>>> + fmsg);
>>>>>>>> + if (err)
>>>>>>>> + goto unlock;
>>>>>>>> +
>>>>>>>> + err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + goto unlock;
>>>>>>>> + }
>>>>>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>>> + if (err)
>>>>>>>> + goto unlock;
>>>>>>>> +
>>>>>>>> +unlock:
>>>>>>>> + mutex_unlock(&priv->state_lock);
>>>>>>>> + return err;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>>>>>> + struct devlink_fmsg *fmsg, void *context)
>>>>>>>> +{
>>>>>>>> + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>>>>> + struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>>>>> +
>>>>>>>> + return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>>>>> + fmsg) :
>>>>>>>> + mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>>>> .name = "tx",
>>>>>>>> .recover = mlx5e_tx_reporter_recover,
>>>>>>>> .diagnose = mlx5e_tx_reporter_diagnose,
>>>>>>>> + .dump = mlx5e_tx_reporter_sw_dump,
>>>>>>>> };
>>>>>>>>
>>>>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>>>>> --
>>>>>>>> 2.14.1
>>>>>>>>
Powered by blists - more mailing lists