[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8265fa8e-3de1-444e-8f58-ec60c79d6a9c@oracle.com>
Date: Tue, 5 Mar 2024 14:59:11 -0800
From: Manjunath Patil <manjunath.b.patil@...cle.com>
To: Leon Romanovsky <leon@...nel.org>
Cc: dledford@...hat.com, jgg@...pe.ca, linux-rdma@...r.kernel.org,
linux-kernel@...r.kernel.org, rama.nichanamatlu@...cle.com
Subject: Re: [PATCH RFC] RDMA/cm: add timeout to cm_destroy_id wait
On 3/3/24 1:58 AM, Leon Romanovsky wrote:
> On Tue, Feb 27, 2024 at 12:00:17PM -0800, Manjunath Patil wrote:
>> Add timeout to cm_destroy_id, so that userspace can trigger any data
>> collection that would help in analyzing the cause of delay in destroying
>> the cm_id.
>
> Why doesn't rdmatool resource cm_id dump help to see stalled cm_ids?
Wouldn't this require us to know cm_id before hand?
I am unfamiliar with rdmatool. Can you explain how I use it to detect a stalled connection?
I wouldn't know cm_id before hand to track it to see if that is stalled.
My intention is to have a script monitor for stalled connections[Ex: one of my connections was stuck in destroying it's cm_id] and trigger things like firmware dumps, enable more logging in related modules, crash node if this takes longer than few minutes etc.
The current logic is to, have this timeout trigger a function(which is traceable with ebpf/dtrace) in error path, if more than expected time is spent is destroying the cm_id.
-Thank you,
Manjunath
>
> Thanks
>
>>
>> New noinline function helps dtrace/ebpf programs to hook on to it.
>> Existing functionality isn't changed except triggering a probe-able new
>> function at every timeout interval.
>>
>> We have seen cases where CM messages stuck with MAD layer (either due to
>> software bug or faulty HCA), leading to cm_id getting stuck in the
>> following call stack. This patch helps in resolving such issues faster.
>>
>> kernel: ... INFO: task XXXX:56778 blocked for more than 120 seconds.
>> ...
>> Call Trace:
>> __schedule+0x2bc/0x895
>> schedule+0x36/0x7c
>> schedule_timeout+0x1f6/0x31f
>> ? __slab_free+0x19c/0x2ba
>> wait_for_completion+0x12b/0x18a
>> ? wake_up_q+0x80/0x73
>> cm_destroy_id+0x345/0x610 [ib_cm]
>> ib_destroy_cm_id+0x10/0x20 [ib_cm]
>> rdma_destroy_id+0xa8/0x300 [rdma_cm]
>> ucma_destroy_id+0x13e/0x190 [rdma_ucm]
>> ucma_write+0xe0/0x160 [rdma_ucm]
>> __vfs_write+0x3a/0x16d
>> vfs_write+0xb2/0x1a1
>> ? syscall_trace_enter+0x1ce/0x2b8
>> SyS_write+0x5c/0xd3
>> do_syscall_64+0x79/0x1b9
>> entry_SYSCALL_64_after_hwframe+0x16d/0x0
>>
>> Signed-off-by: Manjunath Patil <manjunath.b.patil@...cle.com>
>> ---
>> drivers/infiniband/core/cm.c | 38 +++++++++++++++++++++++++++++++++++-
>> 1 file changed, 37 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
>> index ff58058aeadc..03f7b80efa77 100644
>> --- a/drivers/infiniband/core/cm.c
>> +++ b/drivers/infiniband/core/cm.c
>> @@ -34,6 +34,20 @@ MODULE_AUTHOR("Sean Hefty");
>> MODULE_DESCRIPTION("InfiniBand CM");
>> MODULE_LICENSE("Dual BSD/GPL");
>>
>> +static unsigned long cm_destroy_id_wait_timeout_sec = 10;
>> +
>> +static struct ctl_table_header *cm_ctl_table_header;
>> +static struct ctl_table cm_ctl_table[] = {
>> + {
>> + .procname = "destroy_id_wait_timeout_sec",
>> + .data = &cm_destroy_id_wait_timeout_sec,
>> + .maxlen = sizeof(cm_destroy_id_wait_timeout_sec),
>> + .mode = 0644,
>> + .proc_handler = proc_doulongvec_minmax,
>> + },
>> + { }
>> +};
>> +
>> static const char * const ibcm_rej_reason_strs[] = {
>> [IB_CM_REJ_NO_QP] = "no QP",
>> [IB_CM_REJ_NO_EEC] = "no EEC",
>> @@ -1025,10 +1039,20 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
>> }
>> }
>>
>> +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id)
>> +{
>> + struct cm_id_private *cm_id_priv;
>> +
>> + cm_id_priv = container_of(cm_id, struct cm_id_private, id);
>> + pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__,
>> + cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount));
>> +}
>> +
>> static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>> {
>> struct cm_id_private *cm_id_priv;
>> struct cm_work *work;
>> + int ret;
>>
>> cm_id_priv = container_of(cm_id, struct cm_id_private, id);
>> spin_lock_irq(&cm_id_priv->lock);
>> @@ -1135,7 +1159,14 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>>
>> xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
>> cm_deref_id(cm_id_priv);
>> - wait_for_completion(&cm_id_priv->comp);
>> + do {
>> + ret = wait_for_completion_timeout(&cm_id_priv->comp,
>> + msecs_to_jiffies(
>> + cm_destroy_id_wait_timeout_sec * 1000));
>> + if (!ret) /* timeout happened */
>> + cm_destroy_id_wait_timeout(cm_id);
>> + } while (!ret);
>> +
>> while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
>> cm_free_work(work);
>>
>> @@ -4505,6 +4536,10 @@ static int __init ib_cm_init(void)
>> ret = ib_register_client(&cm_client);
>> if (ret)
>> goto error3;
>> + cm_ctl_table_header = register_net_sysctl(&init_net,
>> + "net/ib_cm", cm_ctl_table);
>> + if (!cm_ctl_table_header)
>> + pr_warn("ib_cm: couldn't register sysctl path, using default values\n");
>>
>> return 0;
>> error3:
>> @@ -4522,6 +4557,7 @@ static void __exit ib_cm_cleanup(void)
>> cancel_delayed_work(&timewait_info->work.work);
>> spin_unlock_irq(&cm.lock);
>>
>> + unregister_net_sysctl_table(cm_ctl_table_header);
>> ib_unregister_client(&cm_client);
>> destroy_workqueue(cm.wq);
>>
>> --
>> 2.31.1
>>
>>
Powered by blists - more mailing lists