[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ce6c33d4-e3c5-0b5d-aab8-e0ec57dcfe6c@quicinc.com>
Date: Wed, 31 Jan 2024 21:45:57 +0530
From: Mukesh Ojha <quic_mojha@...cinc.com>
To: Rodrigo Vivi <rodrigo.vivi@...el.com>
CC: <linux-kernel@...r.kernel.org>, Johannes Berg <johannes@...solutions.net>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
"Rafael J . Wysocki"
<rafael@...nel.org>,
Jose Souza <jose.souza@...el.com>
Subject: Re: [PATCH 2/2] devcoredump: Remove the mutex serialization
On 1/30/2024 9:04 PM, Rodrigo Vivi wrote:
> On Tue, Jan 30, 2024 at 05:32:24PM +0530, Mukesh Ojha wrote:
>>
>>
>> On 1/26/2024 8:41 PM, Rodrigo Vivi wrote:
>>> The commit 01daccf74832 ("devcoredump : Serialize devcd_del work")
>>> introduced the mutex to protect the case where mod_delayed_work
>>> could be called before the delayed work even existed.
>>>
>>> Instead, we can simply initialize the delayed work before the device
>>> is added, so the race condition doesn't exist at first place.
>>>
>>> The mutex_unlock is very problematic here. Although mod_delayed_work
>>> is async, we have no warranty that the work is not finished before
>>> the mutex_unlock(devcd->mutex), and if that happen 'devcd' is used
>>> after freed.
>>
>> I agree, Mutex is bad and last time there was only a situation of UAF from
>> disable_store() and that can not occur as it keeps its ref, so
>> we went ahead with the change.,
>
> my concern was with:
>
> flush_delayed_work(&devcd->del_wk);
> mutex_unlock(&devcd->mutex);
>
> at devcd_free().
>
> flush_work always wait for the work to finish it's execution,
> which will delete the device.
> The with the release, the devcd should be gone soon and
> this is at risk of the UAF, no?
>
> maybe I'm missing something.
Before your patch, the only place where flush_delayed_work()
used was devcd_free() and that is getting called from disabled_store()
is taking its own reference and due to which above UAF issue would not
happen from above path.
>
>
>>
>>>
>>> Cc: Mukesh Ojha <quic_mojha@...cinc.com>
>>> Cc: Johannes Berg <johannes@...solutions.net>
>>> Cc: Greg Kroah-Hartman <gregkh@...uxfoundation.org>
>>> Cc: Rafael J. Wysocki <rafael@...nel.org>
>>> Cc: Jose Souza <jose.souza@...el.com>
>>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@...el.com>
>>> ---
>>> drivers/base/devcoredump.c | 97 +++-----------------------------------
>>> 1 file changed, 6 insertions(+), 91 deletions(-)
>>>
>>> diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
>>> index 678ecc2fa242..0e26b1273920 100644
>>> --- a/drivers/base/devcoredump.c
>>> +++ b/drivers/base/devcoredump.c
>>> @@ -25,47 +25,6 @@ struct devcd_entry {
>>> struct device devcd_dev;
>>> void *data;
>>> size_t datalen;
>>> - /*
>>> - * Here, mutex is required to serialize the calls to del_wk work between
>>> - * user/kernel space which happens when devcd is added with device_add()
>>> - * and that sends uevent to user space. User space reads the uevents,
>>> - * and calls to devcd_data_write() which try to modify the work which is
>>> - * not even initialized/queued from devcoredump.
>>> - *
>>> - *
>>> - *
>>> - * cpu0(X) cpu1(Y)
>>> - *
>>> - * dev_coredump() uevent sent to user space
>>> - * device_add() ======================> user space process Y reads the
>>> - * uevents writes to devcd fd
>>> - * which results into writes to
>>> - *
>>> - * devcd_data_write()
>>> - * mod_delayed_work()
>>> - * try_to_grab_pending()
>>> - * del_timer()
>>> - * debug_assert_init()
>>> - * INIT_DELAYED_WORK()
>>> - * schedule_delayed_work()
>>> - *
>>> - *
>>> - * Also, mutex alone would not be enough to avoid scheduling of
>>> - * del_wk work after it get flush from a call to devcd_free()
>>> - * mentioned as below.
>>> - *
>>> - * disabled_store()
>>> - * devcd_free()
>>> - * mutex_lock() devcd_data_write()
>>> - * flush_delayed_work()
>>> - * mutex_unlock()
>>> - * mutex_lock()
>>> - * mod_delayed_work()
>>> - * mutex_unlock()
>>> - * So, delete_work flag is required.
>>> - */
>>> - struct mutex mutex;
>>> - bool delete_work;
>>> struct module *owner;
>>> ssize_t (*read)(char *buffer, loff_t offset, size_t count,
>>> void *data, size_t datalen);
>>> @@ -125,13 +84,8 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
>>> struct device *dev = kobj_to_dev(kobj);
>>> struct devcd_entry *devcd = dev_to_devcd(dev);
>>> - mutex_lock(&devcd->mutex);
>>> - if (!devcd->delete_work) {
>>> - devcd->delete_work = true;
>>> - mod_delayed_work(system_wq, &devcd->del_wk, 0);
>>> - }
>>> - mutex_unlock(&devcd->mutex);
>>> -
>>> + /* This file needs to be closed before devcd can be deleted */
>>> + mod_delayed_work(system_wq, &devcd->del_wk, 0);
>>> return count;
>>> }
>>> @@ -158,12 +112,7 @@ static int devcd_free(struct device *dev, void *data)
>>> {
>>> struct devcd_entry *devcd = dev_to_devcd(dev);
>>> - mutex_lock(&devcd->mutex);
>>> - if (!devcd->delete_work)
>>> - devcd->delete_work = true;
>>> -
>>> flush_delayed_work(&devcd->del_wk);
>>> - mutex_unlock(&devcd->mutex);
>>> return 0;
>>> }
>>> @@ -173,30 +122,6 @@ static ssize_t disabled_show(const struct class *class, const struct class_attri
>>> return sysfs_emit(buf, "%d\n", devcd_disabled);
>>> }
>>> -/*
>>> - *
>>> - * disabled_store() worker()
>>> - * class_for_each_device(&devcd_class,
>>> - * NULL, NULL, devcd_free)
>>> - * ...
>>> - * ...
>>> - * while ((dev = class_dev_iter_next(&iter))
>>> - * devcd_del()
>>> - * device_del()
>>> - * put_device() <- last reference
>>> - * error = fn(dev, data) devcd_dev_release()
>>> - * devcd_free(dev, data) kfree(devcd)
>>> - * mutex_lock(&devcd->mutex);
>>> - *
>>> - *
>>> - * In the above diagram, It looks like disabled_store() would be racing with parallely
>>> - * running devcd_del() and result in memory abort while acquiring devcd->mutex which
>>> - * is called after kfree of devcd memory after dropping its last reference with
>>> - * put_device(). However, this will not happens as fn(dev, data) runs
>>> - * with its own reference to device via klist_node so it is not its last reference.
>>> - * so, above situation would not occur.
>>> - */
>>> -
>>> static ssize_t disabled_store(const struct class *class, const struct class_attribute *attr,
>>> const char *buf, size_t count)
>>> {
>>> @@ -308,13 +233,7 @@ static void devcd_remove(void *data)
>>> {
>>> struct devcd_entry *devcd = data;
>>> - mutex_lock(&devcd->mutex);
>>> - if (!devcd->delete_work) {
>>> - devcd->delete_work = true;
>>> - /* XXX: Cannot flush otherwise the mutex below will hit a UAF */
>>> - mod_delayed_work(system_wq, &devcd->del_wk, 0);
>>> - }
>>> - mutex_unlock(&devcd->mutex);
>>> + flush_delayed_work(&devcd->del_wk);
>>> }
>>> /**
>>> @@ -365,16 +284,15 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>>> devcd->read = read;
>>> devcd->free = free;
>>> devcd->failing_dev = get_device(dev);
>>> - devcd->delete_work = false;
>>> - mutex_init(&devcd->mutex);
>>> device_initialize(&devcd->devcd_dev);
>>> dev_set_name(&devcd->devcd_dev, "devcd%d",
>>> atomic_inc_return(&devcd_count));
>>> devcd->devcd_dev.class = &devcd_class;
>>> - mutex_lock(&devcd->mutex);
>>> + INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
>>> + schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
>>
>> Last time, we discussed [1] here, involves a assumption
>> about timeout can not happen before device_add() succeeds.
>> It is rare but it is there.
>>
>> https://lore.kernel.org/all/87ilr15ekx.ffs@tglx/
>
> hmm... I couldn't imagine a case where a device_add could
> take longer then 5 minutes, at least not without other bigger
> problems...
>
> I'm wondering that multiple subsequent calls of dev_coredumpm()
> would fail to find the failing_device with the class_find_device
> and all, but maybe I'm overthinking here or missing something else.
There are two issue here which is described here,
1.
/*
*
*
* cpu0(X) cpu1(Y)
*
* dev_coredump() uevent sent to user space
* device_add() ======================> user space process Y
reads the
* uevents writes to devcd fd
* which results into
writes to
*
* devcd_data_write()
* mod_delayed_work()
* try_to_grab_pending()
* del_timer()
*
* INIT_DELAYED_WORK()
* schedule_delayed_work()
*
2.
*
*
* disabled_store()
* devcd_free()
* flush_delayed_work()
* devcd_data_write()
* mod_delayed_work()
*
*
But i think, we can further optimize the existing change to only protect
delete_work flag,
diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 7e2d1f0d903a..af2448da00f4 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -126,12 +126,14 @@ static ssize_t devcd_data_write(struct file *filp,
struct kobject *kobj,
struct devcd_entry *devcd = dev_to_devcd(dev);
mutex_lock(&devcd->mutex);
- if (!devcd->delete_work) {
- devcd->delete_work = true;
- mod_delayed_work(system_wq, &devcd->del_wk, 0);
+ if (devcd->delete_work) {
+ mutex_unlock(&devcd->mutex);
+ goto out;
}
+ devcd->delete_work = true;
mutex_unlock(&devcd->mutex);
-
+ mod_delayed_work(system_wq, &devcd->del_wk, 0);
+out:
return count;
}
@@ -161,9 +163,9 @@ static int devcd_free(struct device *dev, void *data)
mutex_lock(&devcd->mutex);
if (!devcd->delete_work)
devcd->delete_work = true;
-
- flush_delayed_work(&devcd->del_wk);
mutex_unlock(&devcd->mutex);
+ flush_delayed_work(&devcd->del_wk);
+
return 0;
}
@@ -361,7 +363,6 @@ void dev_coredumpm(struct device *dev, struct module
*owner,
atomic_inc_return(&devcd_count));
devcd->devcd_dev.class = &devcd_class;
- mutex_lock(&devcd->mutex);
dev_set_uevent_suppress(&devcd->devcd_dev, true);
if (device_add(&devcd->devcd_dev))
goto put_device;
@@ -377,15 +378,13 @@ void dev_coredumpm(struct device *dev, struct
module *owner,
"devcoredump"))
dev_warn(dev, "devcoredump create_link failed\n");
- dev_set_uevent_suppress(&devcd->devcd_dev, false);
- kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
- mutex_unlock(&devcd->mutex);
+ dev_set_uevent_suppress(&devcd->devcd_dev, false);
+ kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
return;
put_device:
put_device(&devcd->devcd_dev);
- mutex_unlock(&devcd->mutex);
put_module:
module_put(owner);
free:
-Mukesh
>
>>
>> -Mukesh
>>
>>> dev_set_uevent_suppress(&devcd->devcd_dev, true);
>>> if (device_add(&devcd->devcd_dev))
>>> goto put_device;
>>> @@ -392,15 +310,12 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>>> dev_set_uevent_suppress(&devcd->devcd_dev, false);
>>> kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
>>> - INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
>>> - schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
>>> if (devm_add_action(dev, devcd_remove, devcd))
>>> dev_warn(dev, "devcoredump managed auto-removal registration failed\n");
>>> - mutex_unlock(&devcd->mutex);
>>> return;
>>> put_device:
>>> + cancel_delayed_work(&devcd->del_wk);
>>> put_device(&devcd->devcd_dev);
>>> - mutex_unlock(&devcd->mutex);
>>> put_module:
>>> module_put(owner);
>>> free:
Powered by blists - more mailing lists