[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d3580fa0-df0c-49ce-aa4e-e8c945172939@nvidia.com>
Date: Fri, 21 Mar 2025 15:55:49 +1100
From: Balbir Singh <balbirs@...dia.com>
To: Bert Karwatzki <spasswolf@....de>, Alex Deucher <alexdeucher@...il.com>
Cc: Ingo Molnar <mingo@...nel.org>, Kees Cook <kees@...nel.org>,
Bjorn Helgaas <bhelgaas@...gle.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Peter Zijlstra <peterz@...radead.org>, Andy Lutomirski <luto@...nel.org>,
linux-kernel@...r.kernel.org, amd-gfx@...ts.freedesktop.org
Subject: Re: commit 7ffb791423c7 breaks steam game
On 3/21/25 10:43, Bert Karwatzki wrote:
> I did some monitoring using this patch (on top of 6.12.18):
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> index 0760e70402ec..ccd0c9058cee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> @@ -121,6 +121,8 @@ static int amdgpu_gtt_mgr_new(struct ttm_resource_manager
> *man,
> int r;
>
> node = kzalloc(struct_size(node, mm_nodes, 1), GFP_KERNEL);
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO "%s: node = %px\n", __func__, node);
> if (!node)
> return -ENOMEM;
>
> @@ -142,10 +144,16 @@ static int amdgpu_gtt_mgr_new(struct ttm_resource_manager
> *man,
> goto err_free;
>
> node->base.start = node->mm_nodes[0].start;
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO "%s %d: node->base.start = 0x%lx node-
>> base.size = 0x%lx\n",
> + __func__, __LINE__, node->base.start, node-
>> base.size);
> } else {
> node->mm_nodes[0].start = 0;
> node->mm_nodes[0].size = PFN_UP(node->base.size);
> node->base.start = AMDGPU_BO_INVALID_OFFSET;
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO "%s %d: node->base.start = 0x%lx node-
>> base.size = 0x%lx\n",
> + __func__, __LINE__, node->base.start, node-
>> base.size);
> }
>
> *res = &node->base;
> @@ -170,6 +178,8 @@ static void amdgpu_gtt_mgr_del(struct ttm_resource_manager
> *man,
> {
> struct ttm_range_mgr_node *node = to_ttm_range_mgr_node(res);
> struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO "%s: node = %px\n", __func__, node);
>
> spin_lock(&mgr->lock);
> if (drm_mm_node_allocated(&node->mm_nodes[0]))
> @@ -217,7 +227,11 @@ static bool amdgpu_gtt_mgr_intersects(struct
> ttm_resource_manager *man,
> const struct ttm_place *place,
> size_t size)
> {
> - return !place->lpfn || amdgpu_gtt_mgr_has_gart_addr(res);
> + bool ret;
> + ret = !place->lpfn || amdgpu_gtt_mgr_has_gart_addr(res);
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO, "%s: returning ret = %d", __func__, ret);
> + return ret;
> }
>
> /**
> @@ -235,7 +249,11 @@ static bool amdgpu_gtt_mgr_compatible(struct
> ttm_resource_manager *man,
> const struct ttm_place *place,
> size_t size)
> {
> - return !place->lpfn || amdgpu_gtt_mgr_has_gart_addr(res);
> + bool ret;
> + ret = !place->lpfn || amdgpu_gtt_mgr_has_gart_addr(res);
> + if (!strcmp(get_current()->comm, "stellaris"))
> + printk(KERN_INFO, "%s: returning ret = %d", __func__, ret);
> + return ret;
> }
>
> /**
> @@ -288,6 +306,8 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t
> gtt_size)
> drm_mm_init(&mgr->mm, start, size);
> spin_lock_init(&mgr->lock);
>
> + dev_info(adev->dev, "%s: start = 0x%llx size = 0x%llx\n", __func__,
> start, size);
> +
> ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_TT, &mgr->manager);
> ttm_resource_manager_set_used(man, true);
> return 0;
> diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
> index 1ed68d3cd80b..e525a1276304 100644
> --- a/drivers/gpu/drm/drm_mm.c
> +++ b/drivers/gpu/drm/drm_mm.c
> @@ -223,6 +223,13 @@ static void insert_hole_size(struct rb_root_cached *root,
> struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
> u64 x = node->hole_size;
> bool first = true;
> + int count = 0;
> +
> + if (!strcmp(get_current()->comm, "stellaris")) {
> + for(struct rb_node *first = rb_first_cached(root); first; first
> = rb_next(first))
> + count++;
> + printk(KERN_INFO "%s: RB count = %d\n", __func__, count);
> + }
>
> while (*link) {
> rb = *link;
> @@ -247,6 +254,13 @@ static void insert_hole_addr(struct rb_root *root, struct
> drm_mm_node *node)
> struct rb_node **link = &root->rb_node, *rb_parent = NULL;
> u64 start = HOLE_ADDR(node), subtree_max_hole = node->subtree_max_hole;
> struct drm_mm_node *parent;
> + int count = 0;
> +
> + if (!strcmp(get_current()->comm, "stellaris")) {
> + for(struct rb_node *first = rb_first(root); first; first =
> rb_next(first))
> + count++;
> + printk(KERN_INFO "%s: RB count = %d\n", __func__, count);
> + }
>
> while (*link) {
> rb_parent = *link;
>
>
> With this I ran stellaris (just opening the game then closing it again from the
> game menu)
>
> The findings are:
> (a) The size of the RB tree is the same in the working and non-working case
> (50-60)
> (b) The number of calls to amdgpu_gtt_mgr_new() is ~2000 in both cases
> (c) In the non-working case amdgpu_gtt_mgr_del() is called far more often than
> in the working case:
> Non-working case (cmdline: nokaslr) 834 calls to amdgpu_gtt_mgr_del()
> Working case (cmdline: nokaslr amdgpu.vramlimit=512) 51 calls to
> amdgpu_gtt_mgr_del()
> Working case (cmdline: no additional arguments) 44 calls to amdgpu_gtt_mgr_del()
>
I am not an expert in amdgpu or gtt_mgr, but I wonder if some of the deletes are coming
from forceful eviction of memory during allocation?
Have you filed a bug report for the nokaslr case?
Balbir Singh
Powered by blists - more mailing lists