Message-ID: <F1A21578-E5CA-44E4-958B-C3D481200111@nvidia.com>
Date: Fri, 13 Nov 2020 20:00:10 -0500
From: Zi Yan <ziy@...dia.com>
To: Roman Gushchin <guro@...com>
CC: <linux-mm@...ck.org>, Matthew Wilcox <willy@...radead.org>,
"Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
Andrew Morton <akpm@...ux-foundation.org>,
<linux-kernel@...r.kernel.org>, <linux-kselftest@...r.kernel.org>,
Yang Shi <shy828301@...il.com>,
"Michal Hocko" <mhocko@...nel.org>,
John Hubbard <jhubbard@...dia.com>,
"Ralph Campbell" <rcampbell@...dia.com>,
David Nellans <dnellans@...dia.com>
Subject: Re: [RFC PATCH 4/6] mm: thp: add support for split huge page to any lower order pages.
On 13 Nov 2020, at 19:52, Roman Gushchin wrote:
> On Wed, Nov 11, 2020 at 03:40:06PM -0500, Zi Yan wrote:
>> From: Zi Yan <ziy@...dia.com>
>>
>> To split a THP into pages of any lower order, we need to re-form smaller
>> THPs on the subpages at the given order and add page refcounts according
>> to the new page order. We also need to reinitialize page_deferred_list
>> after removing the page from the split_queue, otherwise a subsequent
>> split will see list corruption when checking page_deferred_list again.
>>
>> This has many uses, such as minimizing the number of pages after
>> truncating a pagecache THP. Anonymous THPs can still only be split to
>> order-0, as before, until support for anonymous THPs of any size is added.
>>
>> Signed-off-by: Zi Yan <ziy@...dia.com>
>> ---
>> include/linux/huge_mm.h | 8 +++++
>> mm/huge_memory.c | 78 +++++++++++++++++++++++++++++------------
>> mm/swap.c | 1 -
>> 3 files changed, 63 insertions(+), 24 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 60a907a19f7d..9819cd9b4619 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -189,6 +189,8 @@ bool is_transparent_hugepage(struct page *page);
>>
>> bool can_split_huge_page(struct page *page, int *pextra_pins);
>> int split_huge_page_to_list(struct page *page, struct list_head *list);
>> +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> + unsigned int new_order);
>> static inline int split_huge_page(struct page *page)
>> {
>> return split_huge_page_to_list(page, NULL);
>> @@ -396,6 +398,12 @@ split_huge_page_to_list(struct page *page, struct list_head *list)
>> {
>> return 0;
>> }
>> +static inline int
>> +split_huge_page_to_order_to_list(struct page *page, struct list_head *list,
>> + unsigned int new_order)
>
> It was
> int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> unsigned int new_order);
> above.
Right. It should be split_huge_page_to_list_to_order. Will fix it.
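The corrected !CONFIG_TRANSPARENT_HUGEPAGE stub will simply be (same body as in
the patch, only the name fixed to match the declaration above):

static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
				 unsigned int new_order)
{
	return 0;
}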
>
>> +{
>> + return 0;
>> +}
>> static inline int split_huge_page(struct page *page)
>> {
>> return 0;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 8b7d771ee962..88f50da40c9b 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2327,11 +2327,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
>> static void unmap_page(struct page *page)
>> {
>> enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
>> - TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
>> + TTU_RMAP_LOCKED;
>> bool unmap_success;
>>
>> VM_BUG_ON_PAGE(!PageHead(page), page);
>>
>> + if (thp_order(page) >= HPAGE_PMD_ORDER)
>> + ttu_flags |= TTU_SPLIT_HUGE_PMD;
>> +
>> if (PageAnon(page))
>> ttu_flags |= TTU_SPLIT_FREEZE;
>>
>> @@ -2339,21 +2342,22 @@ static void unmap_page(struct page *page)
>> VM_BUG_ON_PAGE(!unmap_success, page);
>> }
>>
>> -static void remap_page(struct page *page, unsigned int nr)
>> +static void remap_page(struct page *page, unsigned int nr, unsigned int new_nr)
>> {
>> int i;
>> - if (PageTransHuge(page)) {
>> + if (thp_nr_pages(page) == nr) {
>> remove_migration_ptes(page, page, true);
>> } else {
>> - for (i = 0; i < nr; i++)
>> + for (i = 0; i < nr; i += new_nr)
>> remove_migration_ptes(page + i, page + i, true);
>> }
>> }
>>
>> static void __split_huge_page_tail(struct page *head, int tail,
>> - struct lruvec *lruvec, struct list_head *list)
>> + struct lruvec *lruvec, struct list_head *list, unsigned int new_order)
>> {
>> struct page *page_tail = head + tail;
>> + unsigned long compound_head_flag = new_order ? (1L << PG_head) : 0;
>>
>> VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
>>
>> @@ -2377,6 +2381,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>> #ifdef CONFIG_64BIT
>> (1L << PG_arch_2) |
>> #endif
>> + compound_head_flag |
>> (1L << PG_dirty)));
>>
>> /* ->mapping in first tail page is compound_mapcount */
>> @@ -2395,10 +2400,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
>> * which needs correct compound_head().
>> */
>> clear_compound_head(page_tail);
>> + if (new_order) {
>> + prep_compound_page(page_tail, new_order);
>> + thp_prep(page_tail);
>> + }
>>
>> /* Finally unfreeze refcount. Additional reference from page cache. */
>> - page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
>> - PageSwapCache(head)));
>> + page_ref_unfreeze(page_tail, 1 + ((!PageAnon(head) ||
>> + PageSwapCache(head)) ?
>> + thp_nr_pages(page_tail) : 0));
>>
>> if (page_is_young(head))
>> set_page_young(page_tail);
>> @@ -2416,7 +2426,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>> }
>>
>> static void __split_huge_page(struct page *page, struct list_head *list,
>> - pgoff_t end, unsigned long flags)
>> + pgoff_t end, unsigned long flags, unsigned int new_order)
>> {
>> struct page *head = compound_head(page);
>> pg_data_t *pgdat = page_pgdat(head);
>> @@ -2424,12 +2434,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>> struct address_space *swap_cache = NULL;
>> unsigned long offset = 0;
>> unsigned int nr = thp_nr_pages(head);
>> + unsigned int new_nr = 1 << new_order;
>> int i;
>>
>> lruvec = mem_cgroup_page_lruvec(head, pgdat);
>>
>> /* complete memcg works before add pages to LRU */
>> - mem_cgroup_split_huge_fixup(head, 1);
>> + mem_cgroup_split_huge_fixup(head, new_nr);
>>
>> if (PageAnon(head) && PageSwapCache(head)) {
>> swp_entry_t entry = { .val = page_private(head) };
>> @@ -2439,14 +2450,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>> xa_lock(&swap_cache->i_pages);
>> }
>>
>> - for (i = nr - 1; i >= 1; i--) {
>> - __split_huge_page_tail(head, i, lruvec, list);
>> + for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
>> + __split_huge_page_tail(head, i, lruvec, list, new_order);
>> /* Some pages can be beyond i_size: drop them from page cache */
>> if (head[i].index >= end) {
>> ClearPageDirty(head + i);
>> __delete_from_page_cache(head + i, NULL);
>> if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
>> - shmem_uncharge(head->mapping->host, 1);
>> + shmem_uncharge(head->mapping->host, new_nr);
>> put_page(head + i);
>> } else if (!PageAnon(page)) {
>> __xa_store(&head->mapping->i_pages, head[i].index,
>> @@ -2457,28 +2468,31 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>> }
>> }
>>
>> - ClearPageCompound(head);
>> + if (!new_order)
>> + ClearPageCompound(head);
>> + else
>> + set_compound_order(head, new_order);
>>
>> - split_page_owner(head, nr, 1);
>> + split_page_owner(head, nr, new_nr);
>>
>> /* See comment in __split_huge_page_tail() */
>> if (PageAnon(head)) {
>> /* Additional pin to swap cache */
>> if (PageSwapCache(head)) {
>> - page_ref_add(head, 2);
>> + page_ref_add(head, 1 + new_nr);
>> xa_unlock(&swap_cache->i_pages);
>> } else {
>> page_ref_inc(head);
>> }
>> } else {
>> /* Additional pin to page cache */
>> - page_ref_add(head, 2);
>> + page_ref_add(head, 1 + new_nr);
>> xa_unlock(&head->mapping->i_pages);
>> }
>>
>> spin_unlock_irqrestore(&pgdat->lru_lock, flags);
>>
>> - remap_page(head, nr);
>> + remap_page(head, nr, new_nr);
>>
>> if (PageSwapCache(head)) {
>> swp_entry_t entry = { .val = page_private(head) };
>> @@ -2486,7 +2500,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>> split_swap_cluster(entry);
>> }
>>
>> - for (i = 0; i < nr; i++) {
>> + for (i = 0; i < nr; i += new_nr) {
>> struct page *subpage = head + i;
>> if (subpage == page)
>> continue;
>> @@ -2620,21 +2634,39 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
>> * us.
>> */
>> int split_huge_page_to_list(struct page *page, struct list_head *list)
>> +{
>> + return split_huge_page_to_list_to_order(page, list, 0);
>> +}
>> +
>> +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> + unsigned int new_order)
>> {
>> struct page *head = compound_head(page);
>> struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>> struct deferred_split *ds_queue = get_deferred_split_queue(head);
>> - XA_STATE(xas, &head->mapping->i_pages, head->index);
>> + /* reset xarray order to new order after split */
>> + XA_STATE_ORDER(xas, &head->mapping->i_pages, head->index, new_order);
>> struct anon_vma *anon_vma = NULL;
>> struct address_space *mapping = NULL;
>> int count, mapcount, extra_pins, ret;
>> unsigned long flags;
>> pgoff_t end;
>>
>> + VM_BUG_ON(thp_order(head) <= new_order);
>> VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
>> VM_BUG_ON_PAGE(!PageLocked(head), head);
>> VM_BUG_ON_PAGE(!PageCompound(head), head);
>>
>> + if (new_order == 1) {
>> + WARN_ONCE(1, "Cannot split THP to order-1 (no order-1 THPs)");
>> + return -EINVAL;
>> + }
>> +
>> + if (PageAnon(head) && new_order) {
>> + WARN_ONCE(1, "Split anonymous THP to non-zero order not support");
>> + return -EINVAL;
>> + }
>
> I'd convert those into VM_BUG_ON()'s. Unlikely they will be hit at arbitrary moments
> by random users.
Sure. Will change them.
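Roughly like this (just a sketch of the conversion; whether plain VM_BUG_ON()
or VM_BUG_ON_PAGE() reads better here is an open choice, and head is already
available at that point):

	/* we do not have order-1 THPs, so refuse to create them on split */
	VM_BUG_ON_PAGE(new_order == 1, head);
	/* splitting anonymous THPs to a non-zero order is not supported yet */
	VM_BUG_ON_PAGE(PageAnon(head) && new_order, head);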
Thanks.
—
Best Regards,
Yan Zi