All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	npiggin@gmail.com, christophe.leroy@csgroup.eu
Cc: Oscar Salvador <osalvador@suse.de>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Joao Martins <joao.m.martins@oracle.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Muchun Song <muchun.song@linux.dev>,
	Will Deacon <will@kernel.org>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Subject: Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage
Date: Wed, 28 Jun 2023 06:53:58 +0530	[thread overview]
Message-ID: <87ttusnzv5.fsf@doe.com> (raw)
In-Reply-To: <20230616110826.344417-14-aneesh.kumar@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

These are just some minor nits in case you are going to send another
revision.

> This is enabled only with radix translation and 1G hugepage size. This will
> be used with devdax device memory with a namespace alignment of 1G.
>
> Anon transparent hugepage is not supported even though we do have helpers
> checking pud_trans_huge(). We should never find that return true. The only
> expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.
>
> Some of the helpers are never expected to get called on hash translation
> and hence is marked to call BUG() in such a case.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h  | 156 ++++++++++++++++--
>  arch/powerpc/include/asm/book3s/64/radix.h    |  37 +++++
>  .../include/asm/book3s/64/tlbflush-radix.h    |   2 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
>  arch/powerpc/mm/book3s64/pgtable.c            |  78 +++++++++
>  arch/powerpc/mm/book3s64/radix_pgtable.c      |  28 ++++
>  arch/powerpc/mm/book3s64/radix_tlb.c          |   7 +
>  arch/powerpc/platforms/Kconfig.cputype        |   1 +
>  include/trace/events/thp.h                    |  17 ++
>  9 files changed, 323 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 4acc9690f599..9a05de007956 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
>  {
>  	return __pud_raw(pte_raw(pte));
>  }
> +
> +static inline pte_t *pudp_ptep(pud_t *pud)
> +{
> +	return (pte_t *)pud;
> +}
> +
> +#define pud_pfn(pud)		pte_pfn(pud_pte(pud))
> +#define pud_dirty(pud)		pte_dirty(pud_pte(pud))
> +#define pud_young(pud)		pte_young(pud_pte(pud))
> +#define pud_mkold(pud)		pte_pud(pte_mkold(pud_pte(pud)))
> +#define pud_wrprotect(pud)	pte_pud(pte_wrprotect(pud_pte(pud)))
> +#define pud_mkdirty(pud)	pte_pud(pte_mkdirty(pud_pte(pud)))
> +#define pud_mkclean(pud)	pte_pud(pte_mkclean(pud_pte(pud)))
> +#define pud_mkyoung(pud)	pte_pud(pte_mkyoung(pud_pte(pud)))
> +#define pud_mkwrite(pud)	pte_pud(pte_mkwrite(pud_pte(pud)))
>  #define pud_write(pud)		pte_write(pud_pte(pud))
>  
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +#define pud_soft_dirty(pmd)    pte_soft_dirty(pud_pte(pud))
> +#define pud_mksoft_dirty(pmd)  pte_pud(pte_mksoft_dirty(pud_pte(pud)))
> +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
> +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
> +
>  static inline int pud_bad(pud_t pud)
>  {
>  	if (radix_enabled())
> @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write)
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
> +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
>  extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
>  extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
>  extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  		       pmd_t *pmdp, pmd_t pmd);
> +extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +		       pud_t *pudp, pud_t pud);
> +
>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>  					unsigned long addr, pmd_t *pmd)
>  {
>  }
>  
> +static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
> +					unsigned long addr, pud_t *pud)
> +{
> +}
> +
>  extern int hash__has_transparent_hugepage(void);
>  static inline int has_transparent_hugepage(void)
>  {
> @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
>  }
>  #define has_transparent_hugepage has_transparent_hugepage
>  
> +static inline int has_transparent_pud_hugepage(void)
> +{
> +	if (radix_enabled())
> +		return radix__has_transparent_pud_hugepage();
> +	return 0;
> +}
> +#define has_transparent_pud_hugepage has_transparent_pud_hugepage
> +
>  static inline unsigned long
>  pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
>  		    unsigned long clr, unsigned long set)
> @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
>  	return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
>  }
>  
> +static inline unsigned long
> +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp,
> +		    unsigned long clr, unsigned long set)
> +{
> +	if (radix_enabled())
> +		return radix__pud_hugepage_update(mm, addr, pudp, clr, set);
> +	BUG();
> +	return pud_val(*pudp);
> +}
> +
>  /*
>   * returns true for pmd migration entries, THP, devmap, hugetlb
>   * But compile time dependent on THP config
> @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd)
>  	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
>  }
>  
> +static inline int pud_large(pud_t pud)
> +{
> +	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> +}
> +
>  /*
>   * For radix we should always find H_PAGE_HASHPTE zero. Hence
>   * the below will work for radix too
> @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
>  	return ((old & _PAGE_ACCESSED) != 0);
>  }
>  
> +static inline int __pudp_test_and_clear_young(struct mm_struct *mm,
> +					      unsigned long addr, pud_t *pudp)
> +{
> +	unsigned long old;
> +
> +	if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
> +		return 0;
> +	old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0);
> +	return ((old & _PAGE_ACCESSED) != 0);
> +}
> +
>  #define __HAVE_ARCH_PMDP_SET_WRPROTECT
>  static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>  				      pmd_t *pmdp)
> @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>  		pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
>  }
>  
> +#define __HAVE_ARCH_PUDP_SET_WRPROTECT
> +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
> +				      pud_t *pudp)
> +{
> +	if (pud_write(*pudp))
> +		pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0);
> +}
> +
>  /*
>   * Only returns true for a THP. False for pmd migration entry.
>   * We also need to return true when we come across a pte that
> @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd)
>  	return hash__pmd_trans_huge(pmd);
>  }
>  
> +static inline int pud_trans_huge(pud_t pud)
> +{
> +	if (!pud_present(pud))
> +		return false;
> +
> +	if (radix_enabled())
> +		return radix__pud_trans_huge(pud);
> +	return 0;
> +}
> +
> +
>  #define __HAVE_ARCH_PMD_SAME
>  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
>  {
> @@ -1203,6 +1286,16 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
>  	return hash__pmd_same(pmd_a, pmd_b);
>  }
>  
> +#define __HAVE_ARCH_PUD_SAME
> +static inline int pud_same(pud_t pud_a, pud_t pud_b)
> +{
> +	if (radix_enabled())
> +		return radix__pud_same(pud_a, pud_b);
> +	BUG();
> +	return 0;
> +}
> +
> +
>  static inline pmd_t __pmd_mkhuge(pmd_t pmd)
>  {
>  	if (radix_enabled())
> @@ -1210,6 +1303,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd)
>  	return hash__pmd_mkhuge(pmd);
>  }
>  
> +static inline pud_t __pud_mkhuge(pud_t pud)
> +{
> +	if (radix_enabled())
> +		return radix__pud_mkhuge(pud);
> +	BUG();
> +	return pud;
> +}
> +
>  /*
>   * pfn_pmd return a pmd_t that can be used as pmd pte entry.
>   */
> @@ -1225,14 +1326,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
>  	return pmd;
>  }
>  
> +static inline pud_t pud_mkhuge(pud_t pud)
> +{
> +#ifdef CONFIG_DEBUG_VM
> +	if (radix_enabled())
> +		WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0);
> +	else
> +		WARN_ON(1);
> +#endif
> +	return pud;
> +}
> +
> +
>  #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
>  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
>  				 unsigned long address, pmd_t *pmdp,
>  				 pmd_t entry, int dirty);
> +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS
> +extern int pudp_set_access_flags(struct vm_area_struct *vma,
> +				 unsigned long address, pud_t *pudp,
> +				 pud_t entry, int dirty);
>  
>  #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
>  extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  				     unsigned long address, pmd_t *pmdp);
> +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG
> +extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
> +				     unsigned long address, pud_t *pudp);
> +
>  
>  #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
>  static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
> @@ -1243,6 +1364,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
>  	return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
>  }
>  
> +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
> +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
> +					    unsigned long addr, pud_t *pudp)
> +{
> +	if (radix_enabled())
> +		return radix__pudp_huge_get_and_clear(mm, addr, pudp);
> +	BUG();
> +	return *pudp;
> +}
> +
>  static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>  					unsigned long address, pmd_t *pmdp)
>  {
> @@ -1257,6 +1388,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>  				   unsigned long addr,
>  				   pmd_t *pmdp, int full);
>  
> +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
> +				   unsigned long addr,
> +				   pud_t *pudp, int full);
> +
>  #define __HAVE_ARCH_PGTABLE_DEPOSIT
>  static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
>  					      pmd_t *pmdp, pgtable_t pgtable)
> @@ -1305,6 +1441,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
>  	return hash__pmd_mkdevmap(pmd);
>  }
>  
> +static inline pud_t pud_mkdevmap(pud_t pud)
> +{
> +	if (radix_enabled())
> +		return radix__pud_mkdevmap(pud);
> +	BUG();
> +	return pud;
> +}
> +
>  static inline int pmd_devmap(pmd_t pmd)
>  {
>  	return pte_devmap(pmd_pte(pmd));
> @@ -1312,7 +1456,7 @@ static inline int pmd_devmap(pmd_t pmd)
>  
>  static inline int pud_devmap(pud_t pud)
>  {
> -	return 0;
> +	return pte_devmap(pud_pte(pud));
>  }
>  
>  static inline int pgd_devmap(pgd_t pgd)
> @@ -1321,16 +1465,6 @@ static inline int pgd_devmap(pgd_t pgd)
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> -static inline int pud_pfn(pud_t pud)
> -{
> -	/*
> -	 * Currently all calls to pud_pfn() are gated around a pud_devmap()
> -	 * check so this should never be used. If it grows another user we
> -	 * want to know about it.
> -	 */
> -	BUILD_BUG();
> -	return 0;
> -}
>  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
>  pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
>  void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 686001eda936..8cdff5a05011 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud)
>  	return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
>  }
>  
> +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b)
> +{
> +	return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0);
> +}
>  
>  static inline int radix__p4d_bad(p4d_t p4d)
>  {
> @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
>  	return __pmd(pmd_val(pmd) | _PAGE_PTE);
>  }
>  
> +static inline int radix__pud_trans_huge(pud_t pud)
> +{
> +	return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
> +}
> +
> +static inline pud_t radix__pud_mkhuge(pud_t pud)
> +{
> +	return __pud(pud_val(pud) | _PAGE_PTE);
> +}
> +
>  extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
>  					  pmd_t *pmdp, unsigned long clr,
>  					  unsigned long set);
> +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
> +						pud_t *pudp, unsigned long clr,
> +						unsigned long set);
>  extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
>  				  unsigned long address, pmd_t *pmdp);
>  extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>  extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
>  extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  				      unsigned long addr, pmd_t *pmdp);
> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
> +				     unsigned long addr, pud_t *pudp);
> +
>  static inline int radix__has_transparent_hugepage(void)
>  {
>  	/* For radix 2M at PMD level means thp */
> @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void)
>  		return 1;
>  	return 0;
>  }
> +
> +static inline int radix__has_transparent_pud_hugepage(void)
> +{
> +	/* For radix 1G at PUD level means pud hugepage support */
> +	if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT)
> +		return 1;
> +	return 0;
> +}
>  #endif
>  
>  static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>  	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
>  }
>  
> +static inline pud_t radix__pud_mkdevmap(pud_t pud)
> +{
> +	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
> +}
> +
> +struct vmem_altmap;
> +struct dev_pagemap;

Minor nit.
I guess this struct dev_pagemap is meant to be added in a later patch.

>  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>  					     unsigned long page_size,
>  					     unsigned long phys);
> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
> +				      int node, struct vmem_altmap *altmap);
>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>  				    unsigned long page_size);
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 77797a2a82eb..a38542259fab 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
>  				      unsigned long end, int psize);
>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  				       unsigned long start, unsigned long end);
> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
> +				       unsigned long start, unsigned long end);
>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>  			    unsigned long end);
>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 0d0c1447ecf0..a01c20a8fbf7 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
>  		radix__flush_pmd_tlb_range(vma, start, end);
>  }
>  
> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
> +				       unsigned long start, unsigned long end)
> +{
> +	if (radix_enabled())
> +		radix__flush_pud_tlb_range(vma, start, end);
> +}
> +
>  #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
>  static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
>  					   unsigned long start,
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 85c84e89e3ea..9e5f01a1738c 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>  	return changed;
>  }
>  
> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
> +			  pud_t *pudp, pud_t entry, int dirty)
> +{
> +	int changed;
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON(!pud_devmap(*pudp));

just a query -
for pmdp_set_access_flags() we had
WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));

so why don't we require the same check here?

> +	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
> +#endif
> +	changed = !pud_same(*(pudp), entry);
> +	if (changed) {
> +		/*
> +		 * We can use MMU_PAGE_2M here, because only radix

s/MMU_PAGE_2M/MMY_PAGE_1G

> +		 * path look at the psize.
> +		 */
> +		__ptep_set_access_flags(vma, pudp_ptep(pudp),
> +					pud_pte(entry), address, MMU_PAGE_1G);
> +	}
> +	return changed;
> +}
> +
> +
>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  			      unsigned long address, pmd_t *pmdp)
>  {
>  	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
>  }
> +
> +int pudp_test_and_clear_young(struct vm_area_struct *vma,
> +			      unsigned long address, pud_t *pudp)
> +{
> +	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
> +}
> +
>  /*
>   * set a new huge pmd. We should not be called for updating
>   * an existing pmd entry. That should go via pmd_hugepage_update.
> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
>  }
>  
> +void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +		pud_t *pudp, pud_t pud)
> +{
> +#ifdef CONFIG_DEBUG_VM
> +	/*
> +	 * Make sure hardware valid bit is not set. We don't do
> +	 * tlb flush for this update.
> +	 */
> +
> +	WARN_ON(pte_hw_valid(pud_pte(*pudp)));

For set_pmd_at() we had
WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));

Could you please help me understand why is it different for set_pud_at()


> +	assert_spin_locked(pud_lockptr(mm, pudp));
> +	WARN_ON(!(pud_large(pud)));
> +#endif
> +	trace_hugepage_set_pud(addr, pud_val(pud));
> +	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
> +}
> +
>  static void do_serialize(void *arg)
>  {
>  	/* We've taken the IPI, so try to trim the mask while here */
> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>  	return pmd;
>  }
>  
> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
> +				   unsigned long addr, pud_t *pudp, int full)
> +{
> +	pud_t pud;
> +
> +	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
> +	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
> +		  !pud_present(*pudp));
> +	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
> +	/*
> +	 * if it not a fullmm flush, then we can possibly end up converting
> +	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
> +	 * Make sure we flush the tlb in this case.
> +	 */
> +	if (!full)
> +		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
> +	return pud;
> +}
> +
>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>  {
>  	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
>  }
>  
> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
> +{
> +	return __pud(pud_val(pud) | pgprot_val(pgprot));
> +}
> +
>  /*
>   * At some point we should be able to get rid of
>   * pmd_mkhuge() and mk_huge_pmd() when we update all the
> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>  	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
>  }
>  
> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
> +{
> +	unsigned long pudv;
> +
> +	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
> +
> +	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
> +}
> +
>  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
>  {
>  	return pfn_pmd(page_to_pfn(page), pgprot);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 76f6a1f3b9d8..d7e2dd3d4add 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
>  	return old;
>  }
>  
> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
> +					 pud_t *pudp, unsigned long clr,
> +					 unsigned long set)
> +{
> +	unsigned long old;
> +
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON(!pud_devmap(*pudp));
> +	assert_spin_locked(pud_lockptr(mm, pudp));
> +#endif
> +
> +	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
> +	trace_hugepage_update(addr, old, clr, set);

Here, we are using the same trace event for both pmd and pud update.
See the comment in the end.

> +
> +	return old;
> +}
> +
>  pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>  			pmd_t *pmdp)
>  
> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  	return old_pmd;
>  }
>  
> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
> +				     unsigned long addr, pud_t *pudp)
> +{
> +	pud_t old_pud;
> +	unsigned long old;
> +
> +	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
> +	old_pud = __pud(old);
> +	return old_pud;
> +}
> +
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index ce804b7bf84e..a18f7d2c9f63 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  }
>  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
>  
> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
> +				unsigned long start, unsigned long end)
> +{
> +	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
> +}
> +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
> +
>  void radix__flush_tlb_all(void)
>  {
>  	unsigned long rb,prs,r,rs;
> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> index 45fd975ef521..340b86ef7284 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64
>  	select PPC_FPU
>  	select PPC_HAVE_PMU_SUPPORT
>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>  	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK
>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
> index 202b3e3e67ff..a919d943d106 100644
> --- a/include/trace/events/thp.h
> +++ b/include/trace/events/thp.h
> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd,
>  	    TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
>  );
>  
> +TRACE_EVENT(hugepage_set_pud,

Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing
the exact same thing, we can as well have a
DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class.

> +
> +	    TP_PROTO(unsigned long addr, unsigned long pud),
> +	    TP_ARGS(addr, pud),
> +	    TP_STRUCT__entry(
> +		    __field(unsigned long, addr)
> +		    __field(unsigned long, pud)
> +			    ),
> +
> +	    TP_fast_assign(
> +		    __entry->addr = addr;
> +		    __entry->pud = pud;
> +		    ),
> +
> +	    TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud)
> +	);
> +
>  
>  TRACE_EVENT(hugepage_update,

Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then
DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud.

-ritesh


WARNING: multiple messages have this Message-ID (diff)
From: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	npiggin@gmail.com, christophe.leroy@csgroup.eu
Cc: Catalin Marinas <catalin.marinas@arm.com>,
	Muchun Song <muchun.song@linux.dev>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Oscar Salvador <osalvador@suse.de>, Will Deacon <will@kernel.org>,
	Joao Martins <joao.m.martins@oracle.com>,
	Mike Kravetz <mike.kravetz@oracle.com>
Subject: Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage
Date: Wed, 28 Jun 2023 06:53:58 +0530	[thread overview]
Message-ID: <87ttusnzv5.fsf@doe.com> (raw)
In-Reply-To: <20230616110826.344417-14-aneesh.kumar@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

These are just some minor nits in case you are going to send another
revision.

> This is enabled only with radix translation and 1G hugepage size. This will
> be used with devdax device memory with a namespace alignment of 1G.
>
> Anon transparent hugepage is not supported even though we do have helpers
> checking pud_trans_huge(). We should never find that return true. The only
> expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.
>
> Some of the helpers are never expected to get called on hash translation
> and hence is marked to call BUG() in such a case.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h  | 156 ++++++++++++++++--
>  arch/powerpc/include/asm/book3s/64/radix.h    |  37 +++++
>  .../include/asm/book3s/64/tlbflush-radix.h    |   2 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
>  arch/powerpc/mm/book3s64/pgtable.c            |  78 +++++++++
>  arch/powerpc/mm/book3s64/radix_pgtable.c      |  28 ++++
>  arch/powerpc/mm/book3s64/radix_tlb.c          |   7 +
>  arch/powerpc/platforms/Kconfig.cputype        |   1 +
>  include/trace/events/thp.h                    |  17 ++
>  9 files changed, 323 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 4acc9690f599..9a05de007956 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
>  {
>  	return __pud_raw(pte_raw(pte));
>  }
> +
> +static inline pte_t *pudp_ptep(pud_t *pud)
> +{
> +	return (pte_t *)pud;
> +}
> +
> +#define pud_pfn(pud)		pte_pfn(pud_pte(pud))
> +#define pud_dirty(pud)		pte_dirty(pud_pte(pud))
> +#define pud_young(pud)		pte_young(pud_pte(pud))
> +#define pud_mkold(pud)		pte_pud(pte_mkold(pud_pte(pud)))
> +#define pud_wrprotect(pud)	pte_pud(pte_wrprotect(pud_pte(pud)))
> +#define pud_mkdirty(pud)	pte_pud(pte_mkdirty(pud_pte(pud)))
> +#define pud_mkclean(pud)	pte_pud(pte_mkclean(pud_pte(pud)))
> +#define pud_mkyoung(pud)	pte_pud(pte_mkyoung(pud_pte(pud)))
> +#define pud_mkwrite(pud)	pte_pud(pte_mkwrite(pud_pte(pud)))
>  #define pud_write(pud)		pte_write(pud_pte(pud))
>  
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +#define pud_soft_dirty(pmd)    pte_soft_dirty(pud_pte(pud))
> +#define pud_mksoft_dirty(pmd)  pte_pud(pte_mksoft_dirty(pud_pte(pud)))
> +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
> +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
> +
>  static inline int pud_bad(pud_t pud)
>  {
>  	if (radix_enabled())
> @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write)
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
> +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
>  extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
>  extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
>  extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  		       pmd_t *pmdp, pmd_t pmd);
> +extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +		       pud_t *pudp, pud_t pud);
> +
>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>  					unsigned long addr, pmd_t *pmd)
>  {
>  }
>  
> +static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
> +					unsigned long addr, pud_t *pud)
> +{
> +}
> +
>  extern int hash__has_transparent_hugepage(void);
>  static inline int has_transparent_hugepage(void)
>  {
> @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
>  }
>  #define has_transparent_hugepage has_transparent_hugepage
>  
> +static inline int has_transparent_pud_hugepage(void)
> +{
> +	if (radix_enabled())
> +		return radix__has_transparent_pud_hugepage();
> +	return 0;
> +}
> +#define has_transparent_pud_hugepage has_transparent_pud_hugepage
> +
>  static inline unsigned long
>  pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
>  		    unsigned long clr, unsigned long set)
> @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
>  	return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
>  }
>  
> +static inline unsigned long
> +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp,
> +		    unsigned long clr, unsigned long set)
> +{
> +	if (radix_enabled())
> +		return radix__pud_hugepage_update(mm, addr, pudp, clr, set);
> +	BUG();
> +	return pud_val(*pudp);
> +}
> +
>  /*
>   * returns true for pmd migration entries, THP, devmap, hugetlb
>   * But compile time dependent on THP config
> @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd)
>  	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
>  }
>  
> +static inline int pud_large(pud_t pud)
> +{
> +	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
> +}
> +
>  /*
>   * For radix we should always find H_PAGE_HASHPTE zero. Hence
>   * the below will work for radix too
> @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
>  	return ((old & _PAGE_ACCESSED) != 0);
>  }
>  
> +static inline int __pudp_test_and_clear_young(struct mm_struct *mm,
> +					      unsigned long addr, pud_t *pudp)
> +{
> +	unsigned long old;
> +
> +	if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
> +		return 0;
> +	old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0);
> +	return ((old & _PAGE_ACCESSED) != 0);
> +}
> +
>  #define __HAVE_ARCH_PMDP_SET_WRPROTECT
>  static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>  				      pmd_t *pmdp)
> @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>  		pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
>  }
>  
> +#define __HAVE_ARCH_PUDP_SET_WRPROTECT
> +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
> +				      pud_t *pudp)
> +{
> +	if (pud_write(*pudp))
> +		pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0);
> +}
> +
>  /*
>   * Only returns true for a THP. False for pmd migration entry.
>   * We also need to return true when we come across a pte that
> @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd)
>  	return hash__pmd_trans_huge(pmd);
>  }
>  
> +static inline int pud_trans_huge(pud_t pud)
> +{
> +	if (!pud_present(pud))
> +		return false;
> +
> +	if (radix_enabled())
> +		return radix__pud_trans_huge(pud);
> +	return 0;
> +}
> +
> +
>  #define __HAVE_ARCH_PMD_SAME
>  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
>  {
> @@ -1203,6 +1286,16 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
>  	return hash__pmd_same(pmd_a, pmd_b);
>  }
>  
> +#define __HAVE_ARCH_PUD_SAME
> +static inline int pud_same(pud_t pud_a, pud_t pud_b)
> +{
> +	if (radix_enabled())
> +		return radix__pud_same(pud_a, pud_b);
> +	BUG();
> +	return 0;
> +}
> +
> +
>  static inline pmd_t __pmd_mkhuge(pmd_t pmd)
>  {
>  	if (radix_enabled())
> @@ -1210,6 +1303,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd)
>  	return hash__pmd_mkhuge(pmd);
>  }
>  
> +static inline pud_t __pud_mkhuge(pud_t pud)
> +{
> +	if (radix_enabled())
> +		return radix__pud_mkhuge(pud);
> +	BUG();
> +	return pud;
> +}
> +
>  /*
>   * pfn_pmd return a pmd_t that can be used as pmd pte entry.
>   */
> @@ -1225,14 +1326,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
>  	return pmd;
>  }
>  
> +static inline pud_t pud_mkhuge(pud_t pud)
> +{
> +#ifdef CONFIG_DEBUG_VM
> +	if (radix_enabled())
> +		WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0);
> +	else
> +		WARN_ON(1);
> +#endif
> +	return pud;
> +}
> +
> +
>  #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
>  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
>  				 unsigned long address, pmd_t *pmdp,
>  				 pmd_t entry, int dirty);
> +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS
> +extern int pudp_set_access_flags(struct vm_area_struct *vma,
> +				 unsigned long address, pud_t *pudp,
> +				 pud_t entry, int dirty);
>  
>  #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
>  extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  				     unsigned long address, pmd_t *pmdp);
> +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG
> +extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
> +				     unsigned long address, pud_t *pudp);
> +
>  
>  #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
>  static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
> @@ -1243,6 +1364,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
>  	return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
>  }
>  
> +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
> +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
> +					    unsigned long addr, pud_t *pudp)
> +{
> +	if (radix_enabled())
> +		return radix__pudp_huge_get_and_clear(mm, addr, pudp);
> +	BUG();
> +	return *pudp;
> +}
> +
>  static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>  					unsigned long address, pmd_t *pmdp)
>  {
> @@ -1257,6 +1388,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>  				   unsigned long addr,
>  				   pmd_t *pmdp, int full);
>  
> +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
> +				   unsigned long addr,
> +				   pud_t *pudp, int full);
> +
>  #define __HAVE_ARCH_PGTABLE_DEPOSIT
>  static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
>  					      pmd_t *pmdp, pgtable_t pgtable)
> @@ -1305,6 +1441,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
>  	return hash__pmd_mkdevmap(pmd);
>  }
>  
> +static inline pud_t pud_mkdevmap(pud_t pud)
> +{
> +	if (radix_enabled())
> +		return radix__pud_mkdevmap(pud);
> +	BUG();
> +	return pud;
> +}
> +
>  static inline int pmd_devmap(pmd_t pmd)
>  {
>  	return pte_devmap(pmd_pte(pmd));
> @@ -1312,7 +1456,7 @@ static inline int pmd_devmap(pmd_t pmd)
>  
>  static inline int pud_devmap(pud_t pud)
>  {
> -	return 0;
> +	return pte_devmap(pud_pte(pud));
>  }
>  
>  static inline int pgd_devmap(pgd_t pgd)
> @@ -1321,16 +1465,6 @@ static inline int pgd_devmap(pgd_t pgd)
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> -static inline int pud_pfn(pud_t pud)
> -{
> -	/*
> -	 * Currently all calls to pud_pfn() are gated around a pud_devmap()
> -	 * check so this should never be used. If it grows another user we
> -	 * want to know about it.
> -	 */
> -	BUILD_BUG();
> -	return 0;
> -}
>  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
>  pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
>  void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 686001eda936..8cdff5a05011 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud)
>  	return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
>  }
>  
> +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b)
> +{
> +	return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0);
> +}
>  
>  static inline int radix__p4d_bad(p4d_t p4d)
>  {
> @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
>  	return __pmd(pmd_val(pmd) | _PAGE_PTE);
>  }
>  
> +static inline int radix__pud_trans_huge(pud_t pud)
> +{
> +	return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
> +}
> +
> +static inline pud_t radix__pud_mkhuge(pud_t pud)
> +{
> +	return __pud(pud_val(pud) | _PAGE_PTE);
> +}
> +
>  extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
>  					  pmd_t *pmdp, unsigned long clr,
>  					  unsigned long set);
> +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
> +						pud_t *pudp, unsigned long clr,
> +						unsigned long set);
>  extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
>  				  unsigned long address, pmd_t *pmdp);
>  extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>  extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
>  extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  				      unsigned long addr, pmd_t *pmdp);
> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
> +				     unsigned long addr, pud_t *pudp);
> +
>  static inline int radix__has_transparent_hugepage(void)
>  {
>  	/* For radix 2M at PMD level means thp */
> @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void)
>  		return 1;
>  	return 0;
>  }
> +
> +static inline int radix__has_transparent_pud_hugepage(void)
> +{
> +	/* For radix 1G at PUD level means pud hugepage support */
> +	if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT)
> +		return 1;
> +	return 0;
> +}
>  #endif
>  
>  static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>  	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
>  }
>  
> +static inline pud_t radix__pud_mkdevmap(pud_t pud)
> +{
> +	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
> +}
> +
> +struct vmem_altmap;
> +struct dev_pagemap;

Minor nit.
I guess this struct dev_pagemap is meant to be added in a later patch.

>  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>  					     unsigned long page_size,
>  					     unsigned long phys);
> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
> +				      int node, struct vmem_altmap *altmap);
>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>  				    unsigned long page_size);
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 77797a2a82eb..a38542259fab 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
>  				      unsigned long end, int psize);
>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  				       unsigned long start, unsigned long end);
> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
> +				       unsigned long start, unsigned long end);
>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>  			    unsigned long end);
>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 0d0c1447ecf0..a01c20a8fbf7 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
>  		radix__flush_pmd_tlb_range(vma, start, end);
>  }
>  
> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
> +				       unsigned long start, unsigned long end)
> +{
> +	if (radix_enabled())
> +		radix__flush_pud_tlb_range(vma, start, end);
> +}
> +
>  #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
>  static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
>  					   unsigned long start,
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 85c84e89e3ea..9e5f01a1738c 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>  	return changed;
>  }
>  
> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
> +			  pud_t *pudp, pud_t entry, int dirty)
> +{
> +	int changed;
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON(!pud_devmap(*pudp));

just a query -
for pmdp_set_access_flags() we had
WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));

so why don't we require the same check here?

> +	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
> +#endif
> +	changed = !pud_same(*(pudp), entry);
> +	if (changed) {
> +		/*
> +		 * We can use MMU_PAGE_2M here, because only radix

s/MMU_PAGE_2M/MMY_PAGE_1G

> +		 * path look at the psize.
> +		 */
> +		__ptep_set_access_flags(vma, pudp_ptep(pudp),
> +					pud_pte(entry), address, MMU_PAGE_1G);
> +	}
> +	return changed;
> +}
> +
> +
>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  			      unsigned long address, pmd_t *pmdp)
>  {
>  	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
>  }
> +
> +int pudp_test_and_clear_young(struct vm_area_struct *vma,
> +			      unsigned long address, pud_t *pudp)
> +{
> +	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
> +}
> +
>  /*
>   * set a new huge pmd. We should not be called for updating
>   * an existing pmd entry. That should go via pmd_hugepage_update.
> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
>  }
>  
> +void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +		pud_t *pudp, pud_t pud)
> +{
> +#ifdef CONFIG_DEBUG_VM
> +	/*
> +	 * Make sure hardware valid bit is not set. We don't do
> +	 * tlb flush for this update.
> +	 */
> +
> +	WARN_ON(pte_hw_valid(pud_pte(*pudp)));

For set_pmd_at() we had
WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));

Could you please help me understand why is it different for set_pud_at()


> +	assert_spin_locked(pud_lockptr(mm, pudp));
> +	WARN_ON(!(pud_large(pud)));
> +#endif
> +	trace_hugepage_set_pud(addr, pud_val(pud));
> +	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
> +}
> +
>  static void do_serialize(void *arg)
>  {
>  	/* We've taken the IPI, so try to trim the mask while here */
> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>  	return pmd;
>  }
>  
> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
> +				   unsigned long addr, pud_t *pudp, int full)
> +{
> +	pud_t pud;
> +
> +	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
> +	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
> +		  !pud_present(*pudp));
> +	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
> +	/*
> +	 * if it not a fullmm flush, then we can possibly end up converting
> +	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
> +	 * Make sure we flush the tlb in this case.
> +	 */
> +	if (!full)
> +		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
> +	return pud;
> +}
> +
>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>  {
>  	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
>  }
>  
> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
> +{
> +	return __pud(pud_val(pud) | pgprot_val(pgprot));
> +}
> +
>  /*
>   * At some point we should be able to get rid of
>   * pmd_mkhuge() and mk_huge_pmd() when we update all the
> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>  	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
>  }
>  
> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
> +{
> +	unsigned long pudv;
> +
> +	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
> +
> +	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
> +}
> +
>  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
>  {
>  	return pfn_pmd(page_to_pfn(page), pgprot);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 76f6a1f3b9d8..d7e2dd3d4add 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
>  	return old;
>  }
>  
> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
> +					 pud_t *pudp, unsigned long clr,
> +					 unsigned long set)
> +{
> +	unsigned long old;
> +
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON(!pud_devmap(*pudp));
> +	assert_spin_locked(pud_lockptr(mm, pudp));
> +#endif
> +
> +	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
> +	trace_hugepage_update(addr, old, clr, set);

Here, we are using the same trace event for both pmd and pud update.
See the comment in the end.

> +
> +	return old;
> +}
> +
>  pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>  			pmd_t *pmdp)
>  
> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  	return old_pmd;
>  }
>  
> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
> +				     unsigned long addr, pud_t *pudp)
> +{
> +	pud_t old_pud;
> +	unsigned long old;
> +
> +	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
> +	old_pud = __pud(old);
> +	return old_pud;
> +}
> +
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index ce804b7bf84e..a18f7d2c9f63 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  }
>  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
>  
> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
> +				unsigned long start, unsigned long end)
> +{
> +	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
> +}
> +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
> +
>  void radix__flush_tlb_all(void)
>  {
>  	unsigned long rb,prs,r,rs;
> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> index 45fd975ef521..340b86ef7284 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64
>  	select PPC_FPU
>  	select PPC_HAVE_PMU_SUPPORT
>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>  	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK
>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
> index 202b3e3e67ff..a919d943d106 100644
> --- a/include/trace/events/thp.h
> +++ b/include/trace/events/thp.h
> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd,
>  	    TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
>  );
>  
> +TRACE_EVENT(hugepage_set_pud,

Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing
the exact same thing, we can as well have a
DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class.

> +
> +	    TP_PROTO(unsigned long addr, unsigned long pud),
> +	    TP_ARGS(addr, pud),
> +	    TP_STRUCT__entry(
> +		    __field(unsigned long, addr)
> +		    __field(unsigned long, pud)
> +			    ),
> +
> +	    TP_fast_assign(
> +		    __entry->addr = addr;
> +		    __entry->pud = pud;
> +		    ),
> +
> +	    TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud)
> +	);
> +
>  
>  TRACE_EVENT(hugepage_update,

Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then
DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud.

-ritesh

  reply	other threads:[~2023-06-28  1:24 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-16 11:08 [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Aneesh Kumar K.V
2023-06-16 11:08 ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 01/16] powerpc/mm/book3s64: Use pmdp_ptep helper instead of typecasting Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 02/16] powerpc/book3s64/mm: mmu_vmemmap_psize is used by radix Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 03/16] powerpc/book3s64/mm: Fix DirectMap stats in /proc/meminfo Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 04/16] powerpc/book3s64/mm: Use PAGE_KERNEL instead of opencoding Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 05/16] powerpc/mm/dax: Fix the condition when checking if altmap vmemap can cross-boundary Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 06/16] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 07/16] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 08/16] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-20 11:53   ` Joao Martins
2023-06-20 11:53     ` Joao Martins
2023-06-20 14:29     ` Aneesh Kumar K.V
2023-06-20 14:29       ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 09/16] mm/vmemmap: Allow architectures to override how vmemmap optimization works Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 10/16] mm: Add __HAVE_ARCH_PUD_SAME similar to __HAVE_ARCH_P4D_SAME Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 11/16] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 12/16] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:09   ` Ritesh Harjani
2023-06-28  1:09     ` Ritesh Harjani
2023-06-28  3:01     ` Aneesh Kumar K V
2023-06-28  3:01       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:23   ` Ritesh Harjani [this message]
2023-06-28  1:23     ` Ritesh Harjani
2023-06-28  3:32     ` Aneesh Kumar K V
2023-06-28  3:32       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:33   ` Ritesh Harjani
2023-06-28  1:33     ` Ritesh Harjani
2023-06-28  3:37     ` Aneesh Kumar K V
2023-06-28  3:37       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 15/16] powerpc/book3s64/radix: Add support for vmemmap optimization for radix Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 16/16] powerpc/book3s64/radix: Remove mmu_vmemmap_psize Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-18 11:54 ` [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Sachin Sant
2023-06-18 11:54   ` Sachin Sant
2023-06-24 14:52 ` Aneesh Kumar K.V
2023-06-24 14:52   ` Aneesh Kumar K.V
2023-06-24 17:22   ` Andrew Morton
2023-06-24 17:22     ` Andrew Morton
2023-07-03  5:26 ` (subset) " Michael Ellerman
2023-07-03  5:26   ` Michael Ellerman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87ttusnzv5.fsf@doe.com \
    --to=ritesh.list@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=catalin.marinas@arm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=dan.j.williams@intel.com \
    --cc=joao.m.martins@oracle.com \
    --cc=linux-mm@kvack.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mike.kravetz@oracle.com \
    --cc=mpe@ellerman.id.au \
    --cc=muchun.song@linux.dev \
    --cc=npiggin@gmail.com \
    --cc=osalvador@suse.de \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.