All of lore.kernel.org
 help / color / mirror / Atom feed
From: Aneesh Kumar K V <aneesh.kumar@linux.ibm.com>
To: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	npiggin@gmail.com, christophe.leroy@csgroup.eu
Cc: Oscar Salvador <osalvador@suse.de>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Joao Martins <joao.m.martins@oracle.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Muchun Song <muchun.song@linux.dev>,
	Will Deacon <will@kernel.org>
Subject: Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage
Date: Wed, 28 Jun 2023 09:02:21 +0530	[thread overview]
Message-ID: <0d8633f2-69de-c178-fe03-49742b063962@linux.ibm.com> (raw)
In-Reply-To: <87ttusnzv5.fsf@doe.com>

On 6/28/23 6:53 AM, Ritesh Harjani (IBM) wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

....

>>  
>>  static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>>  	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
>>  }
>>  
>> +static inline pud_t radix__pud_mkdevmap(pud_t pud)
>> +{
>> +	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
>> +}
>> +
>> +struct vmem_altmap;
>> +struct dev_pagemap;
> 
> Minor nit.
> I guess this struct dev_pagemap is meant to be added in a later patch.

Moved that change to a later patch.

> 
>>  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>>  					     unsigned long page_size,
>>  					     unsigned long phys);
>> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
>> +				      int node, struct vmem_altmap *altmap);
>>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>>  				    unsigned long page_size);
>>  
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> index 77797a2a82eb..a38542259fab 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
>>  				      unsigned long end, int psize);
>>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  				       unsigned long start, unsigned long end);
>> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end);
>>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>>  			    unsigned long end);
>>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> index 0d0c1447ecf0..a01c20a8fbf7 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  		radix__flush_pmd_tlb_range(vma, start, end);
>>  }
>>  
>> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
>> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end)
>> +{
>> +	if (radix_enabled())
>> +		radix__flush_pud_tlb_range(vma, start, end);
>> +}
>> +
>>  #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
>>  static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
>>  					   unsigned long start,
>> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
>> index 85c84e89e3ea..9e5f01a1738c 100644
>> --- a/arch/powerpc/mm/book3s64/pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/pgtable.c
>> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>>  	return changed;
>>  }
>>  
>> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>> +			  pud_t *pudp, pud_t entry, int dirty)
>> +{
>> +	int changed;
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
> 
> just a query -
> for pmdp_set_access_flags() we had
> WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> 
> so why don't we require the same check here?
> 

Because we don't support generic anon PUD-level THP, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
>> +#endif
>> +	changed = !pud_same(*(pudp), entry);
>> +	if (changed) {
>> +		/*
>> +		 * We can use MMU_PAGE_2M here, because only radix
> 
> s/MMU_PAGE_2M/MMU_PAGE_1G
> 

updated

>> +		 * path look at the psize.
>> +		 */
>> +		__ptep_set_access_flags(vma, pudp_ptep(pudp),
>> +					pud_pte(entry), address, MMU_PAGE_1G);
>> +	}
>> +	return changed;
>> +}
>> +
>> +
>>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>>  			      unsigned long address, pmd_t *pmdp)
>>  {
>>  	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
>>  }
>> +
>> +int pudp_test_and_clear_young(struct vm_area_struct *vma,
>> +			      unsigned long address, pud_t *pudp)
>> +{
>> +	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
>> +}
>> +
>>  /*
>>   * set a new huge pmd. We should not be called for updating
>>   * an existing pmd entry. That should go via pmd_hugepage_update.
>> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>>  	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
>>  }
>>  
>> +void set_pud_at(struct mm_struct *mm, unsigned long addr,
>> +		pud_t *pudp, pud_t pud)
>> +{
>> +#ifdef CONFIG_DEBUG_VM
>> +	/*
>> +	 * Make sure hardware valid bit is not set. We don't do
>> +	 * tlb flush for this update.
>> +	 */
>> +
>> +	WARN_ON(pte_hw_valid(pud_pte(*pudp)));
> 
> For set_pmd_at() we had
> WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
> 
> Could you please help me understand why is it different for set_pud_at()
> 
> 

There is no NUMA fault support for devdax pages, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +	WARN_ON(!(pud_large(pud)));
>> +#endif
>> +	trace_hugepage_set_pud(addr, pud_val(pud));
>> +	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
>> +}
>> +
>>  static void do_serialize(void *arg)
>>  {
>>  	/* We've taken the IPI, so try to trim the mask while here */
>> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>>  	return pmd;
>>  }
>>  
>> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
>> +				   unsigned long addr, pud_t *pudp, int full)
>> +{
>> +	pud_t pud;
>> +
>> +	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
>> +	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
>> +		  !pud_present(*pudp));
>> +	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
>> +	/*
>> +	 * if it not a fullmm flush, then we can possibly end up converting
>> +	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
>> +	 * Make sure we flush the tlb in this case.
>> +	 */
>> +	if (!full)
>> +		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
>> +	return pud;
>> +}
>> +
>>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>>  {
>>  	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
>>  }
>>  
>> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
>> +{
>> +	return __pud(pud_val(pud) | pgprot_val(pgprot));
>> +}
>> +
>>  /*
>>   * At some point we should be able to get rid of
>>   * pmd_mkhuge() and mk_huge_pmd() when we update all the
>> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>>  	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
>>  }
>>  
>> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
>> +{
>> +	unsigned long pudv;
>> +
>> +	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
>> +
>> +	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
>> +}
>> +
>>  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
>>  {
>>  	return pfn_pmd(page_to_pfn(page), pgprot);
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index 76f6a1f3b9d8..d7e2dd3d4add 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
>>  	return old;
>>  }
>>  
>> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
>> +					 pud_t *pudp, unsigned long clr,
>> +					 unsigned long set)
>> +{
>> +	unsigned long old;
>> +
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +#endif
>> +
>> +	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
>> +	trace_hugepage_update(addr, old, clr, set);
> 
> Here, we are using the same trace event for both pmd and pud update.
> See the comment in the end.
> 


updated

>> +
>> +	return old;
>> +}
>> +
>>  pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>>  			pmd_t *pmdp)
>>  
>> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>>  	return old_pmd;
>>  }
>>  
>> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
>> +				     unsigned long addr, pud_t *pudp)
>> +{
>> +	pud_t old_pud;
>> +	unsigned long old;
>> +
>> +	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
>> +	old_pud = __pud(old);
>> +	return old_pud;
>> +}
>> +
>>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>  
>>  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index ce804b7bf84e..a18f7d2c9f63 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  }
>>  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
>>  
>> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				unsigned long start, unsigned long end)
>> +{
>> +	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
>> +}
>> +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
>> +
>>  void radix__flush_tlb_all(void)
>>  {
>>  	unsigned long rb,prs,r,rs;
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
>> index 45fd975ef521..340b86ef7284 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64
>>  	select PPC_FPU
>>  	select PPC_HAVE_PMU_SUPPORT
>>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>  	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK
>>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
>> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
>> index 202b3e3e67ff..a919d943d106 100644
>> --- a/include/trace/events/thp.h
>> +++ b/include/trace/events/thp.h
>> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd,
>>  	    TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
>>  );
>>  
>> +TRACE_EVENT(hugepage_set_pud,
> 
> Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing
> the exact same thing, we can as well have a
> DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class.
> 
>> +
>> +	    TP_PROTO(unsigned long addr, unsigned long pud),
>> +	    TP_ARGS(addr, pud),
>> +	    TP_STRUCT__entry(
>> +		    __field(unsigned long, addr)
>> +		    __field(unsigned long, pud)
>> +			    ),
>> +
>> +	    TP_fast_assign(
>> +		    __entry->addr = addr;
>> +		    __entry->pud = pud;
>> +		    ),
>> +
>> +	    TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud)
>> +	);
>> +
>>  
>>  TRACE_EVENT(hugepage_update,
> 
> Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then
> DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud.

Updated. I created a patch that first switches the existing tracepoint to DECLARE_EVENT_CLASS.

-aneesh



WARNING: multiple messages have this Message-ID (diff)
From: Aneesh Kumar K V <aneesh.kumar@linux.ibm.com>
To: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	npiggin@gmail.com, christophe.leroy@csgroup.eu
Cc: Catalin Marinas <catalin.marinas@arm.com>,
	Muchun Song <muchun.song@linux.dev>,
	Dan Williams <dan.j.williams@intel.com>,
	Oscar Salvador <osalvador@suse.de>, Will Deacon <will@kernel.org>,
	Joao Martins <joao.m.martins@oracle.com>,
	Mike Kravetz <mike.kravetz@oracle.com>
Subject: Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage
Date: Wed, 28 Jun 2023 09:02:21 +0530	[thread overview]
Message-ID: <0d8633f2-69de-c178-fe03-49742b063962@linux.ibm.com> (raw)
In-Reply-To: <87ttusnzv5.fsf@doe.com>

On 6/28/23 6:53 AM, Ritesh Harjani (IBM) wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

....

>>  
>>  static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>>  	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
>>  }
>>  
>> +static inline pud_t radix__pud_mkdevmap(pud_t pud)
>> +{
>> +	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
>> +}
>> +
>> +struct vmem_altmap;
>> +struct dev_pagemap;
> 
> Minor nit.
> I guess this struct dev_pagemap is meant to be added in a later patch.

Moved that change to a later patch.

> 
>>  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>>  					     unsigned long page_size,
>>  					     unsigned long phys);
>> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
>> +				      int node, struct vmem_altmap *altmap);
>>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>>  				    unsigned long page_size);
>>  
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> index 77797a2a82eb..a38542259fab 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
>>  				      unsigned long end, int psize);
>>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  				       unsigned long start, unsigned long end);
>> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end);
>>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>>  			    unsigned long end);
>>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> index 0d0c1447ecf0..a01c20a8fbf7 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  		radix__flush_pmd_tlb_range(vma, start, end);
>>  }
>>  
>> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
>> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end)
>> +{
>> +	if (radix_enabled())
>> +		radix__flush_pud_tlb_range(vma, start, end);
>> +}
>> +
>>  #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
>>  static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
>>  					   unsigned long start,
>> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
>> index 85c84e89e3ea..9e5f01a1738c 100644
>> --- a/arch/powerpc/mm/book3s64/pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/pgtable.c
>> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>>  	return changed;
>>  }
>>  
>> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>> +			  pud_t *pudp, pud_t entry, int dirty)
>> +{
>> +	int changed;
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
> 
> just a query -
> for pmdp_set_access_flags() we had
> WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> 
> so why don't we require the same check here?
> 

Because we don't support generic anon PUD-level THP, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
>> +#endif
>> +	changed = !pud_same(*(pudp), entry);
>> +	if (changed) {
>> +		/*
>> +		 * We can use MMU_PAGE_2M here, because only radix
> 
> s/MMU_PAGE_2M/MMU_PAGE_1G
> 

updated

>> +		 * path look at the psize.
>> +		 */
>> +		__ptep_set_access_flags(vma, pudp_ptep(pudp),
>> +					pud_pte(entry), address, MMU_PAGE_1G);
>> +	}
>> +	return changed;
>> +}
>> +
>> +
>>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>>  			      unsigned long address, pmd_t *pmdp)
>>  {
>>  	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
>>  }
>> +
>> +int pudp_test_and_clear_young(struct vm_area_struct *vma,
>> +			      unsigned long address, pud_t *pudp)
>> +{
>> +	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
>> +}
>> +
>>  /*
>>   * set a new huge pmd. We should not be called for updating
>>   * an existing pmd entry. That should go via pmd_hugepage_update.
>> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>>  	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
>>  }
>>  
>> +void set_pud_at(struct mm_struct *mm, unsigned long addr,
>> +		pud_t *pudp, pud_t pud)
>> +{
>> +#ifdef CONFIG_DEBUG_VM
>> +	/*
>> +	 * Make sure hardware valid bit is not set. We don't do
>> +	 * tlb flush for this update.
>> +	 */
>> +
>> +	WARN_ON(pte_hw_valid(pud_pte(*pudp)));
> 
> For set_pmd_at() we had
> WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
> 
> Could you please help me understand why is it different for set_pud_at()
> 
> 

There is no NUMA fault support for devdax pages, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +	WARN_ON(!(pud_large(pud)));
>> +#endif
>> +	trace_hugepage_set_pud(addr, pud_val(pud));
>> +	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
>> +}
>> +
>>  static void do_serialize(void *arg)
>>  {
>>  	/* We've taken the IPI, so try to trim the mask while here */
>> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>>  	return pmd;
>>  }
>>  
>> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
>> +				   unsigned long addr, pud_t *pudp, int full)
>> +{
>> +	pud_t pud;
>> +
>> +	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
>> +	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
>> +		  !pud_present(*pudp));
>> +	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
>> +	/*
>> +	 * if it not a fullmm flush, then we can possibly end up converting
>> +	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
>> +	 * Make sure we flush the tlb in this case.
>> +	 */
>> +	if (!full)
>> +		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
>> +	return pud;
>> +}
>> +
>>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>>  {
>>  	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
>>  }
>>  
>> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
>> +{
>> +	return __pud(pud_val(pud) | pgprot_val(pgprot));
>> +}
>> +
>>  /*
>>   * At some point we should be able to get rid of
>>   * pmd_mkhuge() and mk_huge_pmd() when we update all the
>> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>>  	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
>>  }
>>  
>> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
>> +{
>> +	unsigned long pudv;
>> +
>> +	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
>> +
>> +	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
>> +}
>> +
>>  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
>>  {
>>  	return pfn_pmd(page_to_pfn(page), pgprot);
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index 76f6a1f3b9d8..d7e2dd3d4add 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
>>  	return old;
>>  }
>>  
>> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
>> +					 pud_t *pudp, unsigned long clr,
>> +					 unsigned long set)
>> +{
>> +	unsigned long old;
>> +
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +#endif
>> +
>> +	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
>> +	trace_hugepage_update(addr, old, clr, set);
> 
> Here, we are using the same trace event for both pmd and pud update.
> See the comment in the end.
> 


updated

>> +
>> +	return old;
>> +}
>> +
>>  pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>>  			pmd_t *pmdp)
>>  
>> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>>  	return old_pmd;
>>  }
>>  
>> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
>> +				     unsigned long addr, pud_t *pudp)
>> +{
>> +	pud_t old_pud;
>> +	unsigned long old;
>> +
>> +	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
>> +	old_pud = __pud(old);
>> +	return old_pud;
>> +}
>> +
>>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>  
>>  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index ce804b7bf84e..a18f7d2c9f63 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  }
>>  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
>>  
>> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				unsigned long start, unsigned long end)
>> +{
>> +	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
>> +}
>> +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
>> +
>>  void radix__flush_tlb_all(void)
>>  {
>>  	unsigned long rb,prs,r,rs;
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
>> index 45fd975ef521..340b86ef7284 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64
>>  	select PPC_FPU
>>  	select PPC_HAVE_PMU_SUPPORT
>>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>  	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK
>>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
>> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
>> index 202b3e3e67ff..a919d943d106 100644
>> --- a/include/trace/events/thp.h
>> +++ b/include/trace/events/thp.h
>> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd,
>>  	    TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
>>  );
>>  
>> +TRACE_EVENT(hugepage_set_pud,
> 
> Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing
> the exact same thing, we can as well have a
> DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class.
> 
>> +
>> +	    TP_PROTO(unsigned long addr, unsigned long pud),
>> +	    TP_ARGS(addr, pud),
>> +	    TP_STRUCT__entry(
>> +		    __field(unsigned long, addr)
>> +		    __field(unsigned long, pud)
>> +			    ),
>> +
>> +	    TP_fast_assign(
>> +		    __entry->addr = addr;
>> +		    __entry->pud = pud;
>> +		    ),
>> +
>> +	    TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud)
>> +	);
>> +
>>  
>>  TRACE_EVENT(hugepage_update,
> 
> Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then
> DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud.

Updated. I created a patch that first switches the existing tracepoint to DECLARE_EVENT_CLASS.

-aneesh


  reply	other threads:[~2023-06-28  3:32 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-16 11:08 [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Aneesh Kumar K.V
2023-06-16 11:08 ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 01/16] powerpc/mm/book3s64: Use pmdp_ptep helper instead of typecasting Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 02/16] powerpc/book3s64/mm: mmu_vmemmap_psize is used by radix Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 03/16] powerpc/book3s64/mm: Fix DirectMap stats in /proc/meminfo Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 04/16] powerpc/book3s64/mm: Use PAGE_KERNEL instead of opencoding Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 05/16] powerpc/mm/dax: Fix the condition when checking if altmap vmemap can cross-boundary Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 06/16] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 07/16] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 08/16] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-20 11:53   ` Joao Martins
2023-06-20 11:53     ` Joao Martins
2023-06-20 14:29     ` Aneesh Kumar K.V
2023-06-20 14:29       ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 09/16] mm/vmemmap: Allow architectures to override how vmemmap optimization works Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 10/16] mm: Add __HAVE_ARCH_PUD_SAME similar to __HAVE_ARCH_P4D_SAME Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 11/16] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 12/16] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:09   ` Ritesh Harjani
2023-06-28  1:09     ` Ritesh Harjani
2023-06-28  3:01     ` Aneesh Kumar K V
2023-06-28  3:01       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:23   ` Ritesh Harjani
2023-06-28  1:23     ` Ritesh Harjani
2023-06-28  3:32     ` Aneesh Kumar K V [this message]
2023-06-28  3:32       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-28  1:33   ` Ritesh Harjani
2023-06-28  1:33     ` Ritesh Harjani
2023-06-28  3:37     ` Aneesh Kumar K V
2023-06-28  3:37       ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 15/16] powerpc/book3s64/radix: Add support for vmemmap optimization for radix Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 16/16] powerpc/book3s64/radix: Remove mmu_vmemmap_psize Aneesh Kumar K.V
2023-06-16 11:08   ` Aneesh Kumar K.V
2023-06-18 11:54 ` [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Sachin Sant
2023-06-18 11:54   ` Sachin Sant
2023-06-24 14:52 ` Aneesh Kumar K.V
2023-06-24 14:52   ` Aneesh Kumar K.V
2023-06-24 17:22   ` Andrew Morton
2023-06-24 17:22     ` Andrew Morton
2023-07-03  5:26 ` (subset) " Michael Ellerman
2023-07-03  5:26   ` Michael Ellerman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0d8633f2-69de-c178-fe03-49742b063962@linux.ibm.com \
    --to=aneesh.kumar@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=catalin.marinas@arm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=dan.j.williams@intel.com \
    --cc=joao.m.martins@oracle.com \
    --cc=linux-mm@kvack.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mike.kravetz@oracle.com \
    --cc=mpe@ellerman.id.au \
    --cc=muchun.song@linux.dev \
    --cc=npiggin@gmail.com \
    --cc=osalvador@suse.de \
    --cc=ritesh.list@gmail.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.