From: Andrew Pinski <apinski at cavium.com>

Adding a check for the cache line size is not much overhead.
Special-case cache line sizes of 128 bytes or more. This improves
copy_page by 85% on ThunderX compared to the original implementation.
For LMBench, it improves performance by 4-10%.

Signed-off-by: Andrew Pinski <apinski at cavium.com>
Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev at caviumnetworks.com>
---
 arch/arm64/lib/copy_page.S | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 512b9a7..24c72f2 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -27,6 +27,12 @@
  *	x1 - src
  */
 ENTRY(copy_page)
+	/* Special case 128 byte or more cache lines */
+	mrs	x2, dczid_el0
+	and	w2, w2, #0xf
+	cmp	w2, 5
+	b.ge	2f
+
 	/* Assume cache line size is 64 bytes. */
 	prfm	pldl1strm, [x1, #64]
 1:	ldp	x2, x3, [x1]
@@ -40,6 +46,32 @@ ENTRY(copy_page)
 	stnp	x6, x7, [x0, #32]
 	stnp	x8, x9, [x0, #48]
 	add	x0, x0, #64
+	tst	x1, #(PAGE_SIZE - 1)
+	b.ne	1b
+	ret
+2:
+	/* The cache line size is at least 128 bytes. */
+	prfm	pldl1strm, [x1, #128]
+1:	prfm	pldl1strm, [x1, #256]
+	ldp	x2, x3, [x1]
+	ldp	x4, x5, [x1, #16]
+	ldp	x6, x7, [x1, #32]
+	ldp	x8, x9, [x1, #48]
+	stnp	x2, x3, [x0]
+	stnp	x4, x5, [x0, #16]
+	stnp	x6, x7, [x0, #32]
+	stnp	x8, x9, [x0, #48]
+
+	ldp	x2, x3, [x1, #64]
+	ldp	x4, x5, [x1, #80]
+	ldp	x6, x7, [x1, #96]
+	ldp	x8, x9, [x1, #112]
+	add	x1, x1, #128
+	stnp	x2, x3, [x0, #64]
+	stnp	x4, x5, [x0, #80]
+	stnp	x6, x7, [x0, #96]
+	stnp	x8, x9, [x0, #112]
+	add	x0, x0, #128
 	tst	x1, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
--
2.4.3
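
For reference on the dczid_el0 check above: DCZID_EL0.BS (bits [3:0])
encodes log2 of the DC ZVA block size in 4-byte words, so the block
size in bytes is 4 << BS, and BS >= 5 means 128 bytes or more; the
patch uses this as a proxy for the cache line size. Below is a minimal
userspace C sketch of the same decoding, illustrative only and not
part of the patch, assuming an AArch64 toolchain (DCZID_EL0 is
readable from EL0):

/*
 * Decode DCZID_EL0.BS the same way the mrs/and/cmp sequence in the
 * patch does: "cmp w2, 5; b.ge 2f" takes the 128-byte path exactly
 * when 4 << BS >= 128.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dczid;

	/* Same read the patch performs with "mrs x2, dczid_el0". */
	asm volatile("mrs %0, dczid_el0" : "=r" (dczid));

	unsigned int bs = dczid & 0xf;     /* log2 of block size in words */
	unsigned int bytes = 4u << bs;     /* block size in bytes */

	printf("DCZID_EL0.BS = %u -> %u-byte block, %s path\n",
	       bs, bytes, bs >= 5 ? "128-byte" : "64-byte");
	return 0;
}

On a part with 128-byte cache lines such as ThunderX, this would
report BS = 5 and select the unrolled 128-byte loop added at label 2.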