From: Andrew Pinski <apinski at cavium.com>

Adding a check for the cache line size is not much overhead.
Special-case cache line sizes of 128 bytes or more. This improves
copy_page by 85% on ThunderX compared to the original implementation.
For LMBench, it improves performance by 4-10%.

Signed-off-by: Andrew Pinski <apinski at cavium.com>
Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev at caviumnetworks.com>
---
 arch/arm64/lib/copy_page.S | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 512b9a7..24c72f2 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -27,6 +27,12 @@
  *	x1 - src
  */
 ENTRY(copy_page)
+	/* Special case 128 byte or more cache lines */
+	mrs	x2, dczid_el0
+	and	w2, w2, #0xf
+	cmp	w2, 5
+	b.ge	2f
+
 	/* Assume cache line size is 64 bytes. */
 	prfm	pldl1strm, [x1, #64]
 1:	ldp	x2, x3, [x1]
@@ -40,6 +46,32 @@ ENTRY(copy_page)
 	stnp	x6, x7, [x0, #32]
 	stnp	x8, x9, [x0, #48]
 	add	x0, x0, #64
+	tst	x1, #(PAGE_SIZE - 1)
+	b.ne	1b
+	ret
+2:
+	/* The cache line size is at least 128 bytes. */
+	prfm	pldl1strm, [x1, #128]
+1:	prfm	pldl1strm, [x1, #256]
+	ldp	x2, x3, [x1]
+	ldp	x4, x5, [x1, #16]
+	ldp	x6, x7, [x1, #32]
+	ldp	x8, x9, [x1, #48]
+	stnp	x2, x3, [x0]
+	stnp	x4, x5, [x0, #16]
+	stnp	x6, x7, [x0, #32]
+	stnp	x8, x9, [x0, #48]
+
+	ldp	x2, x3, [x1, #64]
+	ldp	x4, x5, [x1, #80]
+	ldp	x6, x7, [x1, #96]
+	ldp	x8, x9, [x1, #112]
+	add	x1, x1, #128
+	stnp	x2, x3, [x0, #64]
+	stnp	x4, x5, [x0, #80]
+	stnp	x6, x7, [x0, #96]
+	stnp	x8, x9, [x0, #112]
+	add	x0, x0, #128
 	tst	x1, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
--
2.4.3
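
For reference on the dczid_el0 check above: DCZID_EL0.BS (bits [3:0])
encodes log2 of the DC ZVA block size in 4-byte words, so the block
size in bytes is 4 << BS, and BS >= 5 means 128 bytes or more; the
patch uses this as a proxy for the cache line size. Below is a minimal
userspace C sketch of the same decoding, illustrative only and not
part of the patch, assuming an AArch64 toolchain (DCZID_EL0 is
readable from EL0):

/*
 * Decode DCZID_EL0.BS the same way the mrs/and/cmp sequence in the
 * patch does: "cmp w2, 5; b.ge 2f" takes the 128-byte path exactly
 * when 4 << BS >= 128.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dczid;

	/* Same read the patch performs with "mrs x2, dczid_el0". */
	asm volatile("mrs %0, dczid_el0" : "=r" (dczid));

	unsigned int bs = dczid & 0xf;     /* log2 of block size in words */
	unsigned int bytes = 4u << bs;     /* block size in bytes */

	printf("DCZID_EL0.BS = %u -> %u-byte block, %s path\n",
	       bs, bytes, bs >= 5 ? "128-byte" : "64-byte");
	return 0;
}

On a part with 128-byte cache lines such as ThunderX, this would
report BS = 5 and select the unrolled 128-byte loop added at label 2.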