2014-06-17 12:58:27 +02:00
|
|
|
Fix for bnc #868622: memset is slow for large block sizes. Use non-temporal stores (movntdq) when the length is larger than the shared cache size.
|
|
|
|
|
|
|
|
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
|
|
|
index db4fb84..9c42018 100644
|
|
|
|
--- a/sysdeps/x86_64/memset.S
|
|
|
|
+++ b/sysdeps/x86_64/memset.S
|
|
|
|
@@ -84,6 +84,9 @@ L(loop_start):
|
2016-02-24 12:39:39 +01:00
|
|
|
movdqu %xmm0, -48(%rdi,%rdx)
|
|
|
|
movdqu %xmm0, 48(%rdi)
|
|
|
|
movdqu %xmm0, -64(%rdi,%rdx)
|
2014-06-17 12:58:27 +02:00
|
|
|
+ mov __x86_shared_cache_size(%rip),%r9d # The largest cache size; the 32-bit write to r9d zero-extends into r9
|
|
|
|
+ cmp %r9,%rdx # rdx is presumably the byte count at this point (confirm against function entry)
|
|
|
|
+ ja L(nt_move) # unsigned compare: rdx above the cache size takes the non-temporal store path
|
|
|
|
addq %rdi, %rdx
|
|
|
|
andq $-64, %rdx
|
|
|
|
cmpq %rdx, %rcx
|
|
|
|
@@ -99,6 +102,23 @@ L(loop):
|
|
|
|
jne L(loop)
|
|
|
|
rep
|
|
|
|
ret
|
|
|
|
+L(nt_move): # Non-temporal fill path; reached by the ja taken when rdx is above r9
|
|
|
|
+ addq %rdi, %rdx # rdx = rdi + rdx (presumably dst + length, i.e. the end address; confirm)
|
|
|
|
+ andq $-64, %rdx # round rdx down to a multiple of 64; this is the loop stop address
|
|
|
|
+ cmpq %rdx, %rcx # rcx is the store cursor, set up before this hunk (not visible here)
|
|
|
|
+ je L(return) # cursor already at the stop address: nothing left for the loop
|
|
|
|
+ .p2align 4 # align the loop entry to 16 bytes
|
|
|
|
+L(nt_loop): # each iteration writes 64 bytes from xmm0 at rcx with non-temporal stores
|
2016-02-24 12:39:39 +01:00
|
|
|
+ movntdq %xmm0, (%rcx) # movntdq needs a 16-byte aligned address; assumes rcx was aligned earlier (confirm)
|
|
|
|
+ movntdq %xmm0, 16(%rcx) # bytes 16..31 of the current 64-byte chunk
|
|
|
|
+ movntdq %xmm0, 32(%rcx) # bytes 32..47
|
|
|
|
+ movntdq %xmm0, 48(%rcx) # bytes 48..63
|
2014-06-17 12:58:27 +02:00
|
|
|
+ addq $64, %rcx # advance the cursor by one 64-byte chunk
|
|
|
|
+ cmpq %rcx, %rdx # equality test: relies on rcx and rdx both being multiples of 64
|
|
|
|
+ jne L(nt_loop) # keep going until the cursor reaches the stop address
|
|
|
|
+ sfence # non-temporal stores are weakly ordered; fence them before returning
|
|
|
|
+ rep # prefix for the ret below, same rep/ret pair as at the end of L(loop)
|
|
|
|
+ ret # presumably the two-byte return form used for branch prediction (confirm)
|
|
|
|
L(less_16_bytes):
|
2016-02-24 12:39:39 +01:00
|
|
|
movq %xmm0, %rcx
|
2014-06-17 12:58:27 +02:00
|
|
|
testb $24, %dl
|