Fix for bnc #868622, slow memset for large block sizes.

What this patch changes in sysdeps/x86_64/memset.S:

  * Hunk 1 (inside L(loop_start), after the unaligned movdqu stores that
    are addressed from %rdi and from %rdi+%rdx): loads 32 bits of
    __x86_shared_cache_size into %r9d and compares %rdx against %r9 with
    an unsigned "ja".  When %rdx is larger, control transfers to the new
    L(nt_move) path; otherwise execution falls through to the existing
    code unchanged.

  * Hunk 2 (after the return that follows L(loop)): adds L(nt_move) and
    L(nt_loop).  L(nt_move) repeats the end computation of the existing
    path (%rdx = (%rdi + %rdx) & -64) and branches to L(return) when
    %rcx already equals %rdx.  L(nt_loop) then writes 64 bytes per
    iteration from %xmm8 using four movntdq (non-temporal) stores and
    advances %rcx by 64 until it equals %rdx.  An sfence is issued
    before returning.

Review notes:

  * NOTE(review): %rdx is presumably still memset's byte count at the
    new compare (the context lines use it as an offset from %rdi) --
    confirm against the full file.
  * NOTE(review): only 32 bits of __x86_shared_cache_size are read; the
    write to %r9d zero-extends into %r9.  Confirm the variable's
    declared width if sizes of 4 GiB or more can occur.
  * NOTE(review): movntdq requires a 16-byte-aligned destination.  %rcx
    is set up outside these hunks -- confirm it is aligned there.
  * %r9 is newly written by this patch; it is a volatile (caller-saved)
    register under the System V AMD64 ABI.
  * NOTE(review): whitespace inside the hunk lines below could not be
    checked against the target file.  If the context fails to match,
    apply with "patch -l" (ignore whitespace differences).

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index db4fb84..9c42018 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -84,6 +84,9 @@ L(loop_start):
 	movdqu %xmm8, -48(%rdi,%rdx)
 	movdqu %xmm8, 48(%rdi)
 	movdqu %xmm8, -64(%rdi,%rdx)
+	mov __x86_shared_cache_size(%rip),%r9d # The largest cache size
+	cmp %r9,%rdx
+	ja L(nt_move)
 	addq %rdi, %rdx
 	andq $-64, %rdx
 	cmpq %rdx, %rcx
@@ -99,6 +102,23 @@ L(loop):
 	jne L(loop)
 	rep
 	ret
+L(nt_move):
+	addq %rdi, %rdx
+	andq $-64, %rdx
+	cmpq %rdx, %rcx
+	je L(return)
+	.p2align 4
+L(nt_loop):
+	movntdq %xmm8, (%rcx)
+	movntdq %xmm8, 16(%rcx)
+	movntdq %xmm8, 32(%rcx)
+	movntdq %xmm8, 48(%rcx)
+	addq $64, %rcx
+	cmpq %rcx, %rdx
+	jne L(nt_loop)
+	sfence
+	rep
+	ret
 L(less_16_bytes):
 	movq %xmm8, %rcx
 	testb $24, %dl