author      tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>   2010-02-17 00:32:19 +0000
committer   tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>   2010-02-17 00:32:19 +0000
commit      0d382a262c0638d0f572fc37193ccc5ed3dc895f (patch)
tree        8578dcddfce4191f3f7a142a37769df7add48475 /k9vamps/tcmemcpy.cpp
Added old abandoned version of k9copy
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/applications/k9copy@1091546 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'k9vamps/tcmemcpy.cpp')
-rw-r--r--   k9vamps/tcmemcpy.cpp   483
1 file changed, 483 insertions, 0 deletions
diff --git a/k9vamps/tcmemcpy.cpp b/k9vamps/tcmemcpy.cpp new file mode 100644 index 0000000..ceb7c69 --- /dev/null +++ b/k9vamps/tcmemcpy.cpp @@ -0,0 +1,483 @@ +/* + * tcmemcpy.c - optimized memcpy() routines for transcode + * Written by Andrew Church <achurch@achurch.org> + */ + +#include <string.h> +#include <stdio.h> +#include "ac.h" + +/*************************************************************************/ + +#if defined(ARCH_X86) + +/* MMX-optimized routine, intended for PMMX/PII processors. + * Nonstandard instructions used: + * (CPUID.MMX) MOVQ + */ + +void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes) +{ + asm("\ +PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\ +PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\ +# Use only half because writes may touch the cache too (PII) \n\ +PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\ + \n\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + mov $64, %%ebx # Constant \n\ + \n\ + cmp %%ebx, %%ecx \n\ + jb mmx.memcpy_last # Just use movs if <64 bytes \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $0b111, %%eax # ... which is the number of bytes to copy\n\ + lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: sub %%eax, %%ecx # Update count \n\ + \n\ + # Now copy data in blocks \n\ +0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\ + shr $6, %%edx \n\ + jz mmx.memcpy_last # <64 bytes left? Skip to end \n\ + cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\ + jb 1f # Limit size of block \n\ + mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\ +1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\ + shl $6, %%eax \n\ + sub %%eax, %%ecx # Update remaining count \n\ + add %%eax, %%esi # Point to end of region to be block-copied\n\ +2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\ + test %%eax, -64(%%esi) \n\ + sub %%ebx, %%esi # Update pointer \n\ + sub %%ebx, %%eax # And loop \n\ + jnz 2b \n\ + # Note that ESI now points to the beginning of the block \n\ +3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\ + movq 8(%%esi), %%mm1 \n\ + movq 16(%%esi), %%mm2 \n\ + movq 24(%%esi), %%mm3 \n\ + movq 32(%%esi), %%mm4 \n\ + movq 40(%%esi), %%mm5 \n\ + movq 48(%%esi), %%mm6 \n\ + movq 56(%%esi), %%mm7 \n\ + movq %%mm0, (%%edi) \n\ + movq %%mm1, 8(%%edi) \n\ + movq %%mm2, 16(%%edi) \n\ + movq %%mm3, 24(%%edi) \n\ + movq %%mm4, 32(%%edi) \n\ + movq %%mm5, 40(%%edi) \n\ + movq %%mm6, 48(%%edi) \n\ + movq %%mm7, 56(%%edi) \n\ + add %%ebx, %%esi # Update pointers \n\ + add %%ebx, %%edi \n\ + dec %%edx # And loop \n\ + jnz 3b \n\ + jmp 0b \n\ + \n\ +mmx.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n\ + lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ +0: and $0b11, %%ecx # ECX <- ECX & 3 \n\ + lea 0f, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: \n\ + # All done! 
\n\ + emms # Clean up MMX state \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* ARCH_X86 */ + +/*************************************************************************/ + +#if defined(ARCH_X86) + +/* SSE-optimized routine. Backported from AMD64 routine below. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.MMX) MOVQ + * (CPUID.SSE) MOVNTQ + */ + +void *ac_memcpy_sse(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%ebx # Save PIC register \n\ + push %%edi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%ecx # Skip block copy for small blocks \n\ + jb sse.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 8 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax \n\ + and $0b111, %%eax # ... which is the number of bytes to copy\n\ + lea 0f, %%edx # Use a computed jump--faster than a loop\n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: sub %%eax, %%ecx # Update count \n\ + \n\ + cmp $0x10040, %%ecx # Is this a large block? (0x10040 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jae sse.memcpy_bp # Yup, use prefetch copy \n\ + \n\ +sse.memcpy_small: # Small block copy routine--no prefetch \n" +#if 0 +" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\ + shr $3, %%edx \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $3, %%eax \n\ + sub %%eax, %%ecx \n\ + .align 16 \n\ +0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\ + movq %%mm0, (%%edi) \n\ + add $8, %%esi # Update pointers \n\ + add $8, %%edi \n\ + dec %%edx # And loop \n\ + jg 0b \n\ + jmp sse.memcpy_last # Copy any remaining bytes \n\ + \n\ + nop # Align loops below \n" +#else +" # It appears that a simple rep movs is faster than cleverness \n\ + # with movq... \n\ + mov %%ecx, %%edx # EDX <- ECX & 3 \n\ + and $0b11, %%edx \n\ + shr $2, %%ecx # ECX <- ECX >> 2 \n\ + rep movsl # Copy away! \n\ + mov %%edx, %%ecx # Take care of last 0-3 bytes \n\ + rep movsb \n\ + jmp sse.memcpy_end # And exit \n\ + \n\ + .align 16 \n\ + nop \n\ + nop \n" +#endif +"sse.memcpy_bp: # Block prefetch copy routine \n\ +0: mov %%ecx, %%edx # EDX: temp counter \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\ + test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\ + # (note that test %%eax,... 
seems to be faster than prefetchnta \n\ + # on x86) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\ + movntq %%mm0, (%%edi,%%eax,8) \n\ + inc %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%edx, %%esi \n\ + add %%edx, %%edi \n\ + sub %%edx, %%ecx \n\ + cmp $64, %%ecx # At least one cache line left? \n\ + jae 0b # Yup, loop \n\ + \n\ +sse.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>2 \n\ + shr $2, %%eax \n\ + lea 0f, %%edx \n\ + sub %%eax, %%edx \n\ + jmp *%%edx # Execute 0-15 MOVSD's \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ + movsd \n\ +0: and $0b11, %%ecx # ECX <- ECX & 3 \n\ + lea sse.memcpy_end, %%edx \n\ + sub %%ecx, %%edx \n\ + jmp *%%edx # Execute 0-3 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + \n\ +sse.memcpy_end: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%edi # Restore destination (return value) \n\ + pop %%ebx # Restore PIC register \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%eax", "%edx" + ); + return dest; +} + +#endif /* ARCH_X86 */ + +/*************************************************************************/ + +#if defined(ARCH_X86_64) + +/* AMD64-optimized routine, using SSE2. Derived from AMD64 optimization + * guide section 5.13: Appropriate Memory Copying Routines. + * Nonstandard instructions used: + * (CPUID.CMOVE) CMOVA + * (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ + * + * Note that this routine will also run more or less as-is (modulo register + * names and label(%%rip) references) on x86 CPUs, but tests have shown the + * SSE1 version above to be faster. + */ + +/* The block copying code--macroized because we use two versions of it + * depending on whether the source is 16-byte-aligned or not. Pass either + * movdqa or movdqu (unquoted) for the parameter. */ +#define AMD64_BLOCK_MEMCPY(movdq) \ +" # First prefetch (note that if we end on an odd number of cache \n\ + # lines, we skip prefetching the last one--faster that way than \n\ + # prefetching line by line or treating it as a special case) \n\ +0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\ + shr $6, %%edx # Divide by cache line size (64 bytes) \n\ + cmp %%ebx, %%edx # ... 
and cap at 128 (8192 bytes) \n\ + cmova %%ebx, %%edx \n\ + shl $3, %%edx # EDX <- cache lines to copy * 8 \n\ + mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\ + # (also used as memory offset) \n\ +1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\ + prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\ + sub $16, %%eax # And loop \n\ + jg 1b \n\ + \n\ + # Then copy--forward, which seems to be faster than reverse for \n\ + # certain alignments \n\ + xor %%eax, %%eax \n\ +2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\ + movntdq %%xmm0, (%%rdi,%%rax,8) \n\ + add $2, %%eax \n\ + cmp %%edx, %%eax \n\ + jb 2b \n\ + \n\ + # Finally, update pointers and count, and loop \n\ + shl $3, %%edx # EDX <- bytes copied \n\ + add %%rdx, %%rsi \n\ + add %%rdx, %%rdi \n\ + sub %%rdx, %%rcx \n\ + cmp $64, %%rcx # At least one cache line left? \n\ + jae 0b # Yup, loop \n" + +void *ac_memcpy_amd64(void *dest, const void *src, size_t bytes) +{ + asm("\ + push %%rdi # Save destination for return value \n\ + cld # MOVS* should ascend \n\ + \n\ + cmp $64, %%rcx # Skip block copy for small blocks \n\ + jb amd64.memcpy_last \n\ + \n\ + mov $128, %%ebx # Constant used later \n\ + \n\ + # First align destination address to a multiple of 16 bytes \n\ + mov $8, %%eax # EAX <- (8-dest) & 7 \n\ + sub %%edi, %%eax # (we don't care about the top 32 bits) \n\ + and $0b111, %%eax # ... which is the number of bytes to copy\n\ + lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: sub %%rax, %%rcx # Update count \n\ + test $0b1000, %%edi # Is destination not 16-byte aligned? \n\ + je 1f \n\ + movsq # Then move 8 bytes to align it \n\ + sub $8, %%rcx \n\ + \n\ +1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\ + # arbitrary value where prefetching and \n\ + # write combining seem to start becoming\n\ + # faster) \n\ + jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\ + test $0b1111, %%esi # Is source also 16-byte aligned? \n\ + # (use ESI to save a REX prefix byte) \n\ + jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\ + jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\ + \n\ +amd64.memcpy_small: # Small block copy routine--no prefetch \n\ + mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\ + shr $4, %%edx # (count known to fit in 32 bits) \n\ + mov %%edx, %%eax # Leave remainder in ECX for later \n\ + shl $4, %%eax \n\ + sub %%eax, %%ecx \n\ + .align 16 \n\ +0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\ + movdqa %%xmm0, (%%rdi) \n\ + add $16, %%rsi # Update pointers \n\ + add $16, %%rdi \n\ + dec %%edx # And loop \n\ + jnz 0b \n\ + jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .align 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_fast_bp: # Fast block prefetch loop \n" +AMD64_BLOCK_MEMCPY(movdqa) +" jmp amd64.memcpy_last # Copy any remaining bytes \n\ + \n\ + .align 16 \n\ + nop \n\ + nop \n\ +amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n" +AMD64_BLOCK_MEMCPY(movdqu) +" \n\ +amd64.memcpy_last: \n\ + # Copy last <64 bytes, using the computed jump trick \n\ + mov %%ecx, %%eax # EAX <- ECX>>3 \n\ + shr $3, %%eax \n\ + lea 0f(%%rip), %%rdx \n\ + add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! 
\n\ + sub %%rax, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSQ's \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ + movsq \n\ +0: and $0b111, %%ecx # ECX <- ECX & 7 \n\ + lea 0f(%%rip), %%rdx \n\ + sub %%rcx, %%rdx \n\ + jmp *%%rdx # Execute 0-7 MOVSB's \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ + movsb \n\ +0: \n\ + # All done! \n\ + emms # Clean up after MMX instructions \n\ + sfence # Flush the write buffer \n\ + pop %%rdi # Restore destination (return value) \n\ + " : /* no outputs */ + : "D" (dest), "S" (src), "c" (bytes) + : "%rax", "%rbx", "%rdx" + ); + return dest; +} + +#endif /* ARCH_X86_64 */ + +/*************************************************************************/ + +void * (*tc_memcpy)(void *, const void *, size_t) = memcpy; + +void tc_memcpy_init(int verbose, int mmflags) +{ + const char * method = "libc"; + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + int accel = mmflags == -1 ? ac_mmflag() : mmflags; +#endif + +#if defined(ARCH_X86) + if((accel & MM_CMOVE) && (accel & MM_SSE)) + { + method = "sse"; + tc_memcpy = ac_memcpy_sse; + } + else if(accel & MM_MMX) + { + method = "mmx"; + tc_memcpy = ac_memcpy_mmx; + } +#endif + +#if defined(ARCH_X86_64) + if((accel & MM_CMOVE) && (accel & MM_SSE2)) + { + method = "amd64"; + tc_memcpy = ac_memcpy_amd64; + } +#endif + + if(verbose) + fprintf(stderr, "tc_memcpy: using %s for memcpy\n", method); +} |
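
The three routines in this patch share one outline: copy 0-7 bytes so the destination becomes 8-byte (or 16-byte) aligned, move the bulk of the data in wide unrolled blocks, then finish the last few bytes. The portable C sketch below is not part of the patch (sketch_memcpy is a made-up name); it only illustrates the structure the assembly implements:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Head/bulk/tail structure of the asm routines above, in portable C. */
static void *sketch_memcpy(void *dest, const void *src, size_t bytes)
{
    unsigned char *d = dest;
    const unsigned char *s = src;

    /* Head: 0-7 bytes so that d becomes 8-byte aligned; this mirrors
     * the "(8 - dest) & 7" computation in the asm. */
    size_t head = (size_t)(-(uintptr_t)d & 7);
    if (head > bytes)
        head = bytes;
    bytes -= head;
    while (head--)
        *d++ = *s++;

    /* Bulk: 64 bytes per iteration, standing in for the unrolled
     * MOVQ/MOVDQA/MOVNTQ register loops. */
    while (bytes >= 64) {
        memcpy(d, s, 64);
        d += 64;
        s += 64;
        bytes -= 64;
    }

    /* Tail: last 0-63 bytes (the asm uses a computed jump here). */
    while (bytes--)
        *d++ = *s++;

    return dest;
}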
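
The "computed jump" used for those tails (jumping into the middle of a run of MOVSB/MOVSD/MOVSQ instructions so that exactly the right number execute) has a close C analogue: a switch whose cases fall through, as in Duff's device. A hypothetical copy_tail helper, purely for illustration:

#include <stddef.h>

/* Copies n & 7 (0-7) bytes. Every case falls through, so entering at
 * case n performs exactly n single-byte copies -- the C counterpart of
 * jumping into a run of MOVSB instructions. */
static void copy_tail(unsigned char *d, const unsigned char *s, size_t n)
{
    switch (n & 7) {
    case 7: *d++ = *s++; /* fall through */
    case 6: *d++ = *s++; /* fall through */
    case 5: *d++ = *s++; /* fall through */
    case 4: *d++ = *s++; /* fall through */
    case 3: *d++ = *s++; /* fall through */
    case 2: *d++ = *s++; /* fall through */
    case 1: *d++ = *s++; /* fall through */
    default: break;
    }
}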
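
The block-prefetch loops (the reverse-order "test %%eax, -64(%%esi)" reads in the SSE path and the prefetchnta pairs in the AMD64 path) pull a whole block of the source into cache before any stores are issued, rather than interleaving loads and stores line by line. A rough, asm-free C illustration of the same idea, assuming 64-byte cache lines and the 8 KB block cap used by the SSE and AMD64 paths:

#include <stddef.h>
#include <string.h>

#define LINE  64            /* assumed cache line size */
#define BLOCK (LINE * 128)  /* 8 KB prefetch/copy block, as in the asm */

/* Sketch only: touch one byte per cache line of the next block (back to
 * front, like the asm), then copy the block while it is warm in cache. */
static void block_prefetch_copy(unsigned char *d, const unsigned char *s, size_t n)
{
    while (n >= LINE) {
        size_t block = n < BLOCK ? (n / LINE) * LINE : BLOCK;

        /* Preload pass: one read per cache line, in reverse order. */
        for (size_t off = block; off >= LINE; off -= LINE)
            (void)*(volatile const unsigned char *)(s + off - LINE);

        memcpy(d, s, block);  /* stands in for the MOVNTQ/MOVNTDQ store loop */
        d += block;
        s += block;
        n -= block;
    }
    while (n--)               /* remaining 0-63 bytes */
        *d++ = *s++;
}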
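
Finally, tc_memcpy_init() selects an implementation once, based on the MM_* capability flags, and everything afterwards goes through the tc_memcpy function pointer. A hypothetical caller (assuming a header such as "ac.h" declares tc_memcpy and tc_memcpy_init, which this file only defines) would look roughly like this:

#include <string.h>
#include "ac.h"   /* assumed to declare tc_memcpy and tc_memcpy_init */

int main(void)
{
    static char src[4096], dst[4096];

    memset(src, 0x55, sizeof src);

    /* verbose = 1 prints the chosen method to stderr; mmflags = -1 means
     * "probe the CPU via ac_mmflag()", as in the dispatcher above. */
    tc_memcpy_init(1, -1);

    tc_memcpy(dst, src, sizeof src);
    return memcmp(dst, src, sizeof src) != 0;
}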