path: root/k9vamps/tcmemcpy.cpp
author    tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2010-02-17 00:32:19 +0000
committer tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2010-02-17 00:32:19 +0000
commit    0d382a262c0638d0f572fc37193ccc5ed3dc895f (patch)
tree      8578dcddfce4191f3f7a142a37769df7add48475 /k9vamps/tcmemcpy.cpp
download  k9copy-0d382a262c0638d0f572fc37193ccc5ed3dc895f.tar.gz
          k9copy-0d382a262c0638d0f572fc37193ccc5ed3dc895f.zip
Added old abandoned version of k9copy
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/applications/k9copy@1091546 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'k9vamps/tcmemcpy.cpp')
-rw-r--r--  k9vamps/tcmemcpy.cpp  483
1 file changed, 483 insertions(+), 0 deletions(-)
diff --git a/k9vamps/tcmemcpy.cpp b/k9vamps/tcmemcpy.cpp
new file mode 100644
index 0000000..ceb7c69
--- /dev/null
+++ b/k9vamps/tcmemcpy.cpp
@@ -0,0 +1,483 @@
+/*
+ * tcmemcpy.c - optimized memcpy() routines for transcode
+ * Written by Andrew Church <achurch@achurch.org>
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include "ac.h"
+
+/*************************************************************************/
+
+#if defined(ARCH_X86)
+
+/* MMX-optimized routine, intended for PMMX/PII processors.
+ * Nonstandard instructions used:
+ * (CPUID.MMX) MOVQ
+ */
+
+void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\
+PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\
+# Use only half because writes may touch the cache too (PII) \n\
+PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
+ \n\
+ push %%ebx # Save PIC register \n\
+ push %%edi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ mov $64, %%ebx # Constant \n\
+ \n\
+ cmp %%ebx, %%ecx \n\
+ jb mmx.memcpy_last # Just use movs if <64 bytes \n\
+ \n\
+ # First align destination address to a multiple of 8 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax \n\
+ and $0b111, %%eax # ... which is the number of bytes to copy\n\
+ lea 0f, %%edx # Use a computed jump--faster than a loop\n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: sub %%eax, %%ecx # Update count \n\
+ \n\
+ # Now copy data in blocks \n\
+0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\
+ shr $6, %%edx \n\
+ jz mmx.memcpy_last # <64 bytes left? Skip to end \n\
+ cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
+ jb 1f # Limit size of block \n\
+ mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
+1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\
+ shl $6, %%eax \n\
+ sub %%eax, %%ecx # Update remaining count \n\
+ add %%eax, %%esi # Point to end of region to be block-copied\n\
+2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\
+ test %%eax, -64(%%esi) \n\
+ sub %%ebx, %%esi # Update pointer \n\
+ sub %%ebx, %%eax # And loop \n\
+ jnz 2b \n\
+ # Note that ESI now points to the beginning of the block \n\
+3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\
+ movq 8(%%esi), %%mm1 \n\
+ movq 16(%%esi), %%mm2 \n\
+ movq 24(%%esi), %%mm3 \n\
+ movq 32(%%esi), %%mm4 \n\
+ movq 40(%%esi), %%mm5 \n\
+ movq 48(%%esi), %%mm6 \n\
+ movq 56(%%esi), %%mm7 \n\
+ movq %%mm0, (%%edi) \n\
+ movq %%mm1, 8(%%edi) \n\
+ movq %%mm2, 16(%%edi) \n\
+ movq %%mm3, 24(%%edi) \n\
+ movq %%mm4, 32(%%edi) \n\
+ movq %%mm5, 40(%%edi) \n\
+ movq %%mm6, 48(%%edi) \n\
+ movq %%mm7, 56(%%edi) \n\
+ add %%ebx, %%esi # Update pointers \n\
+ add %%ebx, %%edi \n\
+ dec %%edx # And loop \n\
+ jnz 3b \n\
+ jmp 0b \n\
+ \n\
+mmx.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>2 \n\
+ shr $2, %%eax \n\
+ lea 0f, %%edx \n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-15 MOVSD's \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+0: and $0b11, %%ecx # ECX <- ECX & 3 \n\
+ lea 0f, %%edx \n\
+ sub %%ecx, %%edx \n\
+ jmp *%%edx # Execute 0-3 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: \n\
+ # All done! \n\
+ emms # Clean up MMX state \n\
+ pop %%edi # Restore destination (return value) \n\
+ pop %%ebx # Restore PIC register \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%eax", "%edx"
+ );
+ return dest;
+}
+
+#endif /* ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(ARCH_X86)
+
+/* SSE-optimized routine. Backported from AMD64 routine below.
+ * Nonstandard instructions used:
+ * (CPUID.CMOVE) CMOVA
+ * (CPUID.MMX) MOVQ
+ * (CPUID.SSE) MOVNTQ
+ */
+
+void *ac_memcpy_sse(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+ push %%ebx # Save PIC register \n\
+ push %%edi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ cmp $64, %%ecx # Skip block copy for small blocks \n\
+ jb sse.memcpy_last \n\
+ \n\
+ mov $128, %%ebx # Constant used later \n\
+ \n\
+ # First align destination address to a multiple of 8 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax \n\
+ and $0b111, %%eax # ... which is the number of bytes to copy\n\
+ lea 0f, %%edx # Use a computed jump--faster than a loop\n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: sub %%eax, %%ecx # Update count \n\
+ \n\
+ cmp $0x10040, %%ecx # Is this a large block? (0x10040 is an \n\
+ # arbitrary value where prefetching and \n\
+ # write combining seem to start becoming\n\
+ # faster) \n\
+ jae sse.memcpy_bp # Yup, use prefetch copy \n\
+ \n\
+sse.memcpy_small: # Small block copy routine--no prefetch \n"
+#if 0
+" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\
+ shr $3, %%edx \n\
+ mov %%edx, %%eax # Leave remainder in ECX for later \n\
+ shl $3, %%eax \n\
+ sub %%eax, %%ecx \n\
+ .align 16 \n\
+0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\
+ movq %%mm0, (%%edi) \n\
+ add $8, %%esi # Update pointers \n\
+ add $8, %%edi \n\
+ dec %%edx # And loop \n\
+ jg 0b \n\
+ jmp sse.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ nop # Align loops below \n"
+#else
+" # It appears that a simple rep movs is faster than cleverness \n\
+ # with movq... \n\
+ mov %%ecx, %%edx # EDX <- ECX & 3 \n\
+ and $0b11, %%edx \n\
+ shr $2, %%ecx # ECX <- ECX >> 2 \n\
+ rep movsl # Copy away! \n\
+ mov %%edx, %%ecx # Take care of last 0-3 bytes \n\
+ rep movsb \n\
+ jmp sse.memcpy_end # And exit \n\
+ \n\
+ .align 16 \n\
+ nop \n\
+ nop \n"
+#endif
+"sse.memcpy_bp: # Block prefetch copy routine \n\
+0: mov %%ecx, %%edx # EDX: temp counter \n\
+ shr $6, %%edx # Divide by cache line size (64 bytes) \n\
+ cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\
+ cmova %%ebx, %%edx \n\
+ shl $3, %%edx # EDX <- cache lines to copy * 8 \n\
+ mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\
+ # (also used as memory offset) \n\
+1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\
+ test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\
+ # (note that test %%eax,... seems to be faster than prefetchnta \n\
+ # on x86) \n\
+ sub $16, %%eax # And loop \n\
+ jg 1b \n\
+ \n\
+ # Then copy--forward, which seems to be faster than reverse for \n\
+ # certain alignments \n\
+ xor %%eax, %%eax \n\
+2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\
+ movntq %%mm0, (%%edi,%%eax,8) \n\
+ inc %%eax \n\
+ cmp %%edx, %%eax \n\
+ jb 2b \n\
+ \n\
+ # Finally, update pointers and count, and loop \n\
+ shl $3, %%edx # EDX <- bytes copied \n\
+ add %%edx, %%esi \n\
+ add %%edx, %%edi \n\
+ sub %%edx, %%ecx \n\
+ cmp $64, %%ecx # At least one cache line left? \n\
+ jae 0b # Yup, loop \n\
+ \n\
+sse.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>2 \n\
+ shr $2, %%eax \n\
+ lea 0f, %%edx \n\
+ sub %%eax, %%edx \n\
+ jmp *%%edx # Execute 0-15 MOVSD's \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+ movsd \n\
+0: and $0b11, %%ecx # ECX <- ECX & 3 \n\
+ lea sse.memcpy_end, %%edx \n\
+ sub %%ecx, %%edx \n\
+ jmp *%%edx # Execute 0-3 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ \n\
+sse.memcpy_end: \n\
+ # All done! \n\
+ emms # Clean up after MMX instructions \n\
+ sfence # Flush the write buffer \n\
+ pop %%edi # Restore destination (return value) \n\
+ pop %%ebx # Restore PIC register \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%eax", "%edx"
+ );
+ return dest;
+}
+
+#endif /* ARCH_X86 */
+
+/*************************************************************************/
+
+#if defined(ARCH_X86_64)
+
+/* AMD64-optimized routine, using SSE2. Derived from AMD64 optimization
+ * guide section 5.13: Appropriate Memory Copying Routines.
+ * Nonstandard instructions used:
+ * (CPUID.CMOVE) CMOVA
+ * (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ
+ *
+ * Note that this routine will also run more or less as-is (modulo register
+ * names and label(%%rip) references) on x86 CPUs, but tests have shown the
+ * SSE1 version above to be faster.
+ */
+
+/* The block copying code--macroized because we use two versions of it
+ * depending on whether the source is 16-byte-aligned or not. Pass either
+ * movdqa or movdqu (unquoted) for the parameter. */
+#define AMD64_BLOCK_MEMCPY(movdq) \
+" # First prefetch (note that if we end on an odd number of cache \n\
+ # lines, we skip prefetching the last one--faster that way than \n\
+ # prefetching line by line or treating it as a special case) \n\
+0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\
+ shr $6, %%edx # Divide by cache line size (64 bytes) \n\
+ cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\
+ cmova %%ebx, %%edx \n\
+ shl $3, %%edx # EDX <- cache lines to copy * 8 \n\
+ mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\
+ # (also used as memory offset) \n\
+1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\
+ prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\
+ sub $16, %%eax # And loop \n\
+ jg 1b \n\
+ \n\
+ # Then copy--forward, which seems to be faster than reverse for \n\
+ # certain alignments \n\
+ xor %%eax, %%eax \n\
+2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\
+ movntdq %%xmm0, (%%rdi,%%rax,8) \n\
+ add $2, %%eax \n\
+ cmp %%edx, %%eax \n\
+ jb 2b \n\
+ \n\
+ # Finally, update pointers and count, and loop \n\
+ shl $3, %%edx # EDX <- bytes copied \n\
+ add %%rdx, %%rsi \n\
+ add %%rdx, %%rdi \n\
+ sub %%rdx, %%rcx \n\
+ cmp $64, %%rcx # At least one cache line left? \n\
+ jae 0b # Yup, loop \n"
+
+void *ac_memcpy_amd64(void *dest, const void *src, size_t bytes)
+{
+ asm("\
+ push %%rdi # Save destination for return value \n\
+ cld # MOVS* should ascend \n\
+ \n\
+ cmp $64, %%rcx # Skip block copy for small blocks \n\
+ jb amd64.memcpy_last \n\
+ \n\
+ mov $128, %%ebx # Constant used later \n\
+ \n\
+ # First align destination address to a multiple of 16 bytes \n\
+ mov $8, %%eax # EAX <- (8-dest) & 7 \n\
+ sub %%edi, %%eax # (we don't care about the top 32 bits) \n\
+ and $0b111, %%eax # ... which is the number of bytes to copy\n\
+ lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\
+ sub %%rax, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: sub %%rax, %%rcx # Update count \n\
+ test $0b1000, %%edi # Is destination not 16-byte aligned? \n\
+ je 1f \n\
+ movsq # Then move 8 bytes to align it \n\
+ sub $8, %%rcx \n\
+ \n\
+1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\
+ # arbitrary value where prefetching and \n\
+ # write combining seem to start becoming\n\
+ # faster) \n\
+ jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\
+ test $0b1111, %%esi # Is source also 16-byte aligned? \n\
+ # (use ESI to save a REX prefix byte) \n\
+ jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\
+ jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\
+ \n\
+amd64.memcpy_small: # Small block copy routine--no prefetch \n\
+ mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\
+ shr $4, %%edx # (count known to fit in 32 bits) \n\
+ mov %%edx, %%eax # Leave remainder in ECX for later \n\
+ shl $4, %%eax \n\
+ sub %%eax, %%ecx \n\
+ .align 16 \n\
+0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\
+ movdqa %%xmm0, (%%rdi) \n\
+ add $16, %%rsi # Update pointers \n\
+ add $16, %%rdi \n\
+ dec %%edx # And loop \n\
+ jnz 0b \n\
+ jmp amd64.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ .align 16 \n\
+ nop \n\
+ nop \n\
+amd64.memcpy_fast_bp: # Fast block prefetch loop \n"
+AMD64_BLOCK_MEMCPY(movdqa)
+" jmp amd64.memcpy_last # Copy any remaining bytes \n\
+ \n\
+ .align 16 \n\
+ nop \n\
+ nop \n\
+amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n"
+AMD64_BLOCK_MEMCPY(movdqu)
+" \n\
+amd64.memcpy_last: \n\
+ # Copy last <64 bytes, using the computed jump trick \n\
+ mov %%ecx, %%eax # EAX <- ECX>>3 \n\
+ shr $3, %%eax \n\
+ lea 0f(%%rip), %%rdx \n\
+ add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! \n\
+ sub %%rax, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSQ's \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+ movsq \n\
+0: and $0b111, %%ecx # ECX <- ECX & 7 \n\
+ lea 0f(%%rip), %%rdx \n\
+ sub %%rcx, %%rdx \n\
+ jmp *%%rdx # Execute 0-7 MOVSB's \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+ movsb \n\
+0: \n\
+ # All done! \n\
+ emms # Clean up after MMX instructions \n\
+ sfence # Flush the write buffer \n\
+ pop %%rdi # Restore destination (return value) \n\
+ " : /* no outputs */
+ : "D" (dest), "S" (src), "c" (bytes)
+ : "%rax", "%rbx", "%rdx"
+ );
+ return dest;
+}
+
+#endif /* ARCH_X86_64 */
+
+/*************************************************************************/
+
+void * (*tc_memcpy)(void *, const void *, size_t) = memcpy;
+
+void tc_memcpy_init(int verbose, int mmflags)
+{
+ const char * method = "libc";
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ int accel = mmflags == -1 ? ac_mmflag() : mmflags;
+#endif
+
+#if defined(ARCH_X86)
+ if((accel & MM_CMOVE) && (accel & MM_SSE))
+ {
+ method = "sse";
+ tc_memcpy = ac_memcpy_sse;
+ }
+ else if(accel & MM_MMX)
+ {
+ method = "mmx";
+ tc_memcpy = ac_memcpy_mmx;
+ }
+#endif
+
+#if defined(ARCH_X86_64)
+ if((accel & MM_CMOVE) && (accel & MM_SSE2))
+ {
+ method = "amd64";
+ tc_memcpy = ac_memcpy_amd64;
+ }
+#endif
+
+ if(verbose)
+ fprintf(stderr, "tc_memcpy: using %s for memcpy\n", method);
+}