From e2de64d6f1beb9e492daf5b886e19933c1fa41dd Mon Sep 17 00:00:00 2001 From: toma Date: Wed, 25 Nov 2009 17:56:58 +0000 Subject: Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdemultimedia@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da --- mpg123_artsplugin/mpg123/decode_3dnow.s | 279 ++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 mpg123_artsplugin/mpg123/decode_3dnow.s (limited to 'mpg123_artsplugin/mpg123/decode_3dnow.s') diff --git a/mpg123_artsplugin/mpg123/decode_3dnow.s b/mpg123_artsplugin/mpg123/decode_3dnow.s new file mode 100644 index 00000000..fd39429a --- /dev/null +++ b/mpg123_artsplugin/mpg123/decode_3dnow.s @@ -0,0 +1,279 @@ +# +# decode_3dnow.s - 3DNow! optimized synth_1to1() +# +# This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama; +# only three types of changes have been made: +# +# - removed the PREFETCH instruction for speed +# - changed the function name to support automatic 3DNow! detection +# - moved femms to before 'call dct64_3dnow' +# +# You can find Kashiyama's original 3DNow! support patch +# (for mpg123-0.59o) at +# http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). +# +# by KIMURA Takuhiro - until 31.Mar.1999 +# - after 1.Apr.1999 +# + +##/ +##/ Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support +##/ +##/ Syuuhei Kashiyama +##/ +##/ The author of this program disclaim whole expressed or implied +##/ warranties with regard to this program, and in no event shall the +##/ author of this program liable to whatever resulted from the use of +##/ this program. Use it at your own risk. 
##/

#-----------------------------------------------------------------------
# File-private storage
#
#   buffs.40  4352 zero-initialised bytes, 32-byte aligned.  Used below as
#             two halves of 2176 bytes each; inside a half, a second
#             sub-buffer starts 1088 bytes in.
#   bo.42     32-bit counter, initial value 1.  It is decremented and
#             masked to 0..15 on every call whose second argument is zero.
#-----------------------------------------------------------------------
	.local	buffs.40
	.comm	buffs.40,4352,32
.data
	.align 4
	.type	bo.42,@object
	.size	bo.42,4
bo.42:
	.long 1

#-----------------------------------------------------------------------
# synth_1to1_3dnow
#
# Syntax: GNU as, AT&T operand order.  Target: 32-bit x86 with 3DNow!.
# ABI:    arguments on the stack, caller pops them (routine ends in a
#         plain `ret`).
#
# Stack layout after the prologue (24 bytes of locals + 4 saved regs):
#    0(%esp)  saved %ebx
#    4(%esp)  saved %esi
#    8(%esp)  saved %edi
#   12(%esp)  saved %ebp
#   16(%esp)  local: current output address
#   20(%esp)  local: byte offset subtracted from decwin+64 (= 4 * value
#             kept in 36(%esp))
#   36(%esp)  local: index later used as `leal -128(%edx,%esi,8)`
#   40(%esp)  return address
#   44(%esp)  arg1: pointer handed unchanged to do_equalizer_3dnow and
#             dct64_3dnow (presumably the sub-band sample pointer of the
#             C synth_1to1() -- TODO confirm against the C prototype)
#   48(%esp)  arg2: zero selects the first half of buffs.40 and advances
#             bo.42; non-zero selects the second half and shifts the
#             output address by 2 bytes (presumably the channel number)
#   52(%esp)  arg3: output base address
#   56(%esp)  arg4: pointer to a 32-bit byte offset; *arg4 is added to
#             arg3 on entry and increased by 128 on exit
#
# Output: 32 signed 16-bit values, written at a 4-byte stride starting at
#         arg3 + *arg4 (+2 when arg2 != 0).
# Return: %eax = %ebp.  NOTE(review): %ebp is incremented unconditionally
#         once per stored value and is reloaded with 15 before the second
#         loop, so the returned value is always 30 regardless of the
#         input -- it is not a count of saturated samples.
# Saved/restored: %ebx, %esi, %edi, %ebp.
# Clobbered: %eax, %ecx, %edx, flags, %mm0-%mm6 (MMX state is cleared
#         with femms before returning).
# External symbols: param, decwin, do_equalizer_3dnow, dct64_3dnow.
#         %ebx and %esi are live across the dct64_3dnow call, so that
#         routine must preserve them.
#-----------------------------------------------------------------------
.text
.globl synth_1to1_3dnow
	.type	synth_1to1_3dnow,@function
synth_1to1_3dnow:
	subl $24,%esp
	pushl %ebp
	pushl %edi
	xorl %ebp,%ebp			# returned counter starts at 0
	pushl %esi
	pushl %ebx
	movl 56(%esp),%esi		# esi = arg4
	movl 52(%esp),%edi		# edi = arg3
	movl 0(%esi),%esi		# esi = *arg4
	movl 48(%esp),%ebx		# ebx = arg2
	addl %edi,%esi
	movl %esi,16(%esp)		# output address = arg3 + *arg4

	femms

	# fixed by Takuhiro
	# param+348 is a hard-coded offset into the external `param` object;
	# presumably the equalizer-enable field -- TODO confirm against the
	# C struct layout.
	cmpl $0,param+348
	je .L25
	pushl %ebx			# 2nd argument: arg2
	pushl 48(%esp)			# 1st argument: arg1 (44(%esp) before the push above)
	call do_equalizer_3dnow
	addl $8,%esp
.L25:
	testl %ebx,%ebx
	jne .L26
	decl bo.42			# arg2 == 0: bo = (bo - 1) & 15
	movl $buffs.40,%ecx		# ecx = first half of buffs.40
	andl $15,bo.42
	jmp .L27
.L26:
	addl $2,16(%esp)		# arg2 != 0: output shifted by one 16-bit slot
	movl $buffs.40+2176,%ecx	# ecx = second half of buffs.40
.L27:
	movl bo.42,%edx
	testb $1,%dl
	je .L28
	# bo odd: dct64_3dnow(ecx + 1088 + 4*((bo+1)&15), ecx + 4*bo, arg1);
	# the dot products below then read from ecx (kept in %ebx).
	movl %edx,36(%esp)		# 36(%esp) = bo
	movl %ecx,%ebx
	movl 44(%esp),%esi		# esi = arg1
	movl %edx,%edi
	pushl %esi
	sall $2,%edi
	movl %ebx,%eax
	movl %edi,24(%esp)		# one push is outstanding: this is local 20(%esp) = 4*bo
	addl %edi,%eax
	pushl %eax
	movl %edx,%eax
	incl %eax
	andl $15,%eax
	leal 1088(,%eax,4),%eax
	addl %ebx,%eax
	pushl %eax
	call dct64_3dnow
	addl $12,%esp
	jmp .L29
.L28:
	# bo even: dct64_3dnow(ecx + 4*bo, ecx + 1088 + 4*(bo+1), arg1);
	# the dot products below then read from ecx + 1088 (kept in %ebx).
	leal 1(%edx),%esi
	movl 44(%esp),%edi		# edi = arg1
	movl %esi,36(%esp)		# 36(%esp) = bo + 1
	leal 1092(%ecx,%edx,4),%eax
	pushl %edi
	leal 1088(%ecx),%ebx
	pushl %eax
	sall $2,%esi			# esi = 4*(bo+1); relies on dct64_3dnow preserving %esi
	leal (%ecx,%edx,4),%eax
	pushl %eax
	call dct64_3dnow
	addl $12,%esp
	movl %esi,20(%esp)
.L29:
	movl $decwin+64,%edx
	movl $16,%ecx			# first loop: 16 output values
	subl 20(%esp),%edx		# edx = decwin + 64 - 4 * (index in 36(%esp))
	movl 16(%esp),%edi		# edi = output address

	# Preload the first operand pair; each iteration preloads the pair for
	# the next one (software pipelining), so instruction order matters.
	movq (%edx),%mm0
	movq (%ebx),%mm1
	.align 32
.L33:
	# One output value per iteration: multiply the 16 floats at (%edx)
	# pairwise with the 16 floats at (%ebx), accumulate per lane, then
	# take (low lane - high lane).
	movq 8(%edx),%mm3
	pfmul %mm1,%mm0
	movq 8(%ebx),%mm4
	movq 16(%edx),%mm5
	pfmul %mm4,%mm3
	movq 16(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 24(%edx),%mm1
	pfmul %mm6,%mm5
	movq 24(%ebx),%mm2
	pfadd %mm5,%mm0
	movq 32(%edx),%mm3
	pfmul %mm2,%mm1
	movq 32(%ebx),%mm4
	pfadd %mm1,%mm0
	movq 40(%edx),%mm5
	pfmul %mm4,%mm3
	movq 40(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 48(%edx),%mm1
	pfmul %mm6,%mm5
	movq 48(%ebx),%mm2
	pfadd %mm0,%mm5
	movq 56(%edx),%mm3
	pfmul %mm1,%mm2
	movq 56(%ebx),%mm4
	pfadd %mm5,%mm2
	addl $64,%ebx			# next 16 floats of the buffer
	subl $-128,%edx			# edx += 128 (imm8 encoding of +128)
	movq (%edx),%mm0		# preload for the next iteration
	pfmul %mm4,%mm3
	movq (%ebx),%mm1
	pfadd %mm3,%mm2
	movq %mm2,%mm3
	psrlq $32,%mm3			# mm3.low = high lane of the sum
	pfsub %mm3,%mm2			# low lane - high lane
	incl %ebp
	pf2id %mm2,%mm2			# float -> int32
	packssdw %mm2,%mm2		# saturate to signed 16 bit
	movd %mm2,%eax
	movw %ax,0(%edi)
	addl $4,%edi			# 4-byte stride between stored 16-bit values
	decl %ecx
	jnz .L33

	# Value 17: only every other float (byte offsets 0, 8, ..., 56) of
	# both arrays is used; pfacc adds the two lanes together.
	movd (%ebx),%mm0
	movd (%edx),%mm1
	punpckldq 8(%ebx),%mm0
	punpckldq 8(%edx),%mm1
	movd 16(%ebx),%mm3
	movd 16(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 24(%ebx),%mm3
	punpckldq 24(%edx),%mm4
	movd 32(%ebx),%mm5
	movd 32(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 40(%ebx),%mm5
	punpckldq 40(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd 48(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 56(%ebx),%mm1
	punpckldq 56(%edx),%mm2
	pfadd %mm5,%mm0
	pfmul %mm2,%mm1
	pfadd %mm1,%mm0
	pfacc %mm1,%mm0
	pf2id %mm0,%mm0
	packssdw %mm0,%mm0
	movd %mm0,%eax
	movw %ax,0(%edi)
	incl %ebp			# dead: %ebp is overwritten two instructions below
	movl 36(%esp),%esi
	addl $-64,%ebx			# buffer now walks backwards, 64 bytes per value
	movl $15,%ebp			# NOTE(review): discards the count accumulated so far
	addl $4,%edi
	leal -128(%edx,%esi,8),%edx	# edx = edx - 128 + 8 * (index in 36(%esp))

	movl $15,%ecx			# second loop: 15 output values
	movd (%ebx),%mm0
	movd -4(%edx),%mm1
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	.align 32
.L46:
	# One output value per iteration: buffer floats at 0..60(%ebx) are
	# paired with floats read backwards at -4..-60(%edx) and, for the
	# last one, 0(%edx).  The sum is negated after conversion to integer.
	movd 8(%ebx),%mm3
	movd -12(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 12(%ebx),%mm3
	punpckldq -16(%edx),%mm4
	movd 16(%ebx),%mm5
	movd -20(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 20(%ebx),%mm5
	punpckldq -24(%edx),%mm6
	pfadd %mm3,%mm0
	movd 24(%ebx),%mm1
	movd -28(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 28(%ebx),%mm1
	punpckldq -32(%edx),%mm2
	pfadd %mm5,%mm0
	movd 32(%ebx),%mm3
	movd -36(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 36(%ebx),%mm3
	punpckldq -40(%edx),%mm4
	pfadd %mm1,%mm0
	movd 40(%ebx),%mm5
	movd -44(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 44(%ebx),%mm5
	punpckldq -48(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd -52(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 52(%ebx),%mm1
	punpckldq -56(%edx),%mm2
	pfadd %mm0,%mm5
	movd 56(%ebx),%mm3
	movd -60(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 60(%ebx),%mm3
	punpckldq (%edx),%mm4		# last float comes from offset 0, not -64
	pfadd %mm1,%mm5
	addl $-128,%edx
	addl $-64,%ebx
	movd (%ebx),%mm0		# preload for the next iteration
	movd -4(%edx),%mm1
	pfmul %mm4,%mm3
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	pfadd %mm5,%mm3
	pfacc %mm3,%mm3			# add the two lanes
	incl %ebp
	pf2id %mm3,%mm3
	movd %mm3,%eax
	negl %eax			# negate as an integer, before the 16-bit saturation
	movd %eax,%mm3
	packssdw %mm3,%mm3
	movd %mm3,%eax
	movw %ax,(%edi)
	addl $4,%edi
	decl %ecx
	jnz .L46

	femms				# leave MMX state clean for x87 code in the caller
	movl 56(%esp),%esi		# esi = arg4
	movl %ebp,%eax			# return value (see NOTE(review) in the header)
	subl $-128,0(%esi)		# *arg4 += 128
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	addl $24,%esp
	ret
	# Record the function's extent in the ELF symbol table, matching the
	# .type directive above.
	.size	synth_1to1_3dnow,.-synth_1to1_3dnow

	# This object does not need an executable stack.  Without this empty
	# section GNU ld assumes it does and marks the whole linked image so.
	.section	.note.GNU-stack,"",@progbits