summary | refs | log | tree | commit | diff | stats
path: root/xorg
diff options
context:
space:
mode:
author: Jay Sorg <jay.sorg@gmail.com> 2014-09-27 13:34:08 -0700
committer: Jay Sorg <jay.sorg@gmail.com> 2014-09-27 13:34:08 -0700
commit: 8b400ca0f3c8cf083dddb4794c682dda45c755ad (patch)
tree: 37b8c5496d6d060c87b056b6d9e7cda5b1cfae86 /xorg
parent: 656e6eae1ffc1d92ca717ca0b9b9928f6a263183 (diff)
download: xrdp-proprietary-8b400ca0f3c8cf083dddb4794c682dda45c755ad.tar.gz
xrdp-proprietary-8b400ca0f3c8cf083dddb4794c682dda45c755ad.zip
Xorg: asm changes
Diffstat (limited to 'xorg')
-rw-r--r-- xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm 209
1 files changed, 55 insertions, 154 deletions
diff --git a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
index 85c6170d..1e67ff68 100644
--- a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
+++ b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
@@ -1,6 +1,7 @@
SECTION .data
align 16
+c8 times 4 dd 8
c16 times 4 dd 16
c100 times 4 dd 100
c128 times 4 dd 128
@@ -20,45 +21,29 @@ SECTION .text
y1_do4:
; y
- mov eax, 0
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 0
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 1
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 2
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 3
+ movd xmm0, [esi] ; 4 at a time
+ add esi, 4
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
; u
- mov eax, 0
- mov al, [ebx]
- add ebx, 1
- pinsrd xmm1, eax, 0
- pinsrd xmm1, eax, 1
- mov al, [ebx]
- add ebx, 1
- pinsrd xmm1, eax, 2
- pinsrd xmm1, eax, 3
+ movd xmm1, [ebx] ; read 4 but only using 2
+ add ebx, 2
+ punpcklbw xmm1, xmm1
+ punpcklbw xmm1, xmm6
+ punpcklwd xmm1, xmm6
movdqa xmm7, [c128]
psubd xmm1, xmm7
; v
- mov eax, 0
- mov al, [edx]
- add edx, 1
- pinsrd xmm2, eax, 0
- pinsrd xmm2, eax, 1
- mov al, [edx]
- add edx, 1
- pinsrd xmm2, eax, 2
- pinsrd xmm2, eax, 3
+ movd xmm2, [edx] ; read 4 but only using 2
+ add edx, 2
+ punpcklbw xmm2, xmm2
+ punpcklbw xmm2, xmm6
+ punpcklwd xmm2, xmm6
psubd xmm2, xmm7
; t = (298 * c + 409 * e + 128) >> 8;
@@ -69,11 +54,6 @@ y1_do4:
paddd xmm3, xmm4
paddd xmm3, xmm7
psrad xmm3, 8
- ; b = RDPCLAMP(t, 0, 255);
- pxor xmm4, xmm4
- pmaxsd xmm3, xmm4
- movdqa xmm4, [c255]
- pminsd xmm3, xmm4
; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
movdqa xmm4, [c298]
@@ -86,11 +66,6 @@ y1_do4:
psubd xmm4, xmm6
paddd xmm4, xmm7
psrad xmm4, 8
- ; g = RDPCLAMP(t, 0, 255);
- pxor xmm5, xmm5
- pmaxsd xmm4, xmm5
- movdqa xmm5, [c255]
- pminsd xmm4, xmm5
; t = (298 * c + 516 * d + 128) >> 8;
movdqa xmm5, [c298]
@@ -100,69 +75,31 @@ y1_do4:
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
- ; r = RDPCLAMP(t, 0, 255);
- pxor xmm6, xmm6
- pmaxsd xmm5, xmm6
- movdqa xmm6, [c255]
- pminsd xmm5, xmm6
-
- pextrd eax, xmm3, 0
- mov [edi], al
- pextrd eax, xmm4, 0
- mov [edi + 1], al
- pextrd eax, xmm5, 0
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 1
- mov [edi], al
- pextrd eax, xmm4, 1
- mov [edi + 1], al
- pextrd eax, xmm5, 1
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 2
- mov [edi], al
- pextrd eax, xmm4, 2
- mov [edi + 1], al
- pextrd eax, xmm5, 2
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 3
- mov [edi], al
- pextrd eax, xmm4, 3
- mov [edi + 1], al
- pextrd eax, xmm5, 3
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
+
+ packusdw xmm3, xmm3 ; b
+ packuswb xmm3, xmm3
+ packusdw xmm4, xmm4 ; g
+ packuswb xmm4, xmm4
+ punpcklbw xmm3, xmm4 ; gb
+
+ pxor xmm4, xmm4 ; a
+ packusdw xmm5, xmm5 ; r
+ packuswb xmm5, xmm5
+ punpcklbw xmm5, xmm4 ; ar
+
+ punpcklwd xmm3, xmm5 ; argb
+ movdqu [edi], xmm3
+ add edi, 16
ret;
y2_do4:
; y
- mov eax, 0
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 0
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 1
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 2
- mov al, [esi]
- add esi, 1
- pinsrd xmm0, eax, 3
+ movd xmm0, [esi] ; 4 at a time
+ add esi, 4
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
@@ -176,11 +113,6 @@ y2_do4:
paddd xmm3, xmm4
paddd xmm3, xmm7
psrad xmm3, 8
- ; b = RDPCLAMP(t, 0, 255);
- pxor xmm4, xmm4
- pmaxsd xmm3, xmm4
- movdqa xmm4, [c255]
- pminsd xmm3, xmm4
; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
movdqa xmm4, [c298]
@@ -193,11 +125,6 @@ y2_do4:
psubd xmm4, xmm6
paddd xmm4, xmm7
psrad xmm4, 8
- ; g = RDPCLAMP(t, 0, 255);
- pxor xmm5, xmm5
- pmaxsd xmm4, xmm5
- movdqa xmm5, [c255]
- pminsd xmm4, xmm5
; t = (298 * c + 516 * d + 128) >> 8;
movdqa xmm5, [c298]
@@ -207,51 +134,21 @@ y2_do4:
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
- ; r = RDPCLAMP(t, 0, 255);
- pxor xmm6, xmm6
- pmaxsd xmm5, xmm6
- movdqa xmm6, [c255]
- pminsd xmm5, xmm6
-
- pextrd eax, xmm3, 0
- mov [edi], al
- pextrd eax, xmm4, 0
- mov [edi + 1], al
- pextrd eax, xmm5, 0
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 1
- mov [edi], al
- pextrd eax, xmm4, 1
- mov [edi + 1], al
- pextrd eax, xmm5, 1
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 2
- mov [edi], al
- pextrd eax, xmm4, 2
- mov [edi + 1], al
- pextrd eax, xmm5, 2
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
-
- pextrd eax, xmm3, 3
- mov [edi], al
- pextrd eax, xmm4, 3
- mov [edi + 1], al
- pextrd eax, xmm5, 3
- mov [edi + 2], al
- mov eax, 0
- mov [edi + 3], al
- add edi, 4
+
+ packusdw xmm3, xmm3 ; b
+ packuswb xmm3, xmm3
+ packusdw xmm4, xmm4 ; g
+ packuswb xmm4, xmm4
+ punpcklbw xmm3, xmm4 ; gb
+
+ pxor xmm4, xmm4 ; a
+ packusdw xmm5, xmm5 ; r
+ packuswb xmm5, xmm5
+ punpcklbw xmm5, xmm4 ; ar
+
+ punpcklwd xmm3, xmm5 ; argb
+ movdqu [edi], xmm3
+ add edi, 16
ret;
@@ -309,6 +206,10 @@ loop_y:
; save edx
mov [esp + 24], edx
+ prefetchnta 4096[esp + 0] ; y
+ prefetchnta 4096[esp + 8] ; u
+ prefetchnta 4096[esp + 12] ; v
+
loop_x:
mov esi, [esp + 0] ; y1