diff --git a/src/modules/gtk2/scale_line_22_yuv_mmx.S b/src/modules/gtk2/scale_line_22_yuv_mmx.S
index cc389ad..d78b341 100644
--- a/src/modules/gtk2/scale_line_22_yuv_mmx.S
+++ b/src/modules/gtk2/scale_line_22_yuv_mmx.S
@@ -1,3 +1,22 @@
+/*
+ * scale_line_22_yuv_mmx.S -- scale line in YUY2 format
+ * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
+ * Author: Dan Dennedy
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
 	.file	"scale_line_22_yuv_mmx.S"
 	.version	"01.01"
 
@@ -25,16 +44,16 @@ _pixops_scale_line_22_yuv_mmx:
 /*
  * Arguments
  *
- *	weights:	8(%ebp)
+ *	weights: 8(%ebp)
  *	p (dest): 12(%ebp)	%esi
  *	q1 (src0): 16(%ebp)
  *	q2 (src1): 20(%ebp)
  *	xstep: 24(%ebp)
  *	p_end: 28(%ebp)
  *	xinit: 32(%ebp)
- *	destx: 36(%ebp)
+ *	dest_x: 36(%ebp)
  *
-*/
+ */
 
 /*
  * Function call entry
  */
@@ -63,78 +82,45 @@ _pixops_scale_line_22_yuv_mmx:
 	cmpl 28(%ebp),%esi		# dest == dest_end ?
 	jnb .out
 
-	addl $65536, %ebx
-	.p2align 4,,7
-	pxor %mm4, %mm4
-
 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  * points we are interpolating between, as:
  *
- *	00UV00Y200UV00Y1
+ *	00VV00Y200UU00Y1
  */
-.loop:
-
+	pxor %mm4, %mm4
 /*
- * Load current values from pixel 1
+ * Load next component values into mm1 (src0) and mm3 (src1)
  */
-	movl %ebx, %edx			# x_scaled = x ...
-	sarl $16, %edx			# >> 16
-	sall $1, %edx			# x_scaled *= channels
+	movl %ebx, %eax			# x_scaled
+	sarl $15, %eax
+	andl $0xfffffffe, %eax
+	movl %eax, %edx			# x_aligned
+	andl $0xfffffffc, %edx
 
 	movl 16(%ebp), %edi		# get src0
-	movzbl 2(%edi,%edx), %ecx	# next y = src0[ x_scaled + 2 ]
-	/* wish we had a register for this */
-	movl %ecx, -24(%ebp)		# save next y
-	movzbl (%edi,%edx), %ecx	# y = src0[ x_scaled ]
-
-	sarl $2, %edx			# x_aligned = ( x_scaled / channels ) >> 1 ...
-	sall $2, %edx			# << 2
-
-	movl 36(%ebp), %eax		# uv_index = dest_x ...
-	andl $1, %eax			# ( dest_x & 1 ) ...
- sall $1, %eax # << 1 - addl %eax, %edx # x_aligned += uv_index - - movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ] - shll $8, %eax # position uv - orl %eax, %ecx # store uv - - movd %ecx, %mm0 # move to mmx0 - punpcklbw %mm4, %mm0 - - movl -24(%ebp), %ecx # restore next y - orl %eax, %ecx # store uv - + movl (%edi,%eax), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv movd %ecx, %mm1 # move to mmx1 punpcklbw %mm4, %mm1 movl 20(%ebp), %edi # get src1 - - /* do u/v first since we already have x_aligned */ - movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ] - shll $8, %eax # position uv - - /* which is faster? 2 moves in and out of memory, or - 1 move between registers and 2 shifts? I wager the latter. */ - movl %ebx, %edx # x_scaled = x ... - sarl $16, %edx # >> 16 - sall $1, %edx # x_scaled *= channels - - movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ] - movl %eax, -24(%ebp) # save next y - movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ] - orl %eax, %ecx # store uv - - movd %ecx, %mm2 # move to mmx2 - punpcklbw %mm4, %mm2 - - movl -24(%ebp), %ecx # restore next y - orl %eax, %ecx # store uv - + movl (%edi,%edx), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv movd %ecx, %mm3 # move to mmx3 punpcklbw %mm4, %mm3 + jmp .newx + + .p2align 4,,7 +.loop: + /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y * 16 4 0xf 2 2 */ @@ -173,18 +159,60 @@ _pixops_scale_line_22_yuv_mmx: packuswb %mm7, %mm7 movd %mm7, %eax - movb %al, 0(%esi) # dest[ 0 ] = y - shrl $8, %eax - movb %al, 1(%esi) # dest[ 1 ] = uv + movb %al, (%esi) # *dest = y + + movl 36(%ebp), %ecx # get dest_x + andl $1, %ecx # select u or v + sall $1, %ecx # determine offset + addl $1, %ecx # relative to x_aligned + sall $3, %ecx # offset * 8 bits/byte - addl $2, %esi # dest += 2 + movd %mm7, %eax + shrl %cl, %eax + movb %al, 1(%esi) # *dest = uv + addl $2, %esi # dest += 2 cmpl %esi,28(%ebp) # if dest == dest_end je .out # then exit - addl 24(%ebp), %ebx # x += x_step addl $1, 36(%ebp) # dest_x++ +.newx: + + addl 24(%ebp), %ebx # x += x_step +/* + * Load current component values into mm0 (src0) and mm2 (src1) + */ + movq %mm1, %mm0 + movq %mm3, %mm2 + +/* + * Load next component values into mm1 (src0) and mm3 (src1) + */ + movl %ebx, %eax # x_scaled + sarl $15, %eax + andl $0xfffffffe, %eax + movl %eax, %edx # x_aligned + andl $0xfffffffc, %edx + + movl 16(%ebp), %edi # get src0 + movl (%edi,%eax), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv + movd %ecx, %mm1 # move to mmx1 + punpcklbw %mm4, %mm1 + + movl 20(%ebp), %edi # get src1 + movl (%edi,%edx), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv + movd %ecx, %mm3 # move to mmx3 + punpcklbw %mm4, %mm3 + jmp .loop .out:
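
For reference, here is a scalar C sketch of what the rewritten loop computes. It mirrors the structure of the new code: one dword load at the luma offset (x_scaled = (x >> 15) & ~1) masked with 0x00ff00ff, one dword load at the containing YUYV group (x_aligned = x_scaled & ~3) masked with 0xff00ff00, OR'd into a packed [Y1 U Y2 V] point, plus the .newx pipelining in which last iteration's "next" registers become this iteration's "current" ones. This is illustrative, not the module's actual C fallback: the function names, the plain >>16 normalization, and the weight ordering (src0 current/next, then src1 current/next) are assumptions, since the multiply-accumulate instructions fall outside the hunks shown; the offsets, masks, and the dest_x-parity U/V selection come straight from the listing.

#include <string.h>

#define SCALE_SHIFT    16
#define SUBSAMPLE_BITS 4
#define SUBSAMPLE_MASK 0xf

/* Fetch one packed point the way the new code does: dword at the Y
 * offset masked with 0x00ff00ff, dword at the aligned YUYV group masked
 * with 0xff00ff00, OR'd to [Y1 U Y2 V] (little-endian, as on x86). */
static void
load_point (const unsigned char *src, int x, unsigned char point[4])
{
	int x_scaled  = (x >> 15) & ~1;    /* sarl $15; andl $0xfffffffe */
	int x_aligned = x_scaled & ~3;     /* andl $0xfffffffc */
	unsigned int y, uv;

	memcpy (&y,  src + x_scaled,  4);
	memcpy (&uv, src + x_aligned, 4);
	y = (y & 0x00ff00ff) | (uv & 0xff00ff00);
	memcpy (point, &y, 4);
}

static unsigned char *
scale_line_22_yuv_sketch (const short *weights, unsigned char *dest,
                          const unsigned char *src0, const unsigned char *src1,
                          int x_step, const unsigned char *dest_end,
                          int x_init, int dest_x)
{
	unsigned char cur0[4], cur1[4], next0[4], next1[4], out[4];
	int x = x_init;
	int i;

	load_point (src0, x, next0);
	load_point (src1, x, next1);

	while (dest < dest_end) {
		const short *w;

		/* the movq %mm1,%mm0 / %mm3,%mm2 pipelining: each point
		 * is loaded once and reused as "current" next iteration */
		memcpy (cur0, next0, 4);
		memcpy (cur1, next1, 4);

		x += x_step;
		load_point (src0, x, next0);
		load_point (src1, x, next1);

		/* 4 weights per subpixel position, per the .loop comment:
		 * weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS))
		 *            & SUBSAMPLE_MASK) * n_x * n_y, n_x = n_y = 2 */
		w = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS))
		               & SUBSAMPLE_MASK) * 4;

		/* blend all four lanes as the packed multiplies do; the
		 * saturating packuswb and any rounding are elided here */
		for (i = 0; i < 4; i++)
			out[i] = (w[0] * cur0[i] + w[1] * next0[i] +
			          w[2] * cur1[i] + w[3] * next1[i]) >> 16;

		*dest++ = out[0];                        /* Y */
		*dest++ = out[1 + ((dest_x & 1) << 1)];  /* U for even dest_x, V for odd */
		dest_x++;
	}
	return dest;
}

Two details are worth noting. Picking U or V only at store time (out[1] vs out[3], the shrl %cl sequence after packuswb in the listing) is what lets both source rows share the same two masked dword loads. And the listing fetches src1's luma dword at x_aligned, (%edi,%edx), rather than at x_scaled, presumably because %eax no longer holds x_scaled at that point; the sketch uses x_scaled for both rows, which is what the replaced movzbl code computed.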