From 7377df6338b43516dc92d9fecec3469f774c2222 Mon Sep 17 00:00:00 2001
From: ddennedy <ddennedy@d19143bc-622f-0410-bfdd-b5b2a6649095>
Date: Thu, 26 Feb 2004 14:11:49 +0000
Subject: [PATCH] updated mmx yuv scaling

git-svn-id: https://mlt.svn.sourceforge.net/svnroot/mlt/trunk/mlt@171 d19143bc-622f-0410-bfdd-b5b2a6649095
---
 src/modules/gtk2/Makefile               |    2 +-
 src/modules/gtk2/pixops.c               |   31 +++--
 src/modules/gtk2/scale_line_22_33_mmx.S |  225 +++++++++++++++++++++----------
 3 files changed, 174 insertions(+), 84 deletions(-)

diff --git a/src/modules/gtk2/Makefile b/src/modules/gtk2/Makefile
index cd001dd..1ca7e8a 100644
--- a/src/modules/gtk2/Makefile
+++ b/src/modules/gtk2/Makefile
@@ -24,7 +24,7 @@ $(TARGET): $(OBJS) $(ASM_OBJS)
 have_mmx.o:
 	$(CC) -o $@ -c have_mmx.S
 
-scale_line_22_33_mmx.o:
+scale_line_22_33_mmx.o: scale_line_22_33_mmx.S
 	$(CC) -o $@ -c scale_line_22_33_mmx.S
 
 depend:	$(SRCS)
diff --git a/src/modules/gtk2/pixops.c b/src/modules/gtk2/pixops.c
index 5956b6e..9ac77ba 100644
--- a/src/modules/gtk2/pixops.c
+++ b/src/modules/gtk2/pixops.c
@@ -59,7 +59,7 @@ typedef void ( *PixopsPixelFunc ) ( guchar *dest, guint y1, guint cr, guint y2,
 
 /* mmx function declarations */
 #ifdef USE_MMX
-guchar *pixops_scale_line_22_33_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init );
+guchar *pixops_scale_line_22_yuv_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init, int destx );
 int pixops_have_mmx ( void );
 #endif
 
@@ -158,7 +158,7 @@ scale_line ( int *weights, int n_x, int n_y,
 
 #ifdef USE_MMX
 static inline guchar *
-scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y,
+scale_line_22_yuv_mmx_stub ( int *weights, int n_x, int n_y,
                             guchar *dest, int dest_x, guchar *dest_end,
                             guchar **src,
                             int x_init, int x_step, int src_width )
@@ -178,12 +178,12 @@ scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y,
 		mmx_weights[ j ][ 7 ] = 0x00010001 * ( weights[ 4 * j + 3 ] >> 8 );
 	}
 
-	return pixops_scale_line_22_33_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init );
+	return pixops_scale_line_22_yuv_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init, dest_x );
 }
 #endif /* USE_MMX */
 
 static inline guchar *
-scale_line_22_33 ( int *weights, int n_x, int n_y,
+scale_line_22_yuv ( int *weights, int n_x, int n_y,
                    guchar *dest, int dest_x, guchar *dest_end,
                    guchar **src,
                    int x_init, int x_step, int src_width )
@@ -219,16 +219,22 @@ scale_line_22_33 ( int *weights, int n_x, int n_y,
 		/* process U/V */
 		x_aligned = ( ( x_scaled >> 1 ) << 2 );
 		q0 = src0 + x_aligned;
+		uv_index = ( ( dest_x & 1 ) << 1 );
+		//printf( "scale_line_22_yuv: %d %d\n", x_aligned + uv_index, dest_x );
+		p  = w1 * q0[ uv_index + 1 ];
+		p += w2 * q0[ uv_index + 1 ];
+
+		x += x_step;
+		x_scaled = x >> SCALE_SHIFT;
+		dest_x++;
+		
+		x_aligned = ( ( x_scaled >> 1 ) << 2 );
 		q1 = src1 + x_aligned;
 		uv_index = ( ( dest_x & 1 ) << 1 ) + 1;
-		p  = w1 * q0[ uv_index ];
-		p += w2 * q0[ uv_index ];
 		p += w3 * q1[ uv_index ];
 		p += w4 * q1[ uv_index ];
 		*dest++ = ( p + 0x8000 ) >> SCALE_SHIFT;
 
-		x += x_step;
-		dest_x++;
 	}
 
 	return dest;
@@ -739,12 +745,15 @@ yuv422_scale ( guchar *dest_buf,
 	if ( filter.x.n == 2 && filter.y.n == 2 )
 	{
 #ifdef USE_MMX
-		if ( 0 && found_mmx )
-			line_func = scale_line_22_33_mmx_stub;
+		if ( found_mmx )
+		{
+			//fprintf( stderr, "rescale: using mmx\n" );
+			line_func = scale_line_22_yuv_mmx_stub;
+		}
 		else
 #endif
 
-			line_func = scale_line_22_33;
+			line_func = scale_line_22_yuv;
 	}
 	else
 		line_func = scale_line;
diff --git a/src/modules/gtk2/scale_line_22_33_mmx.S b/src/modules/gtk2/scale_line_22_33_mmx.S
index f389217..d4e73b4 100644
--- a/src/modules/gtk2/scale_line_22_33_mmx.S
+++ b/src/modules/gtk2/scale_line_22_33_mmx.S
@@ -1,32 +1,39 @@
-	.file	"scale_line_22_33_mmx.S"
+	.file	"scale_line_22_yuv_mmx.S"
 	.version	"01.01"
+
+.extern printf
+
 gcc2_compiled.:
+.data
+MSG: .asciz "scale_line_22_yuv_mmx: %d %d\n" # NUL-terminated format string for the debug printf
+
 .text
 	.align 16
 
 #if !defined(__MINGW32__) && !defined(__CYGWIN__)	
 	
-.globl pixops_scale_line_22_33_mmx
-	.type	 pixops_scale_line_22_33_mmx,@function
-pixops_scale_line_22_33_mmx:
+.globl pixops_scale_line_22_yuv_mmx
+	.type	 pixops_scale_line_22_yuv_mmx,@function
+pixops_scale_line_22_yuv_mmx:
 	
 #else
 	
-.globl _pixops_scale_line_22_33_mmx
-_pixops_scale_line_22_33_mmx:
+.globl _pixops_scale_line_22_yuv_mmx
+_pixops_scale_line_22_yuv_mmx:
 	
 #endif
 /*
  * Arguments
  *		
  * weights:	 8(%ebp)
- * p:	        12(%ebp)	%esi
- * q1:	        16(%ebp)	
- * q2:	        20(%ebp)	
+ * p (dest):    12(%ebp)	%esi
+ * q1 (src0):   16(%ebp)	
+ * q2 (src1):   20(%ebp)	
  * xstep:       24(%ebp)	
  * p_end:       28(%ebp)
  * xinit:       32(%ebp)
- *	
+ * destx:       36(%ebp)
+ *
 */
 
 /*
@@ -38,52 +45,95 @@ _pixops_scale_line_22_33_mmx:
 	pushl %edi
 	pushl %esi
 	pushl %ebx
-/* Locals:	
+/* Locals:
  * int x                      %ebx
  * int x_scaled             -24(%ebp)
+ * int dest_x               36(%ebp)
  */
 
 /*
  * Setup
  */
-/* Initialize variables */	
-	movl 32(%ebp),%ebx
-	movl 32(%ebp),%edx
-	sarl $16,%edx
-	movl 12(%ebp),%esi
+/* Initialize variables */
+	movl 36(%ebp),%eax # destx
+	movl %eax,36(%ebp) # written back unchanged: dest_x is kept in its argument slot
+	movl 32(%ebp),%ebx # x
+	movl 12(%ebp),%esi # dest
 
-	cmpl 28(%ebp),%esi
+	cmpl 28(%ebp),%esi # dest == dest_end ?
 	jnb  .out
 
-/* For the body of this loop, %mm01, %mm1, %mm2, %mm3 hold the 4 adjoining
+/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  * points we are interpolating between, as:
  *
- *  000000BB00GG00RR
- */	
-	
+ *  0000000000UV00YY
+ */
+
 /* Load initial values into %mm1, %mm3 */
-	leal (%edx,%edx,2),%edx  # Multiply by 3
 
-	movl 16(%ebp),%edi
-	pxor %mm4, %mm4
-	movzbl 2(%edi,%edx),%ecx
-	shll $16,%ecx
-	movzwl (%edi,%edx),%eax
-	orl %eax,%ecx
+	/* x_scaled = ( x >> 16 ) * 2 */
+	movl %ebx, %edx
+	sarl $16,%edx
+	sall $1, %edx
+
+	/* load from src0 */
+	movl 16(%ebp), %edi
+	movzbl (%edi,%edx), %ecx
+
+	/* x_aligned = x_scaled divided by 2 and multiplied by 4 */
+	movl %ebx, %edx
+	sarl $17, %edx
+	sall $2, %edx
+
+	/* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
+	movl 36(%ebp), %eax
+	andl $1, %eax
+	sall $1, %eax
+	addl %eax, %edx
+	movzbl 1(%edi,%edx), %eax
+	shll $8, %eax
+	orl %eax, %ecx
+
 	movd %ecx, %mm1
+	pxor %mm4, %mm4
 	punpcklbw %mm4, %mm1
 
-	movl 20(%ebp),%edi
-	movzbl 2(%edi,%edx),%ecx
-	shll $16,%ecx
-	movzwl (%edi,%edx),%eax
-	orl %eax,%ecx
+	/* x_scaled = ( x >> 16 ) * 2 */
+	movl %ebx, %edx
+	sarl $16, %edx
+	sall $1, %edx
+
+	/* load Y from src1 */
+	movl 20(%ebp), %edi
+	movzbl (%edi,%edx), %ecx
+
+	/* x_aligned = x_scaled divided by 2 and multiplied by 4 */
+	movl %ebx, %edx
+	sarl $17, %edx
+	sall $2, %edx
+
+	/* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; the + 1 is the load displacement */
+	movl 36(%ebp), %eax
+	andl $1, %eax
+	sall $1, %eax
+	addl %eax, %edx
+	movzbl 1(%edi,%edx), %eax
+	shll $8, %eax
+	orl %eax, %ecx
+
+	movd %ecx, %mm3
+	punpcklbw %mm4, %mm3
 
-	addl $65536,%ebx
-	movl %ebx,%edx
-	sarl $16,%edx
+	/* dest_x++; */
+	movl 36(%ebp), %eax
+	addl $1, %eax
+	movl %eax, 36(%ebp)
+
+	/* x_scaled = x >> 16 */
+	addl $65536, %ebx
+	movl %ebx, %edx
+	sarl $16, %edx
+	movl %edx, -24(%ebp)
 
 	jmp .newx
 	.p2align 4,,7
@@ -110,64 +160,95 @@ _pixops_scale_line_22_33_mmx:
 	paddw %mm6, %mm7
 	paddw %mm5, %mm7
 
-/* %mm7	holds the accumulated sum. Compute (C + 0x80) / 256
+/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
  */
 	pxor %mm4, %mm4
-	movl $8421504, %eax  # 0x00808080
-	movd %eax, %mm6  
+	movl $0x80808080, %eax
+	movd %eax, %mm6
 	punpcklbw %mm4, %mm6
 	paddw %mm6, %mm7
 	psrlw $8, %mm7
 
 /* Pack into %eax and store result
- */	
+ */
 	packuswb %mm7, %mm7
 	movd %mm7, %eax
-	
-	movb %al, (%esi)
+
+	movb %al, 0(%esi)        # *dest = y
 	shrl $8, %eax
-	movw %ax, 1(%esi)
-	addl $3, %esi
-		
-	cmpl %esi,28(%ebp)
-	je   .out
-
-/* x += x_step; */
-	addl 24(%ebp),%ebx
-/* x_scaled = x >> 16; */
-	movl %ebx,%edx
-	sarl $16,%edx
+	movb %al, 1(%esi)        # *dest = uv
+
+	addl $2, %esi            # dest += 2
+
+	cmpl %esi,28(%ebp)       # if dest == dest_end ?
+	je   .out                # then exit
+
+	movl 36(%ebp), %eax      # get dest_x
+	addl $1, %eax            # dest_x++
+	movl %eax, 36(%ebp)      # put dest_x
 
-	cmpl %edx,-24(%ebp)
-	je   .loop
+	addl 24(%ebp), %ebx      # x += x_step
+
+	movl %ebx, %edx          # x_scaled = x ...
+	sarl $16, %edx           # >> 16
+	movl %edx, -24(%ebp)     # save x_scaled
 
 .newx:
-	movl %edx,-24(%ebp)
+
 /*
  * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
  */
 	movq %mm1, %mm0
 	movq %mm3, %mm2
-	
-	leal (%edx,%edx,2),%edx  # Multiply by 3
 
-	movl 16(%ebp),%edi
-	movzbl 2(%edi,%edx),%ecx
-	shll $16,%ecx
-	movzwl (%edi,%edx),%eax
-	orl %eax,%ecx
-	movd %ecx, %mm1
+	sall $1, %edx            # x_scaled *= channels
+
+	movl 16(%ebp), %edi      # get src0
+	movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
+
+	sarl $2, %edx            # x_aligned = ( x_scaled / channels ) >> 1 ...
+	sall $2, %edx            # << 2
+
+	movl 36(%ebp), %eax      # uv_index = dest_x ...
+		#pushl %eax
+	andl $1, %eax            # ( dest_x & 1 ) ...
+	sall $1, %eax            # << 1
+	addl %eax, %edx          # x_aligned += uv_index
+		#pushl %edx
+		#pushl $MSG
+		#call printf
+		#popl %edx
+		#popl %edx
+		#popl %edx
+	movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
+	shll $8, %eax            # store uv
+	orl %eax, %ecx
+
+	movd %ecx, %mm1          # move to mmx1
 	punpcklbw %mm4, %mm1
 
-	movl 20(%ebp),%edi
-	movzbl 2(%edi,%edx),%ecx
-	shll $16,%ecx
-	movzwl (%edi,%edx),%eax
-	orl %eax,%ecx
-	movd %ecx, %mm3
+	movl %ebx, %edx          # x_scaled = x ...
+	sarl $16, %edx           # >> 16
+	sall $1, %edx            # x_scaled *= channels
+
+	movl 20(%ebp), %edi      # get src1
+	movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
+
+	sarl $2, %edx            # x_aligned = ( x_scaled / channels ) >> 1 ...
+	sall $2, %edx            # << 2
+
+	movl 36(%ebp), %eax      # uv_index = dest_x ...
+	andl $1, %eax            # ( dest_x & 1 ) ...
+	sall $1, %eax            # << 1
+	addl %eax, %edx          # x_aligned += uv_index
+	movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
+	shll $8, %eax            # store uv
+	orl %eax, %ecx
+
+	movd %ecx, %mm3          # move to mmx3
 	punpcklbw %mm4, %mm3
-	
-	movl 8(%ebp),%edi
+
+	movl 8(%ebp), %edi       # get weights pointer
 	
 	jmp .loop
 
-- 
1.7.4.4