From 3f7c53230945e427b019ba5df4fec587e19e29c1 Mon Sep 17 00:00:00 2001
From: ddennedy <ddennedy@d19143bc-622f-0410-bfdd-b5b2a6649095>
Date: Thu, 11 Mar 2004 07:22:22 +0000
Subject: [PATCH] added very preliminary mmx for composite. bugfixes to -x and
 too small rescaling.

git-svn-id: https://mlt.svn.sourceforge.net/svnroot/mlt/trunk/mlt@200 d19143bc-622f-0410-bfdd-b5b2a6649095
---
 src/modules/core/Makefile                 |   14 ++-
 src/modules/core/composite_line_yuv_mmx.S |  203 +++++++++++++++++++++++++++++
 src/modules/core/filter_resize.c          |    2 +-
 src/modules/core/transition_composite.c   |  101 +++++++++------
 src/modules/gtk2/filter_rescale.c         |    2 +
 5 files changed, 275 insertions(+), 47 deletions(-)
 create mode 100644 src/modules/core/composite_line_yuv_mmx.S

diff --git a/src/modules/core/Makefile b/src/modules/core/Makefile
index 4dd185c..fd7d0df 100644
--- a/src/modules/core/Makefile
+++ b/src/modules/core/Makefile
@@ -18,14 +18,19 @@ OBJS = factory.o \
 	   transition_mix.o \
 	   transition_region.o 
 
-CFLAGS = -O3 -I../../ -Wall -g -D_FILE_OFFSET_BITS=64 -pthread
+ASM_OBJS = composite_line_yuv_mmx.o
+
+CFLAGS = -O3 -DUSE_MMX -I../../ -Wall -g -D_FILE_OFFSET_BITS=64 -pthread
 
 SRCS := $(OBJS:.o=.c)
 
 all: 	$(TARGET)
 
-$(TARGET): $(OBJS)
-		$(CC) -shared -o $@ $(OBJS) $(LDFLAGS)
+$(TARGET): $(OBJS) $(ASM_OBJS)
+		$(CC) -shared -o $@ $(OBJS) $(ASM_OBJS) $(LDFLAGS)
+
+composite_line_yuv_mmx.o: composite_line_yuv_mmx.S
+	$(CC) -o $@ -c $<
 
 depend:	$(SRCS)
 		$(CC) -MM $(CFLAGS) $^ 1>.depend
@@ -34,9 +39,8 @@ dist-clean:	clean
 		rm -f .depend
 
 clean:	
-		rm -f $(OBJS) $(TARGET) 
+		rm -f $(OBJS) $(ASM_OBJS) $(TARGET) 
 
 ifneq ($(wildcard .depend),)
 include .depend
 endif
-
diff --git a/src/modules/core/composite_line_yuv_mmx.S b/src/modules/core/composite_line_yuv_mmx.S
new file mode 100644
index 0000000..7a5fb80
--- /dev/null
+++ b/src/modules/core/composite_line_yuv_mmx.S
@@ -0,0 +1,203 @@
+	.file "composite_line_yuv_mmx"
+	.version "01.01"
+	
+gcc2_compiled.:
+.data
+
+.text
+	.align 16
+
+#if !defined(__MINGW32__) && !defined(__CYGWIN__)
+.globl composite_line_yuv_mmx
+	.type	 composite_line_yuv_mmx,@function
+composite_line_yuv_mmx:
+#else
+.globl _composite_line_yuv_mmx
+_composite_line_yuv_mmx:
+#endif
+
+/*
+ * Blend one line of 2-byte (Y, U-or-V) pixels from src over dest, in place.
+ * i386 stack arguments; %esi = dest cursor, %edx = j, %mm4 = constant zero.
+ * dest:        8(%ebp)     read and written
+ * src:         12(%ebp)    this stack slot is advanced as the loop runs
+ * width_src:   16(%ebp)    pixel count; nothing is touched when <= 0
+ * alpha:       20(%ebp)    one byte per pixel, NULL means a = 255
+ * weight:      24(%ebp)    mix factor, shifted right by 8 before use
+ * luma:        28(%ebp)    one 16-bit entry per pixel, NULL to disable
+ * softness:    32(%ebp)    added to the luma entry for the upper edge
+ */
+
+/*
+ * Function call entry
+ */
+	pushl %ebp
+	movl %esp,%ebp
+	subl $28,%esp
+	pushl %edi
+	pushl %esi
+	pushl %ebx
+
+/* Initialise */
+	movl 8(%ebp), %esi        # get dest
+	xorl %edx, %edx           # j = 0
+	pxor %mm4, %mm4           # zero operand for punpcklbw below
+
+.loop:
+	cmpl 16(%ebp), %edx       # while ( j < width_src )
+	jge .out
+
+	movl $0xff, %ecx          # a = 255
+	cmpl $0, 20(%ebp)         # if alpha == NULL
+	je .noalpha
+	movl 20(%ebp), %edi       # a = alpha[ j ]
+	movb (%edi,%edx), %cl
+.noalpha:
+
+	movl 24(%ebp), %eax       # mix = weight
+	cmpl $0, 28(%ebp)         # if luma == NULL
+	je .noluma
+	movl 28(%ebp), %edi       # mix = ...
+	movw (%edi,%edx,2), %ax   # luma[ j ], entries are 2 bytes wide
+	/* NOTE(review): edges are tested against a, not weight - confirm */
+	cmpw %cx, %ax
+	jl .luma0
+	movl %eax, %ebx
+	addl 32(%ebp), %ebx       # + softness
+	cmpw %bx, %cx
+	jge .luma1
+	/* TODO: linear interpolate between edges eax and ebx */
+	jmp .noluma
+.luma0:
+	movl $0, %eax
+	jmp .noluma
+.luma1:
+	movl $0xffff, %eax
+.noluma:
+	shrl $8, %eax
+
+	movl %edx, %ebx           # mull writes edx:eax, so park j in ebx
+	mull %ecx                 # mix = mix * a...
+	movl %ebx, %edx           # restore j
+	shrl $8, %eax             # >>8
+	andl $0xff, %eax
+
+/* put mix and (255 - mix) into mm0 */
+/* words, high to low: mix, 255-mix, mix, 255-mix */
+
+	/* duplicate word */
+	movl %eax, %ecx
+	shll $16, %ecx
+	orl %eax, %ecx
+
+	movd %ecx, %mm1
+
+	/* 255 - mix */
+	movl $0x000000ff, %ecx
+	subl %eax, %ecx
+	andl $0xff, %ecx
+
+	/* duplicate word */
+	movl %ecx, %eax
+	shll $16, %eax
+	orl %eax, %ecx
+
+	movd %ecx, %mm0
+
+	/* interleave the words of mm0 (255-mix) and mm1 (mix) */
+	punpcklwd %mm1, %mm0
+
+/* put src yuv and dest yuv into mm1 */
+/* 0 UVs 0 UVd 0 Ys 0 Yd */
+
+	movl 12(%ebp), %edi       # get src
+	movzbl (%edi), %ecx       # Ys, zero-extended so no stale bits remain
+	shll $8, %ecx
+	movzbl 1(%edi), %eax      # UVs
+	shll $24, %eax
+	orl %eax, %ecx
+
+	movzbl (%esi), %eax       # Yd
+	orl %eax, %ecx
+	movzbl 1(%esi), %eax      # UVd
+	shll $16, %eax
+	orl %eax, %ecx
+
+	movd %ecx, %mm1
+	punpcklbw %mm4, %mm1      # widen bytes to words (mm4 is zero)
+
+/* alpha composite */
+	pmaddwd %mm1, %mm0        # low: Yd*(255-mix) + Ys*mix, high: same for UV
+	psrld $8, %mm0
+
+/* store result */
+	movd %mm0, %eax
+	movb %al, (%esi)
+	psrlq $32, %mm0           # bring the chroma dword down, plain MMX only
+	movd %mm0, %eax
+	movb %al, 1(%esi)
+
+/* for..next */
+	addl $1, %edx             # j++
+
+	addl $2, %esi
+	addl $2, 12(%ebp)
+
+	jmp .loop
+
+.out:
+	emms
+	leal -40(%ebp),%esp       # 28 bytes of locals + 3 saved registers
+	popl %ebx
+	popl %esi
+	popl %edi
+	movl %ebp,%esp
+	popl %ebp
+	ret
+
+
+/********************************************/
+
+.align 8
+#if !defined(__MINGW32__) && !defined(__CYGWIN__)	
+.globl composite_have_mmx
+	.type	 composite_have_mmx,@function
+composite_have_mmx:
+#else
+.globl _composite_have_mmx
+_composite_have_mmx:
+#endif
+/* int composite_have_mmx( void ): 1 if CPUID leaf 1 sets edx bit 23, else 0 */
+	pushl	%ebx                  # reloaded just before ret
+
+/* Step 1: the CPUID instruction is usable only if flags bit 21 can be flipped */
+
+	pushfl
+	popl	%ebx                  # ebx = flags as found
+	movl	%ebx, %eax
+	xorl	$0x00200000, %eax     # flip bit 21
+	pushl	%eax
+	popfl
+	pushfl
+	popl	%eax                  # eax = flags as read back
+
+	xorl	%ebx, %eax            # zero when the flip did not stick
+	jz	.notfound
+
+/* Step 2: query leaf 1 and look at bit 23 of edx (mask 0x00800000) */
+/* (cpuid overwrites eax, ebx, ecx and edx) */
+
+	movl	$1, %eax
+	cpuid
+
+	btl	$23, %edx
+	jnc	.notfound
+
+	movl	$1, %eax
+	jmp	.out2
+
+.notfound:
+	xorl	%eax, %eax
+.out2:
+	popl	%ebx
+	ret
diff --git a/src/modules/core/filter_resize.c b/src/modules/core/filter_resize.c
index 7a1f5dd..435cf12 100644
--- a/src/modules/core/filter_resize.c
+++ b/src/modules/core/filter_resize.c
@@ -130,7 +130,7 @@ static int filter_get_image( mlt_frame this, uint8_t **image, mlt_image_format *
 		}
 	}
 
-	return 0;
+	return error;
 }
 
 /** Filter processing.
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index 48f2d8a..48d1c6e 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -27,6 +27,14 @@
 #include <string.h>
 #include <math.h>
 
+typedef void ( *composite_line_fn )( uint8_t *dest, uint8_t *src, int width_src, uint8_t *alpha, int weight, uint16_t *luma, int softness );
+
+/* mmx function declarations */
+#ifdef USE_MMX
+	void composite_line_yuv_mmx( uint8_t *dest, uint8_t *src, int width_src, uint8_t *alpha, int weight, uint16_t *luma, int softness );
+	int composite_have_mmx( void );
+#endif
+
 /** Geometry struct.
 */
 
@@ -155,9 +163,11 @@ static void geometry_calculate( struct geometry_s *output, struct geometry_s *in
 	output->mix = in->mix + ( out->mix - in->mix ) * position;
 	output->distort = in->distort;
 
-	output->x = ( int )floor( output->x ) & 0xfffffffe;
-	output->w = ( int )floor( output->w ) & 0xfffffffe;
-	output->sw &= 0xfffffffe;
+	// DRD> These break on negative values. I do not think they are needed
+	// since yuv_composite takes care of YUYV group alignment
+	//output->x = ( int )floor( output->x ) & 0xfffffffe;
+	//output->w = ( int )floor( output->w ) & 0xfffffffe;
+	//output->sw &= 0xfffffffe;
 }
 
 void transition_destroy_keys( void *arg )
@@ -481,22 +491,48 @@ static void luma_read_yuv422( uint8_t *image, uint16_t **map, int width, int hei
 		*p++ = ( image[ i ] - 16 ) * 299; // 299 = 65535 / 219
 }
 
+
+/** Blend one source line onto one destination line, two bytes per pixel.
+*/
+
+static inline
+void composite_line_yuv( uint8_t *dest, uint8_t *src, int width_src, uint8_t *alpha, int weight, uint16_t *luma, int softness )
+{
+	int j;
+
+	for ( j = 0; j < width_src; j ++, src += 2, dest += 2 )
+	{
+		// No alpha means fully opaque; no luma means the plain weight
+		int a = ( alpha != NULL ) ? *alpha ++ : 255;
+		int mix = ( luma != NULL ) ? linearstep( luma[ j ], luma[ j ] + softness, weight ) : weight;
+		mix = ( mix * ( a + 1 ) ) >> 8;
+
+		// Both bytes of the pixel are blended with the same mix
+		dest[ 0 ] = ( src[ 0 ] * mix + dest[ 0 ] * ( ( 1 << 16 ) - mix ) ) >> 16;
+		dest[ 1 ] = ( src[ 1 ] * mix + dest[ 1 ] * ( ( 1 << 16 ) - mix ) ) >> 16;
+	}
+}
+
 /** Composite function.
 */
 
-static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, int bpp, uint8_t *p_src, int width_src, int height_src, uint8_t *p_alpha, struct geometry_s geometry, int field, uint16_t *p_luma, int32_t softness )
+static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint8_t *p_src, int width_src, int height_src, uint8_t *p_alpha, struct geometry_s geometry, int field, uint16_t *p_luma, int32_t softness, composite_line_fn line_fn )
 {
 	int ret = 0;
-	int i, j;
+	int i;
 	int x_src = 0, y_src = 0;
 	int32_t weight = ( 1 << 16 ) * ( geometry.mix / 100 );
-	int stride_src = width_src * bpp;
-	int stride_dest = width_dest * bpp;
+	int step = ( field > -1 ) ? 2 : 1;
+	int bpp = 2;
+	int stride_src = width_src * bpp * step;
+	int stride_dest = width_dest * bpp * step;
+	int alpha_stride = stride_src / bpp;
 
 	// Adjust to consumer scale
 	int x = geometry.x * width_dest / geometry.nw;
 	int y = geometry.y * height_dest / geometry.nh;
 
+	// Align x to a full YUYV group
 	x &= 0xfffffffe;
 	width_src &= 0xfffffffe;
 
@@ -565,38 +601,13 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, int
 		height_src--;
 	}
 
-	uint8_t *p = p_src;
-	uint8_t *q = p_dest;
-	uint8_t *o = p_dest;
-	uint16_t *l = p_luma;
-	uint8_t *z = p_alpha;
-
-	uint8_t a;
-	int32_t current_weight;
-	int32_t value;
-	int step = ( field > -1 ) ? 2 : 1;
-
-	stride_src = stride_src * step;
-	int alpha_stride = stride_src / bpp;
-	stride_dest = stride_dest * step;
+	if ( line_fn == NULL )
+		line_fn = composite_line_yuv;
 
 	// now do the compositing only to cropped extents
 	for ( i = 0; i < height_src; i += step )
 	{
-		p = p_src;
-		q = p_dest;
-		o = q;
-		l = p_luma;
-		z = p_alpha;
-
-		for ( j = 0; j < width_src; j ++ )
-		{
-			a = ( z == NULL ) ? 255 : *z ++;
-			current_weight = ( l == NULL ) ? weight : linearstep( l[ j ], l[ j ] + softness, weight );
-			value = ( current_weight * ( a + 1 ) ) >> 8;
-			*o ++ = ( *p++ * value + *q++ * ( ( 1 << 16 ) - value ) ) >> 16;
-			*o ++ = ( *p++ * value + *q++ * ( ( 1 << 16 ) - value ) ) >> 16;
-		}
+		line_fn( p_dest, p_src, width_src, p_alpha, weight, p_luma, softness );
 
 		p_src += stride_src;
 		p_dest += stride_dest;
@@ -805,7 +816,7 @@ static int get_b_frame_image( mlt_transition this, mlt_frame b_frame, uint8_t **
 	x -= x % 2;
 
 	// optimization points - no work to do
-	if ( *width <= 0 || *height <= 0 )
+	if ( *width < 1 || *height < 1 )
 		return 1;
 
 	if ( ( x < 0 && -x >= *width ) || ( y < 0 && -y >= *height ) )
@@ -895,7 +906,7 @@ mlt_frame composite_copy_region( mlt_transition this, mlt_frame a_frame, mlt_pos
 	h = result.h * height / result.nh;
 
 	x &= 0xfffffffe;
-	w &= 0xfffffffe;
+	//w &= 0xfffffffe;
 
 	// Now we need to create a new destination image
 	dest = mlt_pool_alloc( w * h * 2 );
@@ -979,7 +990,6 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 		{
 			uint8_t *dest = *image;
 			uint8_t *src = image_b;
-			int bpp = 2;
 			uint8_t *alpha = mlt_frame_get_alpha_mask( b_frame );
 			int progressive = mlt_properties_get_int( a_props, "progressive" ) ||
 					mlt_properties_get_int( a_props, "consumer_progressive" ) ||
@@ -988,6 +998,7 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 			
 			int32_t luma_softness = mlt_properties_get_double( properties, "softness" ) * ( 1 << 16 );
 			uint16_t *luma_bitmap = get_luma( properties, width_b, height_b );
+			composite_line_fn line_fn = mlt_properties_get_int( properties, "_MMX" ) ? composite_line_yuv_mmx : composite_line_yuv;
 
 			for ( field = 0; field < ( progressive ? 1 : 2 ); field++ )
 			{
@@ -1001,7 +1012,7 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 				alignment_calculate( &result );
 
 				// Composite the b_frame on the a_frame
-				composite_yuv( dest, *width, *height, bpp, src, width_b, height_b, alpha, result, progressive ? -1 : field, luma_bitmap, luma_softness );
+				composite_yuv( dest, *width, *height, src, width_b, height_b, alpha, result, progressive ? -1 : field, luma_bitmap, luma_softness, line_fn );
 			}
 		}
 	}
@@ -1030,11 +1041,19 @@ mlt_transition transition_composite_init( char *arg )
 	mlt_transition this = calloc( sizeof( struct mlt_transition_s ), 1 );
 	if ( this != NULL && mlt_transition_init( this, NULL ) == 0 )
 	{
+		mlt_properties properties = mlt_transition_properties( this );
+		
 		this->process = composite_process;
-		mlt_properties_set( mlt_transition_properties( this ), "start", arg != NULL ? arg : "85%,5%:10%x10%" );
+		
+		// Default starting motion and zoom
+		mlt_properties_set( properties, "start", arg != NULL ? arg : "85%,5%:10%x10%" );
 		
 		// Default factory
-		mlt_properties_set( mlt_transition_properties( this ), "factory", "fezzik" );
+		mlt_properties_set( properties, "factory", "fezzik" );
+
+#ifdef USE_MMX
+		//mlt_properties_set_int( properties, "_MMX", composite_have_mmx() );
+#endif
 	}
 	return this;
 }
diff --git a/src/modules/gtk2/filter_rescale.c b/src/modules/gtk2/filter_rescale.c
index 5151a05..c57e33d 100644
--- a/src/modules/gtk2/filter_rescale.c
+++ b/src/modules/gtk2/filter_rescale.c
@@ -38,6 +38,8 @@ static int filter_get_image( mlt_frame this, uint8_t **image, mlt_image_format *
 		*width = 720;
 	if ( *height == 0 )
 		*height = 576;
+	if ( *width < 2 || *height < 6 )
+		return 1;
 
 	mlt_properties properties = mlt_frame_properties( this );
 	int iwidth = *width;
-- 
1.7.4.4