From 7377df6338b43516dc92d9fecec3469f774c2222 Mon Sep 17 00:00:00 2001 From: ddennedy Date: Thu, 26 Feb 2004 14:11:49 +0000 Subject: [PATCH] updated mmx yuv scaling git-svn-id: https://mlt.svn.sourceforge.net/svnroot/mlt/trunk/mlt@171 d19143bc-622f-0410-bfdd-b5b2a6649095 --- src/modules/gtk2/Makefile | 2 +- src/modules/gtk2/pixops.c | 31 +++-- src/modules/gtk2/scale_line_22_33_mmx.S | 225 +++++++++++++++++++++---------- 3 files changed, 174 insertions(+), 84 deletions(-) diff --git a/src/modules/gtk2/Makefile b/src/modules/gtk2/Makefile index cd001dd..1ca7e8a 100644 --- a/src/modules/gtk2/Makefile +++ b/src/modules/gtk2/Makefile @@ -24,7 +24,7 @@ $(TARGET): $(OBJS) $(ASM_OBJS) have_mmx.o: $(CC) -o $@ -c have_mmx.S -scale_line_22_33_mmx.o: +scale_line_22_33_mmx.o: scale_line_22_33_mmx.S $(CC) -o $@ -c scale_line_22_33_mmx.S depend: $(SRCS) diff --git a/src/modules/gtk2/pixops.c b/src/modules/gtk2/pixops.c index 5956b6e..9ac77ba 100644 --- a/src/modules/gtk2/pixops.c +++ b/src/modules/gtk2/pixops.c @@ -59,7 +59,7 @@ typedef void ( *PixopsPixelFunc ) ( guchar *dest, guint y1, guint cr, guint y2, /* mmx function declarations */ #ifdef USE_MMX -guchar *pixops_scale_line_22_33_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init ); +guchar *pixops_scale_line_22_yuv_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init, int destx ); int pixops_have_mmx ( void ); #endif @@ -158,7 +158,7 @@ scale_line ( int *weights, int n_x, int n_y, #ifdef USE_MMX static inline guchar * -scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y, +scale_line_22_yuv_mmx_stub ( int *weights, int n_x, int n_y, guchar *dest, int dest_x, guchar *dest_end, guchar **src, int x_init, int x_step, int src_width ) @@ -178,12 +178,12 @@ scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y, mmx_weights[ j ][ 7 ] = 0x00010001 * ( weights[ 4 * j + 3 ] >> 8 ); } - return 
pixops_scale_line_22_33_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init ); + return pixops_scale_line_22_yuv_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init, dest_x ); } #endif /* USE_MMX */ static inline guchar * -scale_line_22_33 ( int *weights, int n_x, int n_y, +scale_line_22_yuv ( int *weights, int n_x, int n_y, guchar *dest, int dest_x, guchar *dest_end, guchar **src, int x_init, int x_step, int src_width ) @@ -219,16 +219,22 @@ scale_line_22_33 ( int *weights, int n_x, int n_y, /* process U/V */ x_aligned = ( ( x_scaled >> 1 ) << 2 ); q0 = src0 + x_aligned; + uv_index = ( ( dest_x & 1 ) << 1 ); + //printf( "scale_line_22_yuv: %d %d\n", x_aligned + uv_index, dest_x ); + p = w1 * q0[ uv_index + 1 ]; + p += w2 * q0[ uv_index + 1 ]; + + x += x_step; + x_scaled = x >> SCALE_SHIFT; + dest_x++; + + x_aligned = ( ( x_scaled >> 1 ) << 2 ); q1 = src1 + x_aligned; uv_index = ( ( dest_x & 1 ) << 1 ) + 1; - p = w1 * q0[ uv_index ]; - p += w2 * q0[ uv_index ]; p += w3 * q1[ uv_index ]; p += w4 * q1[ uv_index ]; *dest++ = ( p + 0x8000 ) >> SCALE_SHIFT; - x += x_step; - dest_x++; } return dest; @@ -739,12 +745,15 @@ yuv422_scale ( guchar *dest_buf, if ( filter.x.n == 2 && filter.y.n == 2 ) { #ifdef USE_MMX - if ( 0 && found_mmx ) - line_func = scale_line_22_33_mmx_stub; + if ( found_mmx ) + { + //fprintf( stderr, "rescale: using mmx\n" ); + line_func = scale_line_22_yuv_mmx_stub; + } else #endif - line_func = scale_line_22_33; + line_func = scale_line_22_yuv; } else line_func = scale_line; diff --git a/src/modules/gtk2/scale_line_22_33_mmx.S b/src/modules/gtk2/scale_line_22_33_mmx.S index f389217..d4e73b4 100644 --- a/src/modules/gtk2/scale_line_22_33_mmx.S +++ b/src/modules/gtk2/scale_line_22_33_mmx.S @@ -1,32 +1,39 @@ - .file "scale_line_22_33_mmx.S" + .file "scale_line_22_yuv_mmx.S" .version "01.01" + +.extern printf + gcc2_compiled.: +.data +MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n" + .text .align 16 #if 
!defined(__MINGW32__) && !defined(__CYGWIN__) -.globl pixops_scale_line_22_33_mmx - .type pixops_scale_line_22_33_mmx,@function -pixops_scale_line_22_33_mmx: +.globl pixops_scale_line_22_yuv_mmx + .type pixops_scale_line_22_yuv_mmx,@function +pixops_scale_line_22_yuv_mmx: #else -.globl _pixops_scale_line_22_33_mmx -_pixops_scale_line_22_33_mmx: +.globl _pixops_scale_line_22_yuv_mmx +_pixops_scale_line_22_yuv_mmx: #endif /* * Arguments * * weights: 8(%ebp) - * p: 12(%ebp) %esi - * q1: 16(%ebp) - * q2: 20(%ebp) + * p (dest): 12(%ebp) %esi + * q1 (src0): 16(%ebp) + * q2 (src1): 20(%ebp) * xstep: 24(%ebp) * p_end: 28(%ebp) * xinit: 32(%ebp) - * + * destx: 36(%ebp) + * */ /* @@ -38,52 +45,95 @@ _pixops_scale_line_22_33_mmx: pushl %edi pushl %esi pushl %ebx -/* Locals: +/* Locals: * int x %ebx * int x_scaled -24(%ebp) + * int dest_x 36(%ebp) */ /* * Setup */ -/* Initialize variables */ - movl 32(%ebp),%ebx - movl 32(%ebp),%edx - sarl $16,%edx - movl 12(%ebp),%esi +/* Initialize variables */ + movl 36(%ebp),%eax # destx + movl %eax,36(%ebp) + movl 32(%ebp),%ebx # x + movl 12(%ebp),%esi # dest - cmpl 28(%ebp),%esi + cmpl 28(%ebp),%esi # dest == dest_end ? 
jnb .out -/* For the body of this loop, %mm01, %mm1, %mm2, %mm3 hold the 4 adjoining +/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining * points we are interpolating between, as: * - * 000000BB00GG00RR - */ - + * 0000000000UV00YY + */ + /* Load initial values into %mm1, %mm3 */ - leal (%edx,%edx,2),%edx # Multiply by 3 - movl 16(%ebp),%edi - pxor %mm4, %mm4 - movzbl 2(%edi,%edx),%ecx - shll $16,%ecx - movzwl (%edi,%edx),%eax - orl %eax,%ecx + /* x_scaled = ( x >> 16 ) * stride */ + movl %ebx, %edx + sarl $16,%edx + sall $1, %edx + + /* load from src0 */ + movl 16(%ebp), %edi + movzbl (%edi,%edx), %ecx + + /* x_aligned = x_scaled divided by 2 and multiplied by 4 */ + movl %ebx, %edx + sarl $17, %edx + sall $2, %edx + + /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */ + movl 36(%ebp), %eax + andl $1, %eax + sall $1, %eax + addl %eax, %edx + movzbl 1(%edi,%edx), %eax + shll $8, %eax + orl %eax, %ecx + movd %ecx, %mm1 + pxor %mm4, %mm4 punpcklbw %mm4, %mm1 - movl 20(%ebp),%edi - movzbl 2(%edi,%edx),%ecx - shll $16,%ecx - movzwl (%edi,%edx),%eax - orl %eax,%ecx + /* x_scaled = ( x >> 16 ) * stride */ + movl %ebx, %edx + sarl $16, %edx + sall $1, %edx + + /* load from src1 */ + movl 20(%ebp), %edi + movzbl (%edi,%edx), %ecx + + /* x_aligned = x_scaled divided by 2 and multiplied by 4 */ + movl %ebx, %edx + sarl $17, %edx + sall $2, %edx + + /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */ + movl 36(%ebp), %eax + andl $1, %eax + sall $1, %eax + addl %eax, %edx + movzbl 1(%edi,%edx), %eax + shll $8, %eax + orl %eax, %ecx + movd %ecx, %mm3 punpcklbw %mm4, %mm3 - addl $65536,%ebx - movl %ebx,%edx - sarl $16,%edx + /* dest_x++; */ + movl 36(%ebp), %eax + addl $1, %eax + movl %eax, 36(%ebp) + + /* x_scaled = x >> 16 */ + addl $65536, %ebx + movl %ebx, %edx + sarl $16, %edx + movl %edx, -24(%ebp) jmp .newx .p2align 4,,7 @@ -110,64 +160,95 @@ _pixops_scale_line_22_33_mmx: paddw %mm6, %mm7 paddw %mm5, %mm7 -/* %mm7 holds the accumulated sum.
Compute (C + 0x80) / 256 +/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256 */ pxor %mm4, %mm4 - movl $8421504, %eax # 0x00808080 - movd %eax, %mm6 + movl $0x80808080, %eax + movd %eax, %mm6 punpcklbw %mm4, %mm6 paddw %mm6, %mm7 psrlw $8, %mm7 /* Pack into %eax and store result - */ + */ packuswb %mm7, %mm7 movd %mm7, %eax - - movb %al, (%esi) + + movb %al, 0(%esi) # *dest = y shrl $8, %eax - movw %ax, 1(%esi) - addl $3, %esi - - cmpl %esi,28(%ebp) - je .out - -/* x += x_step; */ - addl 24(%ebp),%ebx -/* x_scaled = x >> 16; */ - movl %ebx,%edx - sarl $16,%edx + movb %al, 1(%esi) # *dest = uv + + addl $2, %esi # dest += 2 + + cmpl %esi,28(%ebp) # if dest == dest_end ? + je .out # then exit + + movl 36(%ebp), %eax # get dest_x + addl $1, %eax # dest_x++ + movl %eax, 36(%ebp) # put dest_x - cmpl %edx,-24(%ebp) - je .loop + addl 24(%ebp), %ebx # x += x_step + + movl %ebx, %edx # x_scaled = x ... + sarl $16, %edx # >> 16 + movl %edx, -24(%ebp) # save x_scaled .newx: - movl %edx,-24(%ebp) + /* * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2 */ movq %mm1, %mm0 movq %mm3, %mm2 - - leal (%edx,%edx,2),%edx # Multiply by 3 - movl 16(%ebp),%edi - movzbl 2(%edi,%edx),%ecx - shll $16,%ecx - movzwl (%edi,%edx),%eax - orl %eax,%ecx - movd %ecx, %mm1 + sall $1, %edx # x_scaled *= channels + + movl 16(%ebp), %edi # get src0 + movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ] + + sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ... + sall $2, %edx # << 2 + + movl 36(%ebp), %eax # uv_index = dest_x ... + #pushl %eax + andl $1, %eax # ( dest_x & 1 ) ... 
+ sall $1, %eax # << 1 + addl %eax, %edx # x_aligned += uv_index + #pushl %edx + #pushl $MSG + #call printf + #popl %edx + #popl %edx + #popl %edx + movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ] + shll $8, %eax # store uv + orl %eax, %ecx + + movd %ecx, %mm1 # move to mmx1 punpcklbw %mm4, %mm1 - movl 20(%ebp),%edi - movzbl 2(%edi,%edx),%ecx - shll $16,%ecx - movzwl (%edi,%edx),%eax - orl %eax,%ecx - movd %ecx, %mm3 + movl %ebx, %edx # x_scaled = x ... + sarl $16, %edx # >> 16 + sall $1, %edx # x_scaled *= channels + + movl 20(%ebp), %edi # get src1 + movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ] + + sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ... + sall $2, %edx # << 2 + + movl 36(%ebp), %eax # uv_index = dest_x ... + andl $1, %eax # ( dest_x & 1 ) ... + sall $1, %eax # << 1 + addl %eax, %edx # x_aligned += uv_index + movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ] + shll $8, %eax # store uv + orl %eax, %ecx + + movd %ecx, %mm3 # move to mmx3 punpcklbw %mm4, %mm3 - - movl 8(%ebp),%edi + + movl 8(%ebp), %edi # get weights pointer jmp .loop -- 1.7.4.4