From c56ed08b4a9f46677880cd7f550450eb41871565 Mon Sep 17 00:00:00 2001 From: ddennedy Date: Fri, 27 Feb 2004 04:45:44 +0000 Subject: [PATCH] mmx version of non-nearest, 2x2 rescaling git-svn-id: https://mlt.svn.sourceforge.net/svnroot/mlt/trunk/mlt@173 d19143bc-622f-0410-bfdd-b5b2a6649095 --- src/modules/gtk2/Makefile | 6 +- src/modules/gtk2/pixops.c | 20 +- ...le_line_22_33_mmx.S => scale_line_22_yuv_mmx.S} | 213 +++++++------------- 3 files changed, 85 insertions(+), 154 deletions(-) rename src/modules/gtk2/{scale_line_22_33_mmx.S => scale_line_22_yuv_mmx.S} (56%) diff --git a/src/modules/gtk2/Makefile b/src/modules/gtk2/Makefile index 1ca7e8a..3e8cd62 100644 --- a/src/modules/gtk2/Makefile +++ b/src/modules/gtk2/Makefile @@ -8,7 +8,7 @@ OBJS = factory.o \ filter_rescale.o ASM_OBJS = have_mmx.o \ - scale_line_22_33_mmx.o + scale_line_22_yuv_mmx.o CFLAGS = -O3 -DUSE_MMX `pkg-config gdk-pixbuf-2.0 --cflags` `pkg-config pangoft2 --cflags` -I../../ -Wall -g -D_FILE_OFFSET_BITS=64 -pthread @@ -24,8 +24,8 @@ $(TARGET): $(OBJS) $(ASM_OBJS) have_mmx.o: $(CC) -o $@ -c have_mmx.S -scale_line_22_33_mmx.o: scale_line_22_33_mmx.S - $(CC) -o $@ -c scale_line_22_33_mmx.S +scale_line_22_yuv_mmx.o: scale_line_22_yuv_mmx.S + $(CC) -o $@ -c scale_line_22_yuv_mmx.S depend: $(SRCS) $(CC) -MM $(CFLAGS) $^ 1>.depend diff --git a/src/modules/gtk2/pixops.c b/src/modules/gtk2/pixops.c index 9ac77ba..a835f70 100644 --- a/src/modules/gtk2/pixops.c +++ b/src/modules/gtk2/pixops.c @@ -218,23 +218,19 @@ scale_line_22_yuv ( int *weights, int n_x, int n_y, /* process U/V */ x_aligned = ( ( x_scaled >> 1 ) << 2 ); - q0 = src0 + x_aligned; - uv_index = ( ( dest_x & 1 ) << 1 ); - //printf( "scale_line_22_yuv: %d %d\n", x_aligned + uv_index, dest_x ); - p = w1 * q0[ uv_index + 1 ]; - p += w2 * q0[ uv_index + 1 ]; - - x += x_step; - x_scaled = x >> SCALE_SHIFT; - dest_x++; + uv_index = ( ( dest_x & 1 ) << 1 ) + 1; - x_aligned = ( ( x_scaled >> 1 ) << 2 ); + q0 = src0 + x_aligned; q1 = src1 + 
x_aligned; - uv_index = ( ( dest_x & 1 ) << 1 ) + 1; + p = w1 * q0[ uv_index ]; p += w3 * q1[ uv_index ]; + p += w2 * q0[ uv_index ]; p += w4 * q1[ uv_index ]; - *dest++ = ( p + 0x8000 ) >> SCALE_SHIFT; + + x += x_step; + dest_x ++; + *dest++ = ( p + 0x8000 ) >> SCALE_SHIFT; } return dest; diff --git a/src/modules/gtk2/scale_line_22_33_mmx.S b/src/modules/gtk2/scale_line_22_yuv_mmx.S similarity index 56% rename from src/modules/gtk2/scale_line_22_33_mmx.S rename to src/modules/gtk2/scale_line_22_yuv_mmx.S index d4e73b4..cc389ad 100644 --- a/src/modules/gtk2/scale_line_22_33_mmx.S +++ b/src/modules/gtk2/scale_line_22_yuv_mmx.S @@ -63,99 +63,98 @@ _pixops_scale_line_22_yuv_mmx: cmpl 28(%ebp),%esi # dest == dest_end ? jnb .out + addl $65536, %ebx + .p2align 4,,7 + pxor %mm4, %mm4 + /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining * points we are interpolating between, as: * - * 0000000000UV00YY + * 00UV00Y200UV00Y1 */ -/* Load initial values into %mm1, %mm3 */ +.loop: + +/* + * Load current values from pixel 1 + */ + movl %ebx, %edx # x_scaled = x ... + sarl $16, %edx # >> 16 + sall $1, %edx # x_scaled *= channels + + movl 16(%ebp), %edi # get src0 + movzbl 2(%edi,%edx), %ecx # next y = src0[ x_scaled + 2 ] + /* wish we had a register for this */ + movl %ecx, -24(%ebp) # save next y + movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ] - /* x_scaled = ( x >> 16 ) * stride */ - movl %ebx, %edx - sarl $16,%edx - sall $1, %edx + sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ... + sall $2, %edx # << 2 - /* load from src0 */ - movl 16(%ebp), %edi - movzbl (%edi,%edx), %ecx + movl 36(%ebp), %eax # uv_index = dest_x ... + andl $1, %eax # ( dest_x & 1 ) ... 
+ sall $1, %eax # << 1 + addl %eax, %edx # x_aligned += uv_index - /* x_aligned = x_scaled divided by 2 and multiplied by 4 */ - movl %ebx, %edx - sarl $17, %edx - sall $2, %edx + movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ] + shll $8, %eax # position uv + orl %eax, %ecx # store uv - /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */ - movl 36(%ebp), %eax - andl $1, %eax - sall $1, %eax - addl %eax, %edx - movzbl 1(%edi,%edx), %eax - shll $8, %eax - orl %eax, %ecx + movd %ecx, %mm0 # move to mmx0 + punpcklbw %mm4, %mm0 - movd %ecx, %mm1 - pxor %mm4, %mm4 + movl -24(%ebp), %ecx # restore next y + orl %eax, %ecx # store uv + + movd %ecx, %mm1 # move to mmx1 punpcklbw %mm4, %mm1 - /* x_scaled = ( x >> 16 ) * stride */ - movl %ebx, %edx - sarl $16, %edx - sall $1, %edx - - /* load from src1 */ - movl 20(%ebp), %edi - movzbl (%edi,%edx), %ecx - - /* x_aligned = x_scaled divided by 2 and multiplied by 4 */ - movl %ebx, %edx - sarl $17, %edx - sall $2, %edx - - /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */ - movl 36(%ebp), %eax - andl $1, %eax - sall $1, %eax - addl %eax, %edx - movzbl (%edi,%edx), %eax - shll $8, %eax - orl %eax, %ecx - - movd %ecx, %mm3 - punpcklbw %mm4, %mm3 + movl 20(%ebp), %edi # get src1 - /* dest_x++; */ - movl 36(%ebp), %eax - addl $1, %eax - movl %eax, 36(%ebp) + /* do u/v first since we already have x_aligned */ + movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ] + shll $8, %eax # position uv - /* x_scaled = x >> 16 */ - addl $65536, %ebx - movl %ebx, %edx - sarl $16, %edx - movl %edx, -24(%ebp) + /* which is faster? 2 moves in and out of memory, or + 1 move between registers and 2 shifts? I wager the latter. */ + movl %ebx, %edx # x_scaled = x ... 
+ sarl $16, %edx # >> 16 + sall $1, %edx # x_scaled *= channels + + movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ] + movl %ecx, -24(%ebp) # save next y (it is in %ecx; %eax holds the shifted uv) + movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ] + orl %eax, %ecx # store uv + + movd %ecx, %mm2 # move to mmx2 + punpcklbw %mm4, %mm2 + + movl -24(%ebp), %ecx # restore next y + orl %eax, %ecx # store uv + + movd %ecx, %mm3 # move to mmx3 + punpcklbw %mm4, %mm3 - jmp .newx - .p2align 4,,7 -.loop: /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y * 16 4 0xf 2 2 */ - movl %ebx,%eax - andl $0xf000,%eax - shrl $7,%eax + movl 8(%ebp), %edi # get weights pointer + movl %ebx, %eax + andl $0xf000, %eax + shrl $7, %eax -/* At this point, %edi holds weights. Load the 4 weights into %mm4,%mm5,%mm6,%mm7, multiply and - * accumulate. +/* At this point, %edi holds weights. Load the 4 weights into + * %mm4,%mm5,%mm6,%mm7, multiply and accumulate. */ - movq (%edi,%eax),%mm4 - pmullw %mm0,%mm4 - movq 8(%edi,%eax),%mm5 - pmullw %mm1,%mm5 - movq 16(%edi,%eax),%mm6 - movq 24(%edi,%eax),%mm7 + movq (%edi,%eax), %mm4 + pmullw %mm0, %mm4 + movq 8(%edi,%eax), %mm5 + pmullw %mm1, %mm5 + movq 16(%edi,%eax), %mm6 pmullw %mm2,%mm6 + movq 24(%edi,%eax), %mm7 pmullw %mm3,%mm7 + paddw %mm4, %mm5 paddw %mm6, %mm7 paddw %mm5, %mm7 @@ -174,82 +173,18 @@ _pixops_scale_line_22_yuv_mmx: packuswb %mm7, %mm7 movd %mm7, %eax - movb %al, 0(%esi) # *dest = y + movb %al, 0(%esi) # dest[ 0 ] = y shrl $8, %eax - movb %al, 1(%esi) # *dest = uv + movb %al, 1(%esi) # dest[ 1 ] = uv addl $2, %esi # dest += 2 - cmpl %esi,28(%ebp) # if dest == dest_end ? + cmpl %esi,28(%ebp) # if dest == dest_end je .out # then exit - movl 36(%ebp), %eax # get dest_x - addl $1, %eax # dest_x++ - movl %eax, 36(%ebp) # put dest_x - addl 24(%ebp), %ebx # x += x_step + addl $1, 36(%ebp) # dest_x++ - movl %ebx, %edx # x_scaled = x ... 
- sarl $16, %edx # >> 16 - movl %edx, -24(%ebp) # save x_scaled - -.newx: - -/* - * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2 - */ - movq %mm1, %mm0 - movq %mm3, %mm2 - - sall $1, %edx # x_scaled *= channels - - movl 16(%ebp), %edi # get src0 - movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ] - - sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ... - sall $2, %edx # << 2 - - movl 36(%ebp), %eax # uv_index = dest_x ... - #pushl %eax - andl $1, %eax # ( dest_x & 1 ) ... - sall $1, %eax # << 1 - addl %eax, %edx # x_aligned += uv_index - #pushl %edx - #pushl $MSG - #call printf - #popl %edx - #popl %edx - #popl %edx - movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ] - shll $8, %eax # store uv - orl %eax, %ecx - - movd %ecx, %mm1 # move to mmx1 - punpcklbw %mm4, %mm1 - - movl %ebx, %edx # x_scaled = x ... - sarl $16, %edx # >> 16 - sall $1, %edx # x_scaled *= channels - - movl 20(%ebp), %edi # get src1 - movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ] - - sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ... - sall $2, %edx # << 2 - - movl 36(%ebp), %eax # uv_index = dest_x ... - andl $1, %eax # ( dest_x & 1 ) ... - sall $1, %eax # << 1 - addl %eax, %edx # x_aligned += uv_index - movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ] - shll $8, %eax # store uv - orl %eax, %ecx - - movd %ecx, %mm3 # move to mmx3 - punpcklbw %mm4, %mm3 - - movl 8(%ebp), %edi # get weights pointer - jmp .loop .out: -- 1.7.4.4