cmpl 28(%ebp),%esi # dest == dest_end ?
jnb .out
+ addl $65536, %ebx
+ .p2align 4,,7
+ pxor %mm4, %mm4
+
/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
* points we are interpolating between, as:
*
- * 0000000000UV00YY
+ * 0000000000UV00YY (one point per register; pixel 2 of each row is in the paired register)
*/
-/* Load initial values into %mm1, %mm3 */
+.loop:
+
+ pxor %mm4, %mm4 # re-zero: the weights movq below clobbers %mm4, but punpcklbw needs it zero every iteration
+/*
+ * Load current values from pixel 1
+ */
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ sall $1, %edx # x_scaled *= channels
+
+ movl 16(%ebp), %edi # get src0
+ movzbl 2(%edi,%edx), %ecx # next y = src0[ x_scaled + 2 ]
+ /* wish we had a register for this */
+ movl %ecx, -24(%ebp) # save next y
+ movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
- /* x_scaled = ( x >> 16 ) * stride */
- movl %ebx, %edx
- sarl $16,%edx
- sall $1, %edx
+ sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
+ sall $2, %edx # << 2
- /* load from src0 */
- movl 16(%ebp), %edi
- movzbl (%edi,%edx), %ecx
+ movl 36(%ebp), %eax # uv_index = dest_x ...
+ andl $1, %eax # ( dest_x & 1 ) ...
+ sall $1, %eax # << 1
+ addl %eax, %edx # x_aligned += uv_index
- /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
- movl %ebx, %edx
- sarl $17, %edx
- sall $2, %edx
+ movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
+ shll $8, %eax # position uv
+ orl %eax, %ecx # store uv
- /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
- movl 36(%ebp), %eax
- andl $1, %eax
- sall $1, %eax
- addl %eax, %edx
- movzbl 1(%edi,%edx), %eax
- shll $8, %eax
- orl %eax, %ecx
+ movd %ecx, %mm0 # move to mmx0
+ punpcklbw %mm4, %mm0
- movd %ecx, %mm1
- pxor %mm4, %mm4
+ movl -24(%ebp), %ecx # restore next y
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm1 # move to mmx1
punpcklbw %mm4, %mm1
- /* x_scaled = ( x >> 16 ) * stride */
- movl %ebx, %edx
- sarl $16, %edx
- sall $1, %edx
-
- /* load from src1 */
- movl 20(%ebp), %edi
- movzbl (%edi,%edx), %ecx
-
- /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
- movl %ebx, %edx
- sarl $17, %edx
- sall $2, %edx
-
- /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
- movl 36(%ebp), %eax
- andl $1, %eax
- sall $1, %eax
- addl %eax, %edx
- movzbl (%edi,%edx), %eax
- shll $8, %eax
- orl %eax, %ecx
-
- movd %ecx, %mm3
- punpcklbw %mm4, %mm3
+ movl 20(%ebp), %edi # get src1
- /* dest_x++; */
- movl 36(%ebp), %eax
- addl $1, %eax
- movl %eax, 36(%ebp)
+ /* do u/v first since we already have x_aligned */
+ movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
+ shll $8, %eax # position uv
- /* x_scaled = x >> 16 */
- addl $65536, %ebx
- movl %ebx, %edx
- sarl $16, %edx
- movl %edx, -24(%ebp)
+ /* which is faster? 2 moves in and out of memory, or
+ 1 move between registers and 2 shifts? I wager the latter. */
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ sall $1, %edx # x_scaled *= channels
+
+ movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ]
+ movl %ecx, -24(%ebp) # save next y (%ecx, just loaded; %eax still holds uv<<8 and %ecx is overwritten next)
+ movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm2 # move to mmx2
+ punpcklbw %mm4, %mm2
+
+ movl -24(%ebp), %ecx # restore next y
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm3 # move to mmx3
+ punpcklbw %mm4, %mm3
- jmp .newx
- .p2align 4,,7
-.loop:
/* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
* 16 4 0xf 2 2
*/
- movl %ebx,%eax
- andl $0xf000,%eax
- shrl $7,%eax
+ movl 8(%ebp), %edi # get weights pointer
+ movl %ebx, %eax
+ andl $0xf000, %eax
+ shrl $7, %eax
-/* At this point, %edi holds weights. Load the 4 weights into %mm4,%mm5,%mm6,%mm7, multiply and
- * accumulate.
+/* At this point, %edi holds weights. Load the 4 weights into
+ * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
*/
- movq (%edi,%eax),%mm4
- pmullw %mm0,%mm4
- movq 8(%edi,%eax),%mm5
- pmullw %mm1,%mm5
- movq 16(%edi,%eax),%mm6
- movq 24(%edi,%eax),%mm7
+ movq (%edi,%eax), %mm4
+ pmullw %mm0, %mm4
+ movq 8(%edi,%eax), %mm5
+ pmullw %mm1, %mm5
+ movq 16(%edi,%eax), %mm6
pmullw %mm2,%mm6
+ movq 24(%edi,%eax), %mm7
pmullw %mm3,%mm7
+
paddw %mm4, %mm5
paddw %mm6, %mm7
paddw %mm5, %mm7
packuswb %mm7, %mm7
movd %mm7, %eax
- movb %al, 0(%esi) # *dest = y
+ movb %al, 0(%esi) # dest[ 0 ] = y
shrl $8, %eax
- movb %al, 1(%esi) # *dest = uv
+ movb %al, 1(%esi) # dest[ 1 ] = uv
addl $2, %esi # dest += 2
- cmpl %esi,28(%ebp) # if dest == dest_end ?
+ cmpl %esi,28(%ebp) # if dest == dest_end
je .out # then exit
- movl 36(%ebp), %eax # get dest_x
- addl $1, %eax # dest_x++
- movl %eax, 36(%ebp) # put dest_x
-
addl 24(%ebp), %ebx # x += x_step
+ addl $1, 36(%ebp) # dest_x++
- movl %ebx, %edx # x_scaled = x ...
- sarl $16, %edx # >> 16
- movl %edx, -24(%ebp) # save x_scaled
-
-.newx:
-
-/*
- * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
- */
- movq %mm1, %mm0
- movq %mm3, %mm2
-
- sall $1, %edx # x_scaled *= channels
-
- movl 16(%ebp), %edi # get src0
- movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
-
- sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
- sall $2, %edx # << 2
-
- movl 36(%ebp), %eax # uv_index = dest_x ...
- #pushl %eax
- andl $1, %eax # ( dest_x & 1 ) ...
- sall $1, %eax # << 1
- addl %eax, %edx # x_aligned += uv_index
- #pushl %edx
- #pushl $MSG
- #call printf
- #popl %edx
- #popl %edx
- #popl %edx
- movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
- shll $8, %eax # store uv
- orl %eax, %ecx
-
- movd %ecx, %mm1 # move to mmx1
- punpcklbw %mm4, %mm1
-
- movl %ebx, %edx # x_scaled = x ...
- sarl $16, %edx # >> 16
- sall $1, %edx # x_scaled *= channels
-
- movl 20(%ebp), %edi # get src1
- movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
-
- sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
- sall $2, %edx # << 2
-
- movl 36(%ebp), %eax # uv_index = dest_x ...
- andl $1, %eax # ( dest_x & 1 ) ...
- sall $1, %eax # << 1
- addl %eax, %edx # x_aligned += uv_index
- movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
- shll $8, %eax # store uv
- orl %eax, %ecx
-
- movd %ecx, %mm3 # move to mmx3
- punpcklbw %mm4, %mm3
-
- movl 8(%ebp), %edi # get weights pointer
-
jmp .loop
.out: