cmpl 28(%ebp),%esi # dest == dest_end ?
jnb .out
+ addl $65536, %ebx
+ .p2align 4,,7
+ pxor %mm4, %mm4
+
/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
* points we are interpolating between, as:
*
- * 0000000000UV00YY
+ * 0000000000UV00YY (one point per register; pixel 2 of each row is in the paired register)
*/
-/* Load initial values into %mm1, %mm3 */
+.loop:
+
+ pxor %mm4, %mm4 # re-zero: the weights movq below clobbers %mm4, but punpcklbw needs it zero every iteration
+/*
+ * Load current values from pixel 1
+ */
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ sall $1, %edx # x_scaled *= channels
+
+ movl 16(%ebp), %edi # get src0
+ movzbl 2(%edi,%edx), %ecx # next y = src0[ x_scaled + 2 ]
+ /* wish we had a register for this */
+ movl %ecx, -24(%ebp) # save next y
+ movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
- /* x_scaled = ( x >> 16 ) * stride */
- movl %ebx, %edx
- sarl $16,%edx
- sall $1, %edx
+ sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
+ sall $2, %edx # << 2
- /* load from src0 */
- movl 16(%ebp), %edi
- movzbl (%edi,%edx), %ecx
+ movl 36(%ebp), %eax # uv_index = dest_x ...
+ andl $1, %eax # ( dest_x & 1 ) ...
+ sall $1, %eax # << 1
+ addl %eax, %edx # x_aligned += uv_index
- /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
- movl %ebx, %edx
- sarl $17, %edx
- sall $2, %edx
+ movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
+ shll $8, %eax # position uv
+ orl %eax, %ecx # store uv
- /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
- movl 36(%ebp), %eax
- andl $1, %eax
- sall $1, %eax
- addl %eax, %edx
- movzbl 1(%edi,%edx), %eax
- shll $8, %eax
- orl %eax, %ecx
+ movd %ecx, %mm0 # move to mmx0
+ punpcklbw %mm4, %mm0
- movd %ecx, %mm1
- pxor %mm4, %mm4
+ movl -24(%ebp), %ecx # restore next y
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm1 # move to mmx1
punpcklbw %mm4, %mm1
- /* x_scaled = ( x >> 16 ) * stride */
- movl %ebx, %edx
- sarl $16, %edx
- sall $1, %edx
-
- /* load from src1 */
- movl 20(%ebp), %edi
- movzbl (%edi,%edx), %ecx
-
- /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
- movl %ebx, %edx
- sarl $17, %edx
- sall $2, %edx
-
- /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
- movl 36(%ebp), %eax
- andl $1, %eax
- sall $1, %eax
- addl %eax, %edx
- movzbl (%edi,%edx), %eax
- shll $8, %eax
- orl %eax, %ecx
-
- movd %ecx, %mm3
- punpcklbw %mm4, %mm3
+ movl 20(%ebp), %edi # get src1
- /* dest_x++; */
- movl 36(%ebp), %eax
- addl $1, %eax
- movl %eax, 36(%ebp)
+ /* do u/v first since we already have x_aligned */
+ movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
+ shll $8, %eax # position uv
- /* x_scaled = x >> 16 */
- addl $65536, %ebx
- movl %ebx, %edx
- sarl $16, %edx
- movl %edx, -24(%ebp)
+ /* which is faster? 2 moves in and out of memory, or
+ 1 move between registers and 2 shifts? I wager the latter. */
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ sall $1, %edx # x_scaled *= channels
+
+ movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ]
+ movl %ecx, -24(%ebp) # save next y (%ecx, just loaded; %eax still holds uv<<8 and %ecx is overwritten next)
+ movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm2 # move to mmx2
+ punpcklbw %mm4, %mm2
+
+ movl -24(%ebp), %ecx # restore next y
+ orl %eax, %ecx # store uv
+
+ movd %ecx, %mm3 # move to mmx3
+ punpcklbw %mm4, %mm3
- jmp .newx
- .p2align 4,,7
-.loop:
/* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
* 16 4 0xf 2 2
*/
- movl %ebx,%eax
- andl $0xf000,%eax
- shrl $7,%eax
+ movl 8(%ebp), %edi # get weights pointer
+ movl %ebx, %eax
+ andl $0xf000, %eax
+ shrl $7, %eax
-/* At this point, %edi holds weights. Load the 4 weights into %mm4,%mm5,%mm6,%mm7, multiply and
- * accumulate.
+/* At this point, %edi holds weights. Load the 4 weights into
+ * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
*/
- movq (%edi,%eax),%mm4
- pmullw %mm0,%mm4
- movq 8(%edi,%eax),%mm5
- pmullw %mm1,%mm5
- movq 16(%edi,%eax),%mm6
- movq 24(%edi,%eax),%mm7
+ movq (%edi,%eax), %mm4
+ pmullw %mm0, %mm4
+ movq 8(%edi,%eax), %mm5
+ pmullw %mm1, %mm5
+ movq 16(%edi,%eax), %mm6
pmullw %mm2,%mm6
+ movq 24(%edi,%eax), %mm7
pmullw %mm3,%mm7
+
paddw %mm4, %mm5
paddw %mm6, %mm7
paddw %mm5, %mm7
packuswb %mm7, %mm7
movd %mm7, %eax
- movb %al, 0(%esi) # *dest = y
+ movb %al, 0(%esi) # dest[ 0 ] = y
shrl $8, %eax
- movb %al, 1(%esi) # *dest = uv
+ movb %al, 1(%esi) # dest[ 1 ] = uv
addl $2, %esi # dest += 2
- cmpl %esi,28(%ebp) # if dest == dest_end ?
+ cmpl %esi,28(%ebp) # if dest == dest_end
je .out # then exit
- movl 36(%ebp), %eax # get dest_x
- addl $1, %eax # dest_x++
- movl %eax, 36(%ebp) # put dest_x
-
addl 24(%ebp), %ebx # x += x_step
+ addl $1, 36(%ebp) # dest_x++
- movl %ebx, %edx # x_scaled = x ...
- sarl $16, %edx # >> 16
- movl %edx, -24(%ebp) # save x_scaled
-
-.newx:
-
-/*
- * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
- */
- movq %mm1, %mm0
- movq %mm3, %mm2
-
- sall $1, %edx # x_scaled *= channels
-
- movl 16(%ebp), %edi # get src0
- movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
-
- sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
- sall $2, %edx # << 2
-
- movl 36(%ebp), %eax # uv_index = dest_x ...
- #pushl %eax
- andl $1, %eax # ( dest_x & 1 ) ...
- sall $1, %eax # << 1
- addl %eax, %edx # x_aligned += uv_index
- #pushl %edx
- #pushl $MSG
- #call printf
- #popl %edx
- #popl %edx
- #popl %edx
- movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
- shll $8, %eax # store uv
- orl %eax, %ecx
-
- movd %ecx, %mm1 # move to mmx1
- punpcklbw %mm4, %mm1
-
- movl %ebx, %edx # x_scaled = x ...
- sarl $16, %edx # >> 16
- sall $1, %edx # x_scaled *= channels
-
- movl 20(%ebp), %edi # get src1
- movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
-
- sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
- sall $2, %edx # << 2
-
- movl 36(%ebp), %eax # uv_index = dest_x ...
- andl $1, %eax # ( dest_x & 1 ) ...
- sall $1, %eax # << 1
- addl %eax, %edx # x_aligned += uv_index
- movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
- shll $8, %eax # store uv
- orl %eax, %ecx
-
- movd %ecx, %mm3 # move to mmx3
- punpcklbw %mm4, %mm3
-
- movl 8(%ebp), %edi # get weights pointer
-
jmp .loop
.out: