diff --git a/src/modules/gtk2/scale_line_22_yuv_mmx.S b/src/modules/gtk2/scale_line_22_yuv_mmx.S
index cc389ad..d78b341 100644
--- a/src/modules/gtk2/scale_line_22_yuv_mmx.S
+++ b/src/modules/gtk2/scale_line_22_yuv_mmx.S
@@ -1,3 +1,22 @@
+/*
+ * scale_line_22_yuv_mmx.S -- scale line in YUY2 format
+ * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
+ * Author: Dan Dennedy
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
 	.file	"scale_line_22_yuv_mmx.S"
 	.version	"01.01"
 
@@ -25,16 +44,16 @@ _pixops_scale_line_22_yuv_mmx:
 /*
  * Arguments
  *
- *	weights:	8(%ebp)
+ *	weights: 8(%ebp)
  *	p (dest): 12(%ebp)	%esi
  *	q1 (src0): 16(%ebp)
  *	q2 (src1): 20(%ebp)
  *	xstep: 24(%ebp)
  *	p_end: 28(%ebp)
  *	xinit: 32(%ebp)
- *	destx: 36(%ebp)
+ *	dest_x: 36(%ebp)
  *
-*/
+ */
 
 /*
  * Function call entry
  */
@@ -63,78 +82,45 @@ _pixops_scale_line_22_yuv_mmx:
 	cmpl 28(%ebp),%esi		# dest == dest_end ?
 	jnb .out
 
-	addl $65536, %ebx
-	.p2align 4,,7
-	pxor %mm4, %mm4
-
 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  * points we are interpolating between, as:
  *
- *	00UV00Y200UV00Y1
+ *	00VV00Y200UU00Y1
  */
-.loop:
-
+	pxor %mm4, %mm4
 /*
- * Load current values from pixel 1
+ * Load next component values into mm1 (src0) and mm3 (src1)
  */
-	movl %ebx, %edx			# x_scaled = x ...
-	sarl $16, %edx			# >> 16
-	sall $1, %edx			# x_scaled *= channels
+	movl %ebx, %eax			# x_scaled
+	sarl $15, %eax
+	andl $0xfffffffe, %eax
+	movl %eax, %edx			# x_aligned
+	andl $0xfffffffc, %edx
 
 	movl 16(%ebp), %edi		# get src0
-	movzbl 2(%edi,%edx), %ecx	# next y = src0[ x_scaled + 2 ]
-	/* wish we had a register for this */
-	movl %ecx, -24(%ebp)		# save next y
-	movzbl (%edi,%edx), %ecx	# y = src0[ x_scaled ]
-
-	sarl $2, %edx			# x_aligned = ( x_scaled / channels ) >> 1 ...
-	sall $2, %edx			# << 2
-
-	movl 36(%ebp), %eax		# uv_index = dest_x ...
-	andl $1, %eax			# ( dest_x & 1 ) ...
- sall $1, %eax # << 1 - addl %eax, %edx # x_aligned += uv_index - - movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ] - shll $8, %eax # position uv - orl %eax, %ecx # store uv - - movd %ecx, %mm0 # move to mmx0 - punpcklbw %mm4, %mm0 - - movl -24(%ebp), %ecx # restore next y - orl %eax, %ecx # store uv - + movl (%edi,%eax), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv movd %ecx, %mm1 # move to mmx1 punpcklbw %mm4, %mm1 movl 20(%ebp), %edi # get src1 - - /* do u/v first since we already have x_aligned */ - movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ] - shll $8, %eax # position uv - - /* which is faster? 2 moves in and out of memory, or - 1 move between registers and 2 shifts? I wager the latter. */ - movl %ebx, %edx # x_scaled = x ... - sarl $16, %edx # >> 16 - sall $1, %edx # x_scaled *= channels - - movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ] - movl %eax, -24(%ebp) # save next y - movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ] - orl %eax, %ecx # store uv - - movd %ecx, %mm2 # move to mmx2 - punpcklbw %mm4, %mm2 - - movl -24(%ebp), %ecx # restore next y - orl %eax, %ecx # store uv - + movl (%edi,%edx), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv movd %ecx, %mm3 # move to mmx3 punpcklbw %mm4, %mm3 + jmp .newx + + .p2align 4,,7 +.loop: + /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y * 16 4 0xf 2 2 */ @@ -173,18 +159,60 @@ _pixops_scale_line_22_yuv_mmx: packuswb %mm7, %mm7 movd %mm7, %eax - movb %al, 0(%esi) # dest[ 0 ] = y - shrl $8, %eax - movb %al, 1(%esi) # dest[ 1 ] = uv + movb %al, (%esi) # *dest = y + + movl 36(%ebp), %ecx # get dest_x + andl $1, %ecx # select u or v + sall $1, %ecx # determine offset + addl $1, %ecx # relative to x_aligned + sall $3, %ecx # offset * 8 bits/byte - addl $2, %esi # dest += 2 + movd %mm7, %eax + shrl %cl, %eax + movb %al, 1(%esi) # *dest = uv + addl $2, %esi # dest += 2 cmpl %esi,28(%ebp) # if dest == dest_end je .out # then exit - addl 24(%ebp), %ebx # x += x_step addl $1, 36(%ebp) # dest_x++ +.newx: + + addl 24(%ebp), %ebx # x += x_step +/* + * Load current component values into mm0 (src0) and mm2 (src1) + */ + movq %mm1, %mm0 + movq %mm3, %mm2 + +/* + * Load next component values into mm1 (src0) and mm3 (src1) + */ + movl %ebx, %eax # x_scaled + sarl $15, %eax + andl $0xfffffffe, %eax + movl %eax, %edx # x_aligned + andl $0xfffffffc, %edx + + movl 16(%ebp), %edi # get src0 + movl (%edi,%eax), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv + movd %ecx, %mm1 # move to mmx1 + punpcklbw %mm4, %mm1 + + movl 20(%ebp), %edi # get src1 + movl (%edi,%edx), %ecx # get y + andl $0x00ff00ff, %ecx # mask off y + movl (%edi,%edx), %eax # get uv + andl $0xff00ff00, %eax # mask off uv + orl %eax, %ecx # composite y, uv + movd %ecx, %mm3 # move to mmx3 + punpcklbw %mm4, %mm3 + jmp .loop .out:
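
For reference, here is a scalar C sketch of what the rewritten loop computes. It mirrors the structure of the new code: one dword load at the luma offset (x_scaled = (x >> 15) & ~1) masked with 0x00ff00ff, one dword load at the containing YUYV group (x_aligned = x_scaled & ~3) masked with 0xff00ff00, OR'd into a packed [Y1 U Y2 V] point, plus the .newx pipelining in which last iteration's "next" registers become this iteration's "current" ones. This is illustrative, not the module's actual C fallback: the function names, the plain >>16 normalization, and the weight ordering (src0 current/next, then src1 current/next) are assumptions, since the multiply-accumulate instructions fall outside the hunks shown; the offsets, masks, and the dest_x-parity U/V selection come straight from the listing.

#include <string.h>

#define SCALE_SHIFT    16
#define SUBSAMPLE_BITS 4
#define SUBSAMPLE_MASK 0xf

/* Fetch one packed point the way the new code does: dword at the Y
 * offset masked with 0x00ff00ff, dword at the aligned YUYV group masked
 * with 0xff00ff00, OR'd to [Y1 U Y2 V] (little-endian, as on x86). */
static void
load_point (const unsigned char *src, int x, unsigned char point[4])
{
	int x_scaled  = (x >> 15) & ~1;    /* sarl $15; andl $0xfffffffe */
	int x_aligned = x_scaled & ~3;     /* andl $0xfffffffc */
	unsigned int y, uv;

	memcpy (&y,  src + x_scaled,  4);
	memcpy (&uv, src + x_aligned, 4);
	y = (y & 0x00ff00ff) | (uv & 0xff00ff00);
	memcpy (point, &y, 4);
}

static unsigned char *
scale_line_22_yuv_sketch (const short *weights, unsigned char *dest,
                          const unsigned char *src0, const unsigned char *src1,
                          int x_step, const unsigned char *dest_end,
                          int x_init, int dest_x)
{
	unsigned char cur0[4], cur1[4], next0[4], next1[4], out[4];
	int x = x_init;
	int i;

	load_point (src0, x, next0);
	load_point (src1, x, next1);

	while (dest < dest_end) {
		const short *w;

		/* the movq %mm1,%mm0 / %mm3,%mm2 pipelining: each point
		 * is loaded once and reused as "current" next iteration */
		memcpy (cur0, next0, 4);
		memcpy (cur1, next1, 4);

		x += x_step;
		load_point (src0, x, next0);
		load_point (src1, x, next1);

		/* 4 weights per subpixel position, per the .loop comment:
		 * weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS))
		 *            & SUBSAMPLE_MASK) * n_x * n_y, n_x = n_y = 2 */
		w = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS))
		               & SUBSAMPLE_MASK) * 4;

		/* blend all four lanes as the packed multiplies do; the
		 * saturating packuswb and any rounding are elided here */
		for (i = 0; i < 4; i++)
			out[i] = (w[0] * cur0[i] + w[1] * next0[i] +
			          w[2] * cur1[i] + w[3] * next1[i]) >> 16;

		*dest++ = out[0];                        /* Y */
		*dest++ = out[1 + ((dest_x & 1) << 1)];  /* U for even dest_x, V for odd */
		dest_x++;
	}
	return dest;
}

Two details are worth noting. Picking U or V only at store time (out[1] vs out[3], the shrl %cl sequence after packuswb in the listing) is what lets both source rows share the same two masked dword loads. And the listing fetches src1's luma dword at x_aligned, (%edi,%edx), rather than at x_scaled, presumably because %eax no longer holds x_scaled at that point; the sketch uses x_scaled for both rows, which is what the replaced movzbl code computed.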