1 .file "scale_line_22_yuv_mmx.S"
8 MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
13 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
15 .globl pixops_scale_line_22_yuv_mmx
16 .type pixops_scale_line_22_yuv_mmx,@function
/*
 * pixops_scale_line_22_yuv_mmx -- MMX inner loop for 2x2 bilinear
 * scaling of one packed-YUV scanline (excerpt: the surrounding
 * prologue, the multiply/accumulate body, and the epilogue are not
 * shown in this view).
 *
 * Stack arguments (i386 cdecl, relative to %ebp):
 *    8(%ebp)  weights table
 *   12(%ebp)  dest            -> kept in %esi
 *   16(%ebp)  src0 (row 0)
 *   20(%ebp)  src1 (row 1)
 *   24(%ebp)  x_step
 *   28(%ebp)  dest_end
 *   32(%ebp)  x, 16.16 fixed point -> kept in %ebx
 *   36(%ebp)  dest_x
 * Local:
 *  -24(%ebp)  scratch slot, used to spill the "next y" sample
 *             (x_scaled in the original header comment)
 *
 * Register roles in the visible body: %esi = dest, %ebx = x,
 * %edx = x_scaled / x_aligned scratch, %edi = current src or
 * weights pointer, %eax/%ecx = byte-load scratch.
 */
pixops_scale_line_22_yuv_mmx:
.globl _pixops_scale_line_22_yuv_mmx
_pixops_scale_line_22_yuv_mmx:

/* Initialize variables */
	movl 36(%ebp),%eax		# destx
	movl 32(%ebp),%ebx		# x
	movl 12(%ebp),%esi		# dest

	cmpl 28(%ebp),%esi		# dest == dest_end ?

/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
 * points we are interpolating between, as:
 */

/*
 * Load current values from pixel 1
 */
	movl %ebx, %edx			# x_scaled = x ...
	sarl $16, %edx			# >> 16
	sall $1, %edx			# x_scaled *= channels

	movl 16(%ebp), %edi		# get src0
	movzbl 2(%edi,%edx), %ecx	# next y = src0[ x_scaled + 2 ]
	/* wish we had a register for this */
	movl %ecx, -24(%ebp)		# save next y
	movzbl (%edi,%edx), %ecx	# y = src0[ x_scaled ]

	sarl $2, %edx			# x_aligned = ( x_scaled / channels ) >> 1 ...

	movl 36(%ebp), %eax		# uv_index = dest_x ...
	andl $1, %eax			# ( dest_x & 1 ) ...

	addl %eax, %edx			# x_aligned += uv_index

	movzbl 1(%edi,%edx), %eax	# uv = src0[ x_aligned + 1 ]
	shll $8, %eax			# position uv in bits 8..15
	orl %eax, %ecx			# combine: y | uv << 8

	movd %ecx, %mm0			# move to mmx0

	movl -24(%ebp), %ecx		# restore next y
	orl %eax, %ecx			# combine: next_y | uv << 8

	movd %ecx, %mm1			# move to mmx1

/*
 * Load current values from pixel 2 (second source row)
 */
	movl 20(%ebp), %edi		# get src1

	/* do u/v first since we already have x_aligned */
	movzbl 1(%edi,%edx), %eax	# uv = src1[ x_aligned + 1 ]
	shll $8, %eax			# position uv in bits 8..15

	/* which is faster? 2 moves in and out of memory, or
	   1 move between registers and 2 shifts? I wager the latter. */
	movl %ebx, %edx			# x_scaled = x ...
	sarl $16, %edx			# >> 16
	sall $1, %edx			# x_scaled *= channels

	movzbl 2(%edi,%edx), %ecx	# next y = src1[ x_scaled + 2 ]
	movl %ecx, -24(%ebp)		# save next y -- must spill %ecx, not
					# the uv in %eax; spilling %eax left
					# the load above dead and fed uv back
					# as "next y" into %mm3
	movzbl (%edi,%edx), %ecx	# y = src1[ x_scaled ]
	orl %eax, %ecx			# combine: y | uv << 8

	movd %ecx, %mm2			# move to mmx2

	movl -24(%ebp), %ecx		# restore next y
	orl %eax, %ecx			# combine: next_y | uv << 8

	movd %ecx, %mm3			# move to mmx3

/* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 */
	movl 8(%ebp), %edi		# get weights pointer

/* At this point, %edi holds weights. Load the 4 weights into
 * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
 * (%eax holds the weight-row byte offset, computed in elided code.)
 */
	movq (%edi,%eax), %mm4
	movq 8(%edi,%eax), %mm5
	movq 16(%edi,%eax), %mm6
	movq 24(%edi,%eax), %mm7

/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
 */
	movl $0x80808080, %eax		# rounding bias, one 0x80 per byte lane

/* Pack into %eax and store result
 */
	movb %al, 0(%esi)		# dest[ 0 ] = y
	movb %al, 1(%esi)		# dest[ 1 ] = uv

	addl $2, %esi			# dest += 2

	cmpl %esi,28(%ebp)		# if dest == dest_end (branch elided)

	addl 24(%ebp), %ebx		# x += x_step
	addl $1, 36(%ebp)		# dest_x++