research.m1stereo.tv/gitweb/ - melted/blob - src/modules/gtk2/scale_line_22_33_mmx.S

   1         .file   "scale_line_22_yuv_mmx.S"
   2         .version        "01.01"
   3
   4 .extern printf
   5
   6 gcc2_compiled.:
   7 .data
   8 MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
   9
  10 .text
  11         .align 16
  12
  13 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
  14
  15 .globl pixops_scale_line_22_yuv_mmx
  16         .type    pixops_scale_line_22_yuv_mmx,@function
  17 pixops_scale_line_22_yuv_mmx:
  18
  19 #else
  20
  21 .globl _pixops_scale_line_22_yuv_mmx
  22 _pixops_scale_line_22_yuv_mmx:
  23
  24 #endif
  25 /*
  26  * Arguments
  27  *
  28  * weights:      8(%ebp)
  29  * p (dest):    12(%ebp)        %esi
  30  * q1 (src0):   16(%ebp)
  31  * q2 (src1):   20(%ebp)
  32  * xstep:       24(%ebp)
  33  * p_end:       28(%ebp)
  34  * xinit:       32(%ebp)
  35  * destx:       36(%ebp)
  36  *
  37 */
  38
  39 /*
  40  * Function call entry
  41  */
  42         pushl %ebp
  43         movl %esp,%ebp
  44         subl $28,%esp
  45         pushl %edi
  46         pushl %esi
  47         pushl %ebx
  48 /* Locals:
  49  * int x                      %ebx
  50  * int x_scaled             -24(%ebp)
  51  * int dest_x               36(%ebp)
  52  */
  53
  54 /*
  55  * Setup
  56  */
  57 /* Initialize variables */
  58         movl 36(%ebp),%eax # destx
  59         movl %eax,36(%ebp)
  60         movl 32(%ebp),%ebx # x
  61         movl 12(%ebp),%esi # dest
  62
  63         cmpl 28(%ebp),%esi # dest == dest_end ?
  64         jnb  .out
  65
  66 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  67  * points we are interpolating between, as:
  68  *
  69  *  0000000000UV00YY
  70  */
  71
  72 /* Load initial values into %mm1, %mm3 */
  73
  74         /* x_scaled = ( x >> 16 ) * stride */
  75         movl %ebx, %edx
  76         sarl $16,%edx
  77         sall $1, %edx
  78
  79         /* load from src0 */
  80         movl 16(%ebp), %edi
  81         movzbl (%edi,%edx), %ecx
  82
  83         /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
  84         movl %ebx, %edx
  85         sarl $17, %edx
  86         sall $2, %edx
  87
  88         /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
  89         movl 36(%ebp), %eax
  90         andl $1, %eax
  91         sall $1, %eax
  92         addl %eax, %edx
  93         movzbl 1(%edi,%edx), %eax
  94         shll $8, %eax
  95         orl %eax, %ecx
  96
  97         movd %ecx, %mm1
  98         pxor %mm4, %mm4
  99         punpcklbw %mm4, %mm1
 100
 101         /* x_scaled = ( x >> 16 ) * stride */
 102         movl %ebx, %edx
 103         sarl $16, %edx
 104         sall $1, %edx
 105
 106         /* load from src1 */
 107         movl 20(%ebp), %edi
 108         movzbl (%edi,%edx), %ecx
 109
 110         /* x_aligned = x_scaled divided by 2 and multiplied by 4 */
 111         movl %ebx, %edx
 112         sarl $17, %edx
 113         sall $2, %edx
 114
 115         /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
 116         movl 36(%ebp), %eax
 117         andl $1, %eax
 118         sall $1, %eax
 119         addl %eax, %edx
 120         movzbl (%edi,%edx), %eax
 121         shll $8, %eax
 122         orl %eax, %ecx
 123
 124         movd %ecx, %mm3
 125         punpcklbw %mm4, %mm3
 126
 127         /* dest_x++; */
 128         movl 36(%ebp), %eax
 129         addl $1, %eax
 130         movl %eax, 36(%ebp)
 131
 132         /* x_scaled = x >> 16 */
 133         addl $65536, %ebx
 134         movl %ebx, %edx
 135         sarl $16, %edx
 136         movl %edx, -24(%ebp)
 137
 138         jmp .newx
 139         .p2align 4,,7
 140 .loop:
 141 /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 142  *                                             16             4                  0xf            2     2
 143  */
 144         movl %ebx,%eax
 145         andl $0xf000,%eax
 146         shrl $7,%eax
 147
 148 /* At this point, %edi holds weights. Load the 4 weights into %mm4,%mm5,%mm6,%mm7, multiply and
 149  * accumulate.
 150  */
 151         movq (%edi,%eax),%mm4
 152         pmullw %mm0,%mm4
 153         movq 8(%edi,%eax),%mm5
 154         pmullw %mm1,%mm5
 155         movq 16(%edi,%eax),%mm6
 156         movq 24(%edi,%eax),%mm7
 157         pmullw %mm2,%mm6
 158         pmullw %mm3,%mm7
 159         paddw %mm4, %mm5
 160         paddw %mm6, %mm7
 161         paddw %mm5, %mm7
 162
 163 /* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
 164  */
 165         pxor %mm4, %mm4
 166         movl $0x80808080, %eax
 167         movd %eax, %mm6
 168         punpcklbw %mm4, %mm6
 169         paddw %mm6, %mm7
 170         psrlw $8, %mm7
 171
 172 /* Pack into %eax and store result
 173  */
 174         packuswb %mm7, %mm7
 175         movd %mm7, %eax
 176
 177         movb %al, 0(%esi)        # *dest = y
 178         shrl $8, %eax
 179         movb %al, 1(%esi)        # *dest = uv
 180
 181         addl $2, %esi            # dest += 2
 182
 183         cmpl %esi,28(%ebp)       # if dest == dest_end ?
 184         je   .out                # then exit
 185
 186         movl 36(%ebp), %eax      # get dest_x
 187         addl $1, %eax            # dest_x++
 188         movl %eax, 36(%ebp)      # put dest_x
 189
 190         addl 24(%ebp), %ebx      # x += x_step
 191
 192         movl %ebx, %edx          # x_scaled = x ...
 193         sarl $16, %edx           # >> 16
 194         movl %edx, -24(%ebp)     # save x_scaled
 195
 196 .newx:
 197
 198 /*
 199  * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
 200  */
 201         movq %mm1, %mm0
 202         movq %mm3, %mm2
 203
 204         sall $1, %edx            # x_scaled *= channels
 205
 206         movl 16(%ebp), %edi      # get src0
 207         movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
 208
 209         sarl $2, %edx            # x_aligned = ( x_scaled / channels ) >> 1 ...
 210         sall $2, %edx            # << 2
 211
 212         movl 36(%ebp), %eax      # uv_index = dest_x ...
 213                 #pushl %eax
 214         andl $1, %eax            # ( dest_x & 1 ) ...
 215         sall $1, %eax            # << 1
 216         addl %eax, %edx          # x_aligned += uv_index
 217                 #pushl %edx
 218                 #pushl $MSG
 219                 #call printf
 220                 #popl %edx
 221                 #popl %edx
 222                 #popl %edx
 223         movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
 224         shll $8, %eax            # store uv
 225         orl %eax, %ecx
 226
 227         movd %ecx, %mm1          # move to mmx1
 228         punpcklbw %mm4, %mm1
 229
 230         movl %ebx, %edx          # x_scaled = x ...
 231         sarl $16, %edx           # >> 16
 232         sall $1, %edx            # x_scaled *= channels
 233
 234         movl 20(%ebp), %edi      # get src1
 235         movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
 236
 237         sarl $2, %edx            # x_aligned = ( x_scaled / channels ) >> 1 ...
 238         sall $2, %edx            # << 2
 239
 240         movl 36(%ebp), %eax      # uv_index = dest_x ...
 241         andl $1, %eax            # ( dest_x & 1 ) ...
 242         sall $1, %eax            # << 1
 243         addl %eax, %edx          # x_aligned += uv_index
 244         movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
 245         shll $8, %eax            # store uv
 246         orl %eax, %ecx
 247
 248         movd %ecx, %mm3          # move to mmx3
 249         punpcklbw %mm4, %mm3
 250
 251         movl 8(%ebp), %edi       # get weights pointer
 252
 253         jmp .loop
 254
 255 .out:
 256         movl %esi,%eax
 257         emms
 258         leal -40(%ebp),%esp
 259         popl %ebx
 260         popl %esi
 261         popl %edi
 262         movl %ebp,%esp
 263         popl %ebp
 264         ret