/* Source: research.m1stereo.tv/gitweb/ - melted/blob - scale_line_22_yuv_mmx.S */

/*
 * Syntax:  AT&T / GNU as, run through the C preprocessor (.S).
 * Target:  32-bit x86 with MMX.
 * ABI:     all arguments are read from the stack through %ebp; %ebx, %esi
 *          and %edi are saved and restored; the result is returned in %eax.
 */

        .file   "scale_line_22_yuv_mmx.S"
        .version        "01.01"

.extern printf

gcc2_compiled.:
.data
/* Debug format string; not referenced by the code in this file.
 * NUL-terminated so that it is a valid C string if it is ever passed on. */
MSG: .asciz "scale_line_22_yuv_mmx: %d %d\n"

/*
 * Stack offsets of the arguments relative to %ebp.
 */
#define ARG_WEIGHTS    8        /* weights                               */
#define ARG_DEST      12        /* p (dest), kept in %esi                */
#define ARG_SRC0      16        /* q1 (src0)                             */
#define ARG_SRC1      20        /* q2 (src1)                             */
#define ARG_X_STEP    24        /* xstep                                 */
#define ARG_DEST_END  28        /* p_end                                 */
#define ARG_X_INIT    32        /* xinit, kept in %ebx as x              */
#define ARG_DEST_X    36        /* dest_x, updated in place on the stack */

/*
 * Byte-lane masks for a 32-bit little-endian load from a source row.
 */
#define LUMA_MASK     0x00ff00ff        /* keeps bytes 0 and 2 */
#define CHROMA_MASK   0xff00ff00        /* keeps bytes 1 and 3 */

/*
 * LOAD_ROW row_off, dst
 *
 * Build one interpolation operand from the source row whose pointer is
 * stored at row_off(%ebp) and leave it, widened to 16-bit lanes, in dst.
 *
 * In:    %eax = x_scaled   byte offset of the luma sample (multiple of 2)
 *        %edx = x_aligned  x_scaled rounded down to a multiple of 4
 *        %mm4 = 0
 * Out:   dst  = 00VV 00Y2 00UU 00Y1
 * Clobb: %ecx, %edi, flags
 *
 * %eax and %edx are deliberately left untouched so that the macro can be
 * expanded for several rows in succession with the same offsets; the row
 * pointer in %edi is dead after the second load and doubles as the chroma
 * scratch register.
 * Each load reads 4 bytes, so 4 bytes must be readable at both offsets.
 */
.macro LOAD_ROW row_off, dst
        movl \row_off(%ebp), %edi       /* row base pointer               */
        movl (%edi,%eax), %ecx          /* 4 bytes at x_scaled            */
        andl $LUMA_MASK, %ecx           /* keep the two luma bytes        */
        movl (%edi,%edx), %edi          /* 4 bytes at x_aligned           */
        andl $CHROMA_MASK, %edi         /* keep the two chroma bytes      */
        orl %edi, %ecx                  /* composite y, uv                */
        movd %ecx, \dst
        punpcklbw %mm4, \dst            /* widen bytes to 16-bit lanes    */
.endm

/*
 * LOAD_NEXT_SAMPLES
 *
 * Derive the byte offsets for the current x (%ebx) and load the "next"
 * operands: %mm1 from src0 and %mm3 from src1, both at the same offsets.
 *
 * In:    %ebx = x, %mm4 = 0
 * Out:   %mm1, %mm3; %eax = x_scaled, %edx = x_aligned
 * Clobb: %ecx, %edi, flags
 */
.macro LOAD_NEXT_SAMPLES
        movl %ebx, %eax
        sarl $15, %eax
        andl $0xfffffffe, %eax          /* x_scaled = (x >> 16) * 2       */
        movl %eax, %edx
        andl $0xfffffffc, %edx          /* x_aligned = x_scaled & ~3      */
        LOAD_ROW ARG_SRC0, %mm1
        LOAD_ROW ARG_SRC1, %mm3
.endm

.text
        .align 16

/*
 * pixops_scale_line_22_yuv_mmx
 *
 * Produce one line of 2-byte output pixels (luma byte followed by one
 * chroma byte) by combining, for every output pixel, four operands taken
 * from two source rows (current and next sample of each row) with four
 * quadwords of 16-bit weights.
 *
 * Arguments (see the ARG_* offsets):
 *   weights  base of the weight table; the phase ((x >> 12) & 0xf) selects
 *            a 32-byte entry holding one quadword of weights per operand,
 *            in the order src0 current, src0 next, src1 current, src1 next
 *   p        destination pointer
 *   q1, q2   source rows src0 and src1
 *   xstep    increment added to x for every output pixel
 *   p_end    end of the destination span
 *   xinit    initial x
 *   dest_x   destination pixel index; its low bit selects which chroma
 *            byte is stored (even: byte 1, odd: byte 3 of the result)
 *
 * Returns: %eax = destination pointer after the last pixel written.
 *
 * Registers inside the loop:
 *   %ebx        x
 *   %esi        dest
 *   %mm0, %mm2  current operands of src0 and src1
 *   %mm1, %mm3  next operands of src0 and src1
 *   %mm4        zero whenever LOAD_NEXT_SAMPLES is expanded
 *
 * Note that x is advanced by xstep before the first output pixel is
 * computed: the entry path primes %mm1/%mm3 at xinit and then enters the
 * loop through .Lnewx.
 *
 * Leaf function: it makes no calls and keeps no stack locals. The MMX
 * state is released with emms before returning.
 */
#if !defined(__MINGW32__) && !defined(__CYGWIN__)

.globl pixops_scale_line_22_yuv_mmx
        .type    pixops_scale_line_22_yuv_mmx,@function
pixops_scale_line_22_yuv_mmx:

#else

.globl _pixops_scale_line_22_yuv_mmx
_pixops_scale_line_22_yuv_mmx:

#endif

/*
 * Function call entry
 */
        pushl %ebp
        movl %esp, %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

/*
 * Setup
 */
        movl ARG_X_INIT(%ebp), %ebx     /* x = xinit                      */
        movl ARG_DEST(%ebp), %esi       /* dest                           */

        cmpl ARG_DEST_END(%ebp), %esi   /* nothing to do if dest >= end   */
        jnb  .Lout

/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
 * points we are interpolating between, as:
 *
 *  00VV00Y200UU00Y1
 */
        pxor %mm4, %mm4
        LOAD_NEXT_SAMPLES               /* prime %mm1 / %mm3 at xinit     */
        jmp .Lnewx

        .p2align 4,,7
.Lloop:

/* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 *                                             16             4                  0xf            2     2
 */
        movl ARG_WEIGHTS(%ebp), %edi    /* weights pointer                */
        movl %ebx, %eax
        andl $0xf000, %eax
        shrl $7, %eax                   /* ((x >> 12) & 0xf) * 32 bytes   */

/* Load the 4 weight quadwords into %mm4, %mm5, %mm6, %mm7, multiply each
 * by its operand and accumulate everything in %mm7.
 */
        movq (%edi,%eax), %mm4
        pmullw %mm0, %mm4
        movq 8(%edi,%eax), %mm5
        pmullw %mm1, %mm5
        movq 16(%edi,%eax), %mm6
        pmullw %mm2, %mm6
        movq 24(%edi,%eax), %mm7
        pmullw %mm3, %mm7

        paddw %mm4, %mm5
        paddw %mm6, %mm7
        paddw %mm5, %mm7

/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256.
 * %mm4 is zeroed here and stays zero until the next pass through .Lloop,
 * which is what the unpack steps in LOAD_NEXT_SAMPLES rely on.
 */
        pxor %mm4, %mm4
        movl $0x80808080, %eax
        movd %eax, %mm6
        punpcklbw %mm4, %mm6            /* 0x0080 in every 16-bit lane    */
        paddw %mm6, %mm7
        psrlw $8, %mm7

/* Pack to bytes: %eax = V:Y2:U:Y1 (most to least significant byte).
 */
        packuswb %mm7, %mm7
        movd %mm7, %eax

        movb %al, (%esi)                /* dest[0] = y                    */

        movl ARG_DEST_X(%ebp), %ecx     /* dest_x                         */
        andl $1, %ecx                   /* 0 selects u, 1 selects v       */
        sall $1, %ecx
        addl $1, %ecx                   /* byte index 1 or 3              */
        sall $3, %ecx                   /* byte index * 8 bits            */
        shrl %cl, %eax
        movb %al, 1(%esi)               /* dest[1] = u or v               */

        addl $2, %esi                   /* dest += 2                      */
        cmpl ARG_DEST_END(%ebp), %esi   /* same unsigned >= test as the   */
        jnb  .Lout                      /* entry guard, so an odd-sized   */
                                        /* span cannot step over the end  */

        addl $1, ARG_DEST_X(%ebp)       /* dest_x++                       */

.Lnewx:

        addl ARG_X_STEP(%ebp), %ebx     /* x += x_step                    */
/*
 * The previous "next" operands become the current ones ...
 */
        movq %mm1, %mm0
        movq %mm3, %mm2
/*
 * ... and fresh "next" operands are loaded at the new x.
 */
        LOAD_NEXT_SAMPLES

        jmp .Lloop

.Lout:
        movl %esi, %eax                 /* return the final dest pointer  */
        emms
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

#if !defined(__MINGW32__) && !defined(__CYGWIN__)
        .size    pixops_scale_line_22_yuv_mmx, .-pixops_scale_line_22_yuv_mmx
#endif