research.m1stereo.tv/gitweb/ - melted/blob - src/modules/gtk2/scale_line_22_yuv_mmx.S

   1         .file   "scale_line_22_yuv_mmx.S"
   2         .version        "01.01"
   3
   4 .extern printf
   5
   6 gcc2_compiled.:
   7 .data
   8 MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
   9
  10 .text
  11         .align 16
  12
  13 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
  14
  15 .globl pixops_scale_line_22_yuv_mmx
  16         .type    pixops_scale_line_22_yuv_mmx,@function
  17 pixops_scale_line_22_yuv_mmx:
  18
  19 #else
  20
  21 .globl _pixops_scale_line_22_yuv_mmx
  22 _pixops_scale_line_22_yuv_mmx:
  23
  24 #endif
  25 /*
  26  * Arguments
  27  *
  28  * weights:      8(%ebp)
  29  * p (dest):    12(%ebp)        %esi
  30  * q1 (src0):   16(%ebp)
  31  * q2 (src1):   20(%ebp)
  32  * xstep:       24(%ebp)
  33  * p_end:       28(%ebp)
  34  * xinit:       32(%ebp)
  35  * destx:       36(%ebp)
  36  *
  37 */
  38
  39 /*
  40  * Function call entry
  41  */
  42         pushl %ebp
  43         movl %esp,%ebp
  44         subl $28,%esp
  45         pushl %edi
  46         pushl %esi
  47         pushl %ebx
  48 /* Locals:
  49  * int x                      %ebx
  50  * int x_scaled             -24(%ebp)
  51  * int dest_x               36(%ebp)
  52  */
  53
  54 /*
  55  * Setup
  56  */
  57 /* Initialize variables */
  58         movl 36(%ebp),%eax # destx
  59         movl %eax,36(%ebp)
  60         movl 32(%ebp),%ebx # x
  61         movl 12(%ebp),%esi # dest
  62
  63         cmpl 28(%ebp),%esi # dest == dest_end ?
  64         jnb  .out
  65
  66         addl $65536, %ebx
  67         .p2align 4,,7
  68         pxor %mm4, %mm4
  69
  70 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  71  * points we are interpolating between, as:
  72  *
  73  *  00UV00Y200UV00Y1
  74  */
  75
  76 .loop:
  77
  78 /*
  79  * Load current values from pixel 1
  80  */
  81         movl %ebx, %edx          # x_scaled = x ...
  82         sarl $16, %edx           # >> 16
  83         sall $1, %edx            # x_scaled *= channels
  84
  85         movl 16(%ebp), %edi      # get src0
  86         movzbl 2(%edi,%edx), %ecx # next y = src0[ x_scaled + 2 ]
  87         /* wish we had a register for this */
  88         movl %ecx, -24(%ebp)     # save next y
  89         movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
  90
  91         sarl $2, %edx            # x_aligned = ( x_scaled / channels ) >> 1 ...
  92         sall $2, %edx            # << 2
  93
  94         movl 36(%ebp), %eax      # uv_index = dest_x ...
  95         andl $1, %eax            # ( dest_x & 1 ) ...
  96         sall $1, %eax            # << 1
  97         addl %eax, %edx          # x_aligned += uv_index
  98
  99         movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
 100         shll $8, %eax           # position uv
 101         orl %eax, %ecx           # store uv
 102
 103         movd %ecx, %mm0          # move to mmx0
 104         punpcklbw %mm4, %mm0
 105
 106         movl -24(%ebp), %ecx     # restore next y
 107         orl %eax, %ecx           # store uv
 108
 109         movd %ecx, %mm1          # move to mmx1
 110         punpcklbw %mm4, %mm1
 111
 112         movl 20(%ebp), %edi      # get src1
 113
 114         /* do u/v first since we already have x_aligned */
 115         movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
 116         shll $8, %eax            # position uv
 117
 118         /* which is faster? 2 moves in and out of memory, or
 119        1 move between registers and 2 shifts? I wager the latter. */
 120         movl %ebx, %edx          # x_scaled = x ...
 121         sarl $16, %edx           # >> 16
 122         sall $1, %edx            # x_scaled *= channels
 123
 124         movzbl 2(%edi,%edx), %ecx # next y = src1[ x_scaled + 2 ]
 125         movl %eax, -24(%ebp)     # save next y
 126         movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
 127         orl %eax, %ecx           # store uv
 128
 129         movd %ecx, %mm2          # move to mmx2
 130         punpcklbw %mm4, %mm2
 131
 132         movl -24(%ebp), %ecx     # restore next y
 133         orl %eax, %ecx           # store uv
 134
 135         movd %ecx, %mm3          # move to mmx3
 136         punpcklbw %mm4, %mm3
 137
 138 /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 139  *                                             16             4                  0xf            2     2
 140  */
 141         movl 8(%ebp), %edi       # get weights pointer
 142         movl %ebx, %eax
 143         andl $0xf000, %eax
 144         shrl $7, %eax
 145
 146 /* At this point, %edi holds weights. Load the 4 weights into
 147  * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
 148  */
 149         movq (%edi,%eax), %mm4
 150         pmullw %mm0, %mm4
 151         movq 8(%edi,%eax), %mm5
 152         pmullw %mm1, %mm5
 153         movq 16(%edi,%eax), %mm6
 154         pmullw %mm2,%mm6
 155         movq 24(%edi,%eax), %mm7
 156         pmullw %mm3,%mm7
 157
 158         paddw %mm4, %mm5
 159         paddw %mm6, %mm7
 160         paddw %mm5, %mm7
 161
 162 /* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
 163  */
 164         pxor %mm4, %mm4
 165         movl $0x80808080, %eax
 166         movd %eax, %mm6
 167         punpcklbw %mm4, %mm6
 168         paddw %mm6, %mm7
 169         psrlw $8, %mm7
 170
 171 /* Pack into %eax and store result
 172  */
 173         packuswb %mm7, %mm7
 174         movd %mm7, %eax
 175
 176         movb %al, 0(%esi)        # dest[ 0 ] = y
 177         shrl $8, %eax
 178         movb %al, 1(%esi)        # dest[ 1 ] = uv
 179
 180         addl $2, %esi            # dest += 2
 181
 182         cmpl %esi,28(%ebp)       # if dest == dest_end
 183         je   .out                # then exit
 184
 185         addl 24(%ebp), %ebx      # x += x_step
 186         addl $1, 36(%ebp)        # dest_x++
 187
 188         jmp .loop
 189
 190 .out:
 191         movl %esi,%eax
 192         emms
 193         leal -40(%ebp),%esp
 194         popl %ebx
 195         popl %esi
 196         popl %edi
 197         movl %ebp,%esp
 198         popl %ebp
 199         ret