research.m1stereo.tv/gitweb/ - melted/blob - src/modules/gtk2/scale_line_22_yuv_mmx.S

   1 /*
   2  * scale_line_22_yuv_mmx.S -- scale line in YUY2 format
   3  * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
   4  * Author: Dan Dennedy <dan@dennedy.org>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  */
  20         .file   "scale_line_22_yuv_mmx.S"
  21         .version        "01.01"
  22
  23 .extern printf
  24
  25 gcc2_compiled.:
  26 .data
  27 MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
  28
  29 .text
  30         .align 16
  31
  32 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
  33
  34 .globl pixops_scale_line_22_yuv_mmx
  35         .type    pixops_scale_line_22_yuv_mmx,@function
  36 pixops_scale_line_22_yuv_mmx:
  37
  38 #else
  39
  40 .globl _pixops_scale_line_22_yuv_mmx
  41 _pixops_scale_line_22_yuv_mmx:
  42
  43 #endif
  44 /*
  45  * Arguments
  46  *
  47  * weights:          8(%ebp)
  48  * p (dest):    12(%ebp)        %esi
  49  * q1 (src0):   16(%ebp)
  50  * q2 (src1):   20(%ebp)
  51  * xstep:       24(%ebp)
  52  * p_end:       28(%ebp)
  53  * xinit:       32(%ebp)
  54  * dest_x:      36(%ebp)
  55  *
  56  */
  57
  58 /*
  59  * Function call entry
  60  */
  61         pushl %ebp
  62         movl %esp,%ebp
  63         subl $28,%esp
  64         pushl %edi
  65         pushl %esi
  66         pushl %ebx
  67 /* Locals:
  68  * int x                      %ebx
  69  * int x_scaled             -24(%ebp)
  70  * int dest_x               36(%ebp)
  71  */
  72
  73 /*
  74  * Setup
  75  */
  76 /* Initialize variables */
  77         movl 36(%ebp),%eax # destx
  78         movl %eax,36(%ebp)
  79         movl 32(%ebp),%ebx # x
  80         movl 12(%ebp),%esi # dest
  81
  82         cmpl 28(%ebp),%esi # dest == dest_end ?
  83         jnb  .out
  84
  85 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  86  * points we are interpolating between, as:
  87  *
  88  *  00VV00Y200UU00Y1
  89  */
  90
  91         pxor %mm4, %mm4
  92 /*
  93  * Load next component values into mm1 (src0) and mm3 (src1)
  94  */
  95         movl %ebx, %eax          # x_scaled
  96         sarl $15, %eax
  97         andl $0xfffffffe, %eax
  98         movl %eax, %edx          # x_aligned
  99         andl $0xfffffffc, %edx
 100
 101         movl 16(%ebp), %edi      # get src0
 102         movl (%edi,%eax), %ecx   # get y
 103         andl $0x00ff00ff, %ecx   # mask off y
 104         movl (%edi,%edx), %eax   # get uv
 105         andl $0xff00ff00, %eax   # mask off uv
 106         orl %eax, %ecx           # composite y, uv
 107         movd %ecx, %mm1          # move to mmx1
 108         punpcklbw %mm4, %mm1
 109
 110         movl 20(%ebp), %edi      # get src1
 111         movl (%edi,%edx), %ecx   # get y
 112         andl $0x00ff00ff, %ecx   # mask off y
 113         movl (%edi,%edx), %eax   # get uv
 114         andl $0xff00ff00, %eax   # mask off uv
 115         orl %eax, %ecx           # composite y, uv
 116         movd %ecx, %mm3          # move to mmx3
 117         punpcklbw %mm4, %mm3
 118
 119         jmp .newx
 120
 121         .p2align 4,,7
 122 .loop:
 123
 124 /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 125  *                                             16             4                  0xf            2     2
 126  */
 127         movl 8(%ebp), %edi       # get weights pointer
 128         movl %ebx, %eax
 129         andl $0xf000, %eax
 130         shrl $7, %eax
 131
 132 /* At this point, %edi holds weights. Load the 4 weights into
 133  * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
 134  */
 135         movq (%edi,%eax), %mm4
 136         pmullw %mm0, %mm4
 137         movq 8(%edi,%eax), %mm5
 138         pmullw %mm1, %mm5
 139         movq 16(%edi,%eax), %mm6
 140         pmullw %mm2,%mm6
 141         movq 24(%edi,%eax), %mm7
 142         pmullw %mm3,%mm7
 143
 144         paddw %mm4, %mm5
 145         paddw %mm6, %mm7
 146         paddw %mm5, %mm7
 147
 148 /* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
 149  */
 150         pxor %mm4, %mm4
 151         movl $0x80808080, %eax
 152         movd %eax, %mm6
 153         punpcklbw %mm4, %mm6
 154         paddw %mm6, %mm7
 155         psrlw $8, %mm7
 156
 157 /* Pack into %eax and store result
 158  */
 159         packuswb %mm7, %mm7
 160         movd %mm7, %eax
 161
 162         movb %al, (%esi)         # *dest = y
 163
 164         movl 36(%ebp), %ecx      # get dest_x
 165         andl $1, %ecx            # select u or v
 166         sall $1, %ecx            # determine offset
 167         addl $1, %ecx            # relative to x_aligned
 168         sall $3, %ecx            # offset * 8 bits/byte
 169
 170         movd %mm7, %eax
 171         shrl %cl, %eax
 172         movb %al, 1(%esi)        # *dest = uv
 173
 174         addl $2, %esi            # dest += 2
 175         cmpl %esi,28(%ebp)       # if dest == dest_end
 176         je   .out                # then exit
 177
 178         addl $1, 36(%ebp)        # dest_x++
 179
 180 .newx:
 181
 182         addl 24(%ebp), %ebx      # x += x_step
 183 /*
 184  * Load current component values into mm0 (src0) and mm2 (src1)
 185  */
 186         movq %mm1, %mm0
 187         movq %mm3, %mm2
 188
 189 /*
 190  * Load next component values into mm1 (src0) and mm3 (src1)
 191  */
 192         movl %ebx, %eax          # x_scaled
 193         sarl $15, %eax
 194         andl $0xfffffffe, %eax
 195         movl %eax, %edx          # x_aligned
 196         andl $0xfffffffc, %edx
 197
 198         movl 16(%ebp), %edi      # get src0
 199         movl (%edi,%eax), %ecx   # get y
 200         andl $0x00ff00ff, %ecx   # mask off y
 201         movl (%edi,%edx), %eax   # get uv
 202         andl $0xff00ff00, %eax   # mask off uv
 203         orl %eax, %ecx           # composite y, uv
 204         movd %ecx, %mm1          # move to mmx1
 205         punpcklbw %mm4, %mm1
 206
 207         movl 20(%ebp), %edi      # get src1
 208         movl (%edi,%edx), %ecx   # get y
 209         andl $0x00ff00ff, %ecx   # mask off y
 210         movl (%edi,%edx), %eax   # get uv
 211         andl $0xff00ff00, %eax   # mask off uv
 212         orl %eax, %ecx           # composite y, uv
 213         movd %ecx, %mm3          # move to mmx3
 214         punpcklbw %mm4, %mm3
 215
 216         jmp .loop
 217
 218 .out:
 219         movl %esi,%eax
 220         emms
 221         leal -40(%ebp),%esp
 222         popl %ebx
 223         popl %esi
 224         popl %edi
 225         movl %ebp,%esp
 226         popl %ebp
 227         ret