/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
/* Zero the mm6 SAD accumulator before a run of SAD_SSE_SUM_* steps. */
#define SAD_SSE_INIT \
	__asm__ volatile ( "pxor %%mm6,%%mm6\n\t" :: );
// Sum two 8x1 pixel blocks: accumulate into mm6 the sum of absolute
// differences (psadbw) of the 8 bytes at OFFSET from %0 and %1.
// Expands to an asm string fragment; OFFSET is stringized into the operand.
#define SAD_SSE_SUM_8(OFFSET) \
	"movq " #OFFSET "(%0),%%mm0	\n\t"\
	"movq " #OFFSET "(%1),%%mm1	\n\t"\
	"psadbw %%mm1,%%mm0		\n\t"\
	"paddw %%mm0,%%mm6		\n\t"
/* Copy the accumulated SAD out of mm6 into the integer lvalue RESULT.
 * NOTE(review): no "emms" is issued here, so the x87 FPU state is left
 * dirty after MMX use — presumably the caller restores it; confirm. */
#define SAD_SSE_FINISH(RESULT) \
	__asm__ volatile( "movd %%mm6,%0" : "=r" (RESULT) : );
// Advance both block pointers (%0, %1) to the next row by adding the
// stride operand (%2).
// NOTE(review): macro body reconstructed — the continuation lines were
// lost in extraction; the NxN users pass ystride as asm operand %2.
#define SAD_SSE_NEXTROW \
	"add %2,%0			\n\t"\
	"add %2,%1			\n\t"
38 inline static int sad_sse_4x4( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
42 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_NEXTROW
43 asm volatile ( ROW ROW ROW ROW
44 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
46 SAD_SSE_FINISH(result
)
52 inline static int sad_sse_8x8( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
56 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_NEXTROW
57 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
58 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
60 SAD_SSE_FINISH(result
)
66 inline static int sad_sse_16x16( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
70 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_NEXTROW
71 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
72 ROW ROW ROW ROW ROW ROW ROW ROW
73 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
75 SAD_SSE_FINISH(result
)
81 inline static int sad_sse_32x32( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
85 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_SUM_8(16) SAD_SSE_SUM_8(24)\
88 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
89 ROW ROW ROW ROW ROW ROW ROW ROW
90 ROW ROW ROW ROW ROW ROW ROW ROW
91 ROW ROW ROW ROW ROW ROW ROW ROW
92 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
94 SAD_SSE_FINISH(result
)
100 inline static int sad_sse_4w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
109 :: "r" (block1
), "r" (block2
)
116 SAD_SSE_FINISH(result
)
121 inline static int sad_sse_8w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
131 :: "r" (block1
), "r" (block2
)
138 SAD_SSE_FINISH(result
)
143 inline static int sad_sse_16w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
154 :: "r" (block1
), "r" (block2
)
161 SAD_SSE_FINISH(result
)
166 inline static int sad_sse_32w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
179 :: "r" (block1
), "r" (block2
)
186 SAD_SSE_FINISH(result
)
191 inline static int sad_sse_64w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
208 :: "r" (block1
), "r" (block2
)
215 SAD_SSE_FINISH(result
)
/* 0x00ff in every 16-bit lane: pand'ing packed 4:2:2 data with this keeps
 * the even (low) byte of each pair and zeroes the odd byte — presumably
 * the luma bytes of YUY2-ordered data (confirm byte order with callers).
 * Must stay 8-byte aligned for use as an MMX memory operand; "used" keeps
 * the linker from discarding it since it is only referenced from asm. */
static __attribute__((used)) __attribute__((aligned(8))) uint64_t sad_sse_422_mask_chroma = 0x00ff00ff00ff00ffULL;
/* Load the luma mask into mm7 and zero the mm6 SAD accumulator before a
 * run of SAD_SSE_422_LUMA_SUM_4 steps. */
#define SAD_SSE_422_LUMA_INIT \
	__asm__ volatile ( "movq %0,%%mm7\n\t"\
		"pxor %%mm6,%%mm6\n\t" :: "m" (sad_sse_422_mask_chroma) );
// Sum two 4x1 pixel blocks: load 8 bytes of packed 4:2:2 data from OFFSET
// past %0 and %1, mask off the chroma bytes with mm7 (set up by
// SAD_SSE_422_LUMA_INIT), and accumulate the luma SAD into mm6.
// Expands to an asm string fragment; OFFSET is stringized into the operand.
#define SAD_SSE_422_LUMA_SUM_4(OFFSET) \
	"movq " #OFFSET "(%0),%%mm0	\n\t"\
	"movq " #OFFSET "(%1),%%mm1	\n\t"\
	"pand %%mm7,%%mm0		\n\t"\
	"pand %%mm7,%%mm1		\n\t"\
	"psadbw %%mm1,%%mm0		\n\t"\
	"paddw %%mm0,%%mm6		\n\t"
234 static int sad_sse_422_luma_4x4( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
237 SAD_SSE_422_LUMA_INIT
238 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_NEXTROW
239 asm volatile ( ROW ROW ROW ROW
240 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
242 SAD_SSE_FINISH(result
)
248 static int sad_sse_422_luma_8x8( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
251 SAD_SSE_422_LUMA_INIT
252 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_NEXTROW
253 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
254 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
256 SAD_SSE_FINISH(result
)
262 static int sad_sse_422_luma_16x16( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
265 SAD_SSE_422_LUMA_INIT
266 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24) SAD_SSE_NEXTROW
267 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
268 ROW ROW ROW ROW ROW ROW ROW ROW
269 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
271 SAD_SSE_FINISH(result
)
277 static int sad_sse_422_luma_32x32( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
280 SAD_SSE_422_LUMA_INIT
281 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24)\
282 SAD_SSE_422_LUMA_SUM_4(32) SAD_SSE_422_LUMA_SUM_4(40) SAD_SSE_422_LUMA_SUM_4(48) SAD_SSE_422_LUMA_SUM_4(56)\
285 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
286 ROW ROW ROW ROW ROW ROW ROW ROW
287 ROW ROW ROW ROW ROW ROW ROW ROW
288 ROW ROW ROW ROW ROW ROW ROW ROW
289 :: "r" (block1
), "r" (block2
), "r" ((long int)(ystride
)));
291 SAD_SSE_FINISH(result
)
297 static int sad_sse_422_luma_4w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
301 SAD_SSE_422_LUMA_INIT
305 SAD_SSE_422_LUMA_SUM_4(0)
306 :: "r" (block1
), "r" (block2
)
313 SAD_SSE_FINISH(result
)
318 static int sad_sse_422_luma_8w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
322 SAD_SSE_422_LUMA_INIT
326 SAD_SSE_422_LUMA_SUM_4(0)
327 SAD_SSE_422_LUMA_SUM_4(8)
329 :: "r" (block1
), "r" (block2
)
336 SAD_SSE_FINISH(result
)
341 static int sad_sse_422_luma_16w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
345 SAD_SSE_422_LUMA_INIT
349 SAD_SSE_422_LUMA_SUM_4(0)
350 SAD_SSE_422_LUMA_SUM_4(8)
351 SAD_SSE_422_LUMA_SUM_4(16)
352 SAD_SSE_422_LUMA_SUM_4(24)
354 :: "r" (block1
), "r" (block2
)
361 SAD_SSE_FINISH(result
)
366 static int sad_sse_422_luma_32w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
370 SAD_SSE_422_LUMA_INIT
374 SAD_SSE_422_LUMA_SUM_4(0)
375 SAD_SSE_422_LUMA_SUM_4(8)
376 SAD_SSE_422_LUMA_SUM_4(16)
377 SAD_SSE_422_LUMA_SUM_4(24)
378 SAD_SSE_422_LUMA_SUM_4(32)
379 SAD_SSE_422_LUMA_SUM_4(40)
380 SAD_SSE_422_LUMA_SUM_4(48)
381 SAD_SSE_422_LUMA_SUM_4(56)
383 :: "r" (block1
), "r" (block2
)
390 SAD_SSE_FINISH(result
)
395 static int sad_sse_422_luma_64w( uint8_t *block1
, uint8_t *block2
, int xstride
, int ystride
, int w
, int h
)
399 SAD_SSE_422_LUMA_INIT
403 SAD_SSE_422_LUMA_SUM_4(0)
404 SAD_SSE_422_LUMA_SUM_4(8)
405 SAD_SSE_422_LUMA_SUM_4(16)
406 SAD_SSE_422_LUMA_SUM_4(24)
407 SAD_SSE_422_LUMA_SUM_4(32)
408 SAD_SSE_422_LUMA_SUM_4(40)
409 SAD_SSE_422_LUMA_SUM_4(48)
410 SAD_SSE_422_LUMA_SUM_4(56)
411 SAD_SSE_422_LUMA_SUM_4(64)
412 SAD_SSE_422_LUMA_SUM_4(72)
413 SAD_SSE_422_LUMA_SUM_4(80)
414 SAD_SSE_422_LUMA_SUM_4(88)
415 SAD_SSE_422_LUMA_SUM_4(96)
416 SAD_SSE_422_LUMA_SUM_4(104)
417 SAD_SSE_422_LUMA_SUM_4(112)
418 SAD_SSE_422_LUMA_SUM_4(120)
420 :: "r" (block1
), "r" (block2
)
427 SAD_SSE_FINISH(result
)