Merge ../mlt
[melted] / src / modules / motion_est / sad_sse.h
1 /*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software Foundation,
14 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 */
16
17
18
// Zero %mm6, the MMX register that the SAD_SSE_SUM_8 fragments add into.
// Expands to a complete statement (the semicolon is part of the macro), so
// call sites write it without one.
// __asm__/__volatile__ are spelled with underscores because the plain asm
// keyword is not available in strict ISO modes such as -std=c99 or -std=c11.
#define SAD_SSE_INIT \
	__asm__ __volatile__ ( "pxor %%mm6,%%mm6\n\t" : : : "mm6" );

// Asm fragment: add the sum of absolute differences of two 8x1 byte rows to
// the running total in %mm6.
//   %0, %1  registers holding the two row pointers
//   OFFSET  byte displacement applied to both pointers (pasted in literally)
// psadbw leaves its sum in the low 16 bits of %mm0 with the remaining bits
// zero. The total is accumulated with paddd (32-bit lanes) rather than paddw,
// because a 16-bit lane wraps once the running total exceeds 65535, which a
// 32x32 block (up to 1024 * 255) can easily do.
// Scratches %mm0 and %mm1.
#define SAD_SSE_SUM_8(OFFSET) \
	"movq " #OFFSET "(%0),%%mm0 \n\t" \
	"movq " #OFFSET "(%1),%%mm1 \n\t" \
	"psadbw %%mm1,%%mm0 \n\t" \
	"paddd %%mm0,%%mm6 \n\t"

// Copy the low 32 bits of the %mm6 accumulator into RESULT (an int lvalue).
// Expands to a complete statement; call sites add no semicolon.
// No emms is issued here.
// NOTE(review): presumably the caller executes emms before any x87 code; confirm.
// __asm__/__volatile__ keep the macro usable in strict ISO modes, where the
// plain asm keyword is not recognised.
#define SAD_SSE_FINISH(RESULT) \
	__asm__ __volatile__ ( "movd %%mm6,%0" : "=r" (RESULT) );
31
// Asm fragment: move both row pointers (%0 and %1) forward by the row
// stride held in %2.
// The fragment writes to %0 and %1, so an asm statement that embeds it has
// to declare those two operands as read-write.
#define SAD_SSE_NEXTROW \
	"add %2,%0 \n\tadd %2,%1 \n\t"

37 // BROKEN!
38 inline static int sad_sse_4x4( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
39 {
40 int result;
41 SAD_SSE_INIT
42 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_NEXTROW
43 asm volatile ( ROW ROW ROW ROW
44 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
45
46 SAD_SSE_FINISH(result)
47 return result;
48 #undef ROW
49
50 }
51
52 inline static int sad_sse_8x8( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
53 {
54 int result;
55 SAD_SSE_INIT
56 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_NEXTROW
57 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
58 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
59
60 SAD_SSE_FINISH(result)
61 return result;
62 #undef ROW
63
64 }
65
66 inline static int sad_sse_16x16( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
67 {
68 int result;
69 SAD_SSE_INIT
70 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_NEXTROW
71 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
72 ROW ROW ROW ROW ROW ROW ROW ROW
73 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
74
75 SAD_SSE_FINISH(result)
76 return result;
77 #undef ROW
78
79 }
80
81 inline static int sad_sse_32x32( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
82 {
83 int result;
84 SAD_SSE_INIT
85 #define ROW SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_SUM_8(16) SAD_SSE_SUM_8(24)\
86 SAD_SSE_NEXTROW
87
88 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
89 ROW ROW ROW ROW ROW ROW ROW ROW
90 ROW ROW ROW ROW ROW ROW ROW ROW
91 ROW ROW ROW ROW ROW ROW ROW ROW
92 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
93
94 SAD_SSE_FINISH(result)
95 return result;
96 #undef ROW
97
98 }
99 // BROKEN!
100 inline static int sad_sse_4w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
101 {
102 int result;
103
104 SAD_SSE_INIT
105
106 while( h != 0 ) {
107 asm volatile (
108 SAD_SSE_SUM_8(0)
109 :: "r" (block1), "r" (block2)
110 );
111
112 h--;
113 block1 += ystride;
114 block2 += ystride;
115 }
116 SAD_SSE_FINISH(result)
117 return result;
118
119 }
120
121 inline static int sad_sse_8w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
122 {
123 int result;
124
125 SAD_SSE_INIT
126
127 while( h != 0 ) {
128 asm volatile (
129 SAD_SSE_SUM_8(0)
130
131 :: "r" (block1), "r" (block2)
132 );
133
134 h--;
135 block1 += ystride;
136 block2 += ystride;
137 }
138 SAD_SSE_FINISH(result)
139 return result;
140
141 }
142
143 inline static int sad_sse_16w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
144 {
145 int result;
146
147 SAD_SSE_INIT
148
149 while( h != 0 ) {
150 asm volatile (
151 SAD_SSE_SUM_8(0)
152 SAD_SSE_SUM_8(8)
153
154 :: "r" (block1), "r" (block2)
155 );
156
157 h--;
158 block1 += ystride;
159 block2 += ystride;
160 }
161 SAD_SSE_FINISH(result)
162 return result;
163
164 }
165
166 inline static int sad_sse_32w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
167 {
168 int result;
169
170 SAD_SSE_INIT
171
172 while( h != 0 ) {
173 asm volatile (
174 SAD_SSE_SUM_8(0)
175 SAD_SSE_SUM_8(8)
176 SAD_SSE_SUM_8(16)
177 SAD_SSE_SUM_8(24)
178
179 :: "r" (block1), "r" (block2)
180 );
181
182 h--;
183 block1 += ystride;
184 block2 += ystride;
185 }
186 SAD_SSE_FINISH(result)
187 return result;
188
189 }
190
191 inline static int sad_sse_64w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
192 {
193 int result;
194
195 SAD_SSE_INIT
196
197 while( h != 0 ) {
198 asm volatile (
199 SAD_SSE_SUM_8(0)
200 SAD_SSE_SUM_8(8)
201 SAD_SSE_SUM_8(16)
202 SAD_SSE_SUM_8(24)
203 SAD_SSE_SUM_8(32)
204 SAD_SSE_SUM_8(40)
205 SAD_SSE_SUM_8(48)
206 SAD_SSE_SUM_8(56)
207
208 :: "r" (block1), "r" (block2)
209 );
210
211 h--;
212 block1 += ystride;
213 block2 += ystride;
214 }
215 SAD_SSE_FINISH(result)
216 return result;
217
218 }
// Quadword mask with 0xff in every even-numbered byte and 0x00 in every
// odd-numbered byte (little-endian byte order). SAD_SSE_422_LUMA_INIT loads
// it into %mm7.
// NOTE(review): presumably the even bytes are the luma samples of packed
// 4:2:2 data and the mask strips the chroma bytes; confirm against the caller.
// "used" keeps the object even when only inline asm references it;
// aligned(8) places it on a quadword boundary.
static uint64_t sad_sse_422_mask_chroma __attribute__((used, aligned(8))) = 0x00ff00ff00ff00ffULL;
220
// Load sad_sse_422_mask_chroma into %mm7 and zero the %mm6 accumulator.
// Expands to a complete statement (the semicolon is part of the macro), so
// call sites write it without one.
// __asm__/__volatile__ are spelled with underscores because the plain asm
// keyword is not available in strict ISO modes such as -std=c99 or -std=c11.
#define SAD_SSE_422_LUMA_INIT \
	__asm__ __volatile__ ( "movq %0,%%mm7\n\t" \
	                       "pxor %%mm6,%%mm6\n\t" \
	                       : : "m" (sad_sse_422_mask_chroma) : "mm6", "mm7" );

// Asm fragment: add the sum of absolute differences of two 4x1 pixel rows to
// the running total in %mm6.
//   %0, %1  registers holding the two row pointers
//   OFFSET  byte displacement applied to both pointers (pasted in literally)
// Eight bytes are loaded from each pointer and ANDed with the mask in %mm7
// before psadbw, so only the bytes the mask keeps contribute.
// The total is accumulated with paddd (32-bit lanes) rather than paddw,
// because a 16-bit lane wraps once the running total exceeds 65535, which a
// 32x32 block (up to 1024 * 255) can easily do.
// Scratches %mm0 and %mm1.
#define SAD_SSE_422_LUMA_SUM_4(OFFSET) \
	"movq " #OFFSET "(%0),%%mm0 \n\t" \
	"movq " #OFFSET "(%1),%%mm1 \n\t" \
	"pand %%mm7,%%mm0 \n\t" \
	"pand %%mm7,%%mm1 \n\t" \
	"psadbw %%mm1,%%mm0 \n\t" \
	"paddd %%mm0,%%mm6 \n\t"

234 static int sad_sse_422_luma_4x4( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
235 {
236 int result;
237 SAD_SSE_422_LUMA_INIT
238 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_NEXTROW
239 asm volatile ( ROW ROW ROW ROW
240 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
241
242 SAD_SSE_FINISH(result)
243 return result;
244 #undef ROW
245
246 }
247
248 static int sad_sse_422_luma_8x8( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
249 {
250 int result;
251 SAD_SSE_422_LUMA_INIT
252 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_NEXTROW
253 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
254 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
255
256 SAD_SSE_FINISH(result)
257 return result;
258 #undef ROW
259
260 }
261
262 static int sad_sse_422_luma_16x16( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
263 {
264 int result;
265 SAD_SSE_422_LUMA_INIT
266 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24) SAD_SSE_NEXTROW
267 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
268 ROW ROW ROW ROW ROW ROW ROW ROW
269 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
270
271 SAD_SSE_FINISH(result)
272 return result;
273 #undef ROW
274
275 }
276
277 static int sad_sse_422_luma_32x32( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
278 {
279 int result;
280 SAD_SSE_422_LUMA_INIT
281 #define ROW SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24)\
282 SAD_SSE_422_LUMA_SUM_4(32) SAD_SSE_422_LUMA_SUM_4(40) SAD_SSE_422_LUMA_SUM_4(48) SAD_SSE_422_LUMA_SUM_4(56)\
283 SAD_SSE_NEXTROW
284
285 asm volatile ( ROW ROW ROW ROW ROW ROW ROW ROW
286 ROW ROW ROW ROW ROW ROW ROW ROW
287 ROW ROW ROW ROW ROW ROW ROW ROW
288 ROW ROW ROW ROW ROW ROW ROW ROW
289 :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
290
291 SAD_SSE_FINISH(result)
292 return result;
293 #undef ROW
294
295 }
296
297 static int sad_sse_422_luma_4w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
298 {
299 int result;
300
301 SAD_SSE_422_LUMA_INIT
302
303 while( h != 0 ) {
304 asm volatile (
305 SAD_SSE_422_LUMA_SUM_4(0)
306 :: "r" (block1), "r" (block2)
307 );
308
309 h--;
310 block1 += ystride;
311 block2 += ystride;
312 }
313 SAD_SSE_FINISH(result)
314 return result;
315
316 }
317
318 static int sad_sse_422_luma_8w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
319 {
320 int result;
321
322 SAD_SSE_422_LUMA_INIT
323
324 while( h != 0 ) {
325 asm volatile (
326 SAD_SSE_422_LUMA_SUM_4(0)
327 SAD_SSE_422_LUMA_SUM_4(8)
328
329 :: "r" (block1), "r" (block2)
330 );
331
332 h--;
333 block1 += ystride;
334 block2 += ystride;
335 }
336 SAD_SSE_FINISH(result)
337 return result;
338
339 }
340
341 static int sad_sse_422_luma_16w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
342 {
343 int result;
344
345 SAD_SSE_422_LUMA_INIT
346
347 while( h != 0 ) {
348 asm volatile (
349 SAD_SSE_422_LUMA_SUM_4(0)
350 SAD_SSE_422_LUMA_SUM_4(8)
351 SAD_SSE_422_LUMA_SUM_4(16)
352 SAD_SSE_422_LUMA_SUM_4(24)
353
354 :: "r" (block1), "r" (block2)
355 );
356
357 h--;
358 block1 += ystride;
359 block2 += ystride;
360 }
361 SAD_SSE_FINISH(result)
362 return result;
363
364 }
365
366 static int sad_sse_422_luma_32w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
367 {
368 int result;
369
370 SAD_SSE_422_LUMA_INIT
371
372 while( h != 0 ) {
373 asm volatile (
374 SAD_SSE_422_LUMA_SUM_4(0)
375 SAD_SSE_422_LUMA_SUM_4(8)
376 SAD_SSE_422_LUMA_SUM_4(16)
377 SAD_SSE_422_LUMA_SUM_4(24)
378 SAD_SSE_422_LUMA_SUM_4(32)
379 SAD_SSE_422_LUMA_SUM_4(40)
380 SAD_SSE_422_LUMA_SUM_4(48)
381 SAD_SSE_422_LUMA_SUM_4(56)
382
383 :: "r" (block1), "r" (block2)
384 );
385
386 h--;
387 block1 += ystride;
388 block2 += ystride;
389 }
390 SAD_SSE_FINISH(result)
391 return result;
392
393 }
394
395 static int sad_sse_422_luma_64w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
396 {
397 int result;
398
399 SAD_SSE_422_LUMA_INIT
400
401 while( h != 0 ) {
402 asm volatile (
403 SAD_SSE_422_LUMA_SUM_4(0)
404 SAD_SSE_422_LUMA_SUM_4(8)
405 SAD_SSE_422_LUMA_SUM_4(16)
406 SAD_SSE_422_LUMA_SUM_4(24)
407 SAD_SSE_422_LUMA_SUM_4(32)
408 SAD_SSE_422_LUMA_SUM_4(40)
409 SAD_SSE_422_LUMA_SUM_4(48)
410 SAD_SSE_422_LUMA_SUM_4(56)
411 SAD_SSE_422_LUMA_SUM_4(64)
412 SAD_SSE_422_LUMA_SUM_4(72)
413 SAD_SSE_422_LUMA_SUM_4(80)
414 SAD_SSE_422_LUMA_SUM_4(88)
415 SAD_SSE_422_LUMA_SUM_4(96)
416 SAD_SSE_422_LUMA_SUM_4(104)
417 SAD_SSE_422_LUMA_SUM_4(112)
418 SAD_SSE_422_LUMA_SUM_4(120)
419
420 :: "r" (block1), "r" (block2)
421 );
422
423 h--;
424 block1 += ystride;
425 block2 += ystride;
426 }
427 SAD_SSE_FINISH(result)
428 return result;
429 }