xine/Makefile, xine/xineutils.h, xine/deinterlace.c: respect mmx compilation flag...
[melted] / src / modules / xine / deinterlace.c
1 /*
2 * Copyright (C) 2001 the xine project
3 *
4 * This file is part of xine, a free video player.
5 *
6 * xine is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * xine is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
19 *
20 * Deinterlace routines by Miguel Freitas
21 * based on DScaler project sources (deinterlace.sourceforge.net)
22 *
23 * Currently only available for Xv driver and MMX extensions
24 *
25 * small todo list:
26 * - implement non-MMX versions for all methods
27 * - support MMX2 instructions
28 * - move some generic code from xv driver to this file
29 * - make it also work for yuy2 frames
30 *
31 */
32
33 #include <stdio.h>
34 #include <string.h>
35 #include "deinterlace.h"
36 #include "xineutils.h"
37
/*
 * The routines below are written against xine's xine_fast_memcpy /
 * xine_fast_memmove names; in this file they are simply aliased to the
 * standard C library memcpy / memmove.
 */
#define xine_fast_memcpy memcpy
#define xine_fast_memmove memmove
40
/*
   DeinterlaceFieldBob algorithm
   Based on Virtual Dub plugin by Gunnar Thalin
   MMX asm version from dscaler project (deinterlace.sourceforge.net)
   Linux version for Xine player by Miguel Freitas

   pdst   -> destination bitmap
   psrc   -> array of source bitmaps; only psrc[0] is read by this routine
   width  -> length of one line in bytes
   height -> number of lines in the frame

   psrc[0] is addressed as two interleaved fields: "even" lines start at
   psrc[0], "odd" lines start at psrc[0] + width, and both advance by
   2 * width per line. IsOdd is hard-coded to 1, so only the odd-field
   branches are actually taken.

   For every pair of neighbouring odd lines (O1, O2) the odd line O2 is
   copied verbatim, and the line between them is built 8 bytes at a time:
   wherever the edge measure exceeds JaggieThreshold the output is the
   average of O1 and O2, elsewhere the even-line data E is kept.

   Only (width >> 3) quadwords are processed per interpolated line, so any
   trailing (width % 8) bytes of those lines are not written by the loop.

   The whole body is compiled only when USE_MMX is defined; otherwise the
   function does nothing.
*/
static void deinterlace_bob_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
#ifdef USE_MMX
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *Dest;
  uint8_t* pEvenLines = psrc[0];
  uint8_t* pOddLines = psrc[0]+width;
  int LineLength = width;
  int SourcePitch = width * 2;
  int IsOdd = 1;
  long EdgeDetect = 625;
  long JaggieThreshold = 73;

  int n;

  uint64_t qwEdgeDetect;
  uint64_t qwThreshold;

  // YMask keeps the low byte of every 16-bit word (the "intensity" byte as
  // far as this code is concerned -- NOTE(review): this looks like the packed
  // YUY2 layout of the DScaler original; confirm for planar input).
  // Mask clears bit 0 of every byte so that the word-wide shift right by one
  // below cannot leak a bit into the neighbouring byte.
  static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  // Replicate each 16-bit constant into all four words of a quadword, since
  // the MMX code compares four words at once.
  qwEdgeDetect = EdgeDetect;
  qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  qwThreshold = JaggieThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);


  // copy first even line no matter what, and the first odd line if we're
  // processing an odd field.
  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  // From here on "height" counts lines per field, not per frame.
  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that bIsOdd is true). The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc. So wherever you see "odd" or "even" below, keep in mind that
    // half the time this function is called, those words' meanings will invert.

    // Copy the odd line to the overlay verbatim.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    // One iteration per 8 bytes of the line.
    n = LineLength >> 3;
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);   // mm0 = O1
      movq_m2r (*YVal2++, mm1);   // mm1 = E
      movq_m2r (*YVal3++, mm2);   // mm2 = O2

      // get intensities in mm3 - 4
      movq_r2r ( mm0, mm3 );
      pand_m2r ( YMask, mm3 );
      movq_r2r ( mm1, mm4 );
      pand_m2r ( YMask, mm4 );
      movq_r2r ( mm2, mm5 );
      pand_m2r ( YMask, mm5 );

      // get average in mm0
      // (each byte is halved first, so the per-word add cannot carry
      // from one byte into the next)
      pand_m2r ( Mask, mm0 );
      pand_m2r ( Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
      // result will be in mm6

      psrlw_i2r ( 01, mm3 );
      psrlw_i2r ( 01, mm4 );
      psrlw_i2r ( 01, mm5 );

      movq_r2r ( mm3, mm6 );
      psubw_r2r ( mm4, mm6 ); //mm6 = O1 - E

      movq_r2r ( mm5, mm7 );
      psubw_r2r ( mm4, mm7 ); //mm7 = O2 - E

      pmullw_r2r ( mm7, mm6 ); // mm6 = (O1 - E) * (O2 - E)

      movq_r2r ( mm3, mm7 );
      psubw_r2r ( mm5, mm7 ); // mm7 = (O1 - O2)
      pmullw_r2r ( mm7, mm7 ); // mm7 = (O1 - O2) ^ 2
      psrlw_i2r ( 12, mm7 ); // mm7 = (O1 - O2) ^ 2 >> 12
      pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12

      psubw_r2r ( mm7, mm6 ); // mm6 is what we want

      // mm6 becomes a per-word mask: all ones where the measure is above
      // the jaggie threshold, zero elsewhere.
      pcmpgtw_m2r ( *&qwThreshold, mm6 );

      movq_r2r ( mm6, mm7 );

      pand_r2r ( mm6, mm0 );   // keep the O1/O2 average where the mask is set

      pandn_r2r ( mm1, mm7 );  // keep the even-line data E where it is clear

      por_r2r ( mm0, mm7 );    // merge the two selections

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an even field.
  if (! IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
        pOddLines + (height - 1) * SourcePitch,
        LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();
#endif
}
185
/* Deinterlace the latest field, with a tendency to weave rather than bob.
   Good for high detail on low-movement scenes.
   Seems to produce bad output in general case, need to check if this
   is normal or if the code is broken.

   pdst   -> destination bitmap
   psrc   -> array of source bitmaps; psrc[0] is the current frame and
             psrc[1] supplies the "previous" lines (Oold)
   width  -> length of one line in bytes
   height -> number of lines in the frame

   Returns 0 without touching pdst when psrc[0] or psrc[1] is NULL (this
   check only exists when USE_MMX is defined), otherwise 1. When USE_MMX is
   not defined the function does no work and still returns 1.

   IsOdd is hard-coded to 1, so only the odd-field branches are taken.
   Only (width >> 3) quadwords are processed per reconstructed line.
*/
static int deinterlace_weave_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
#ifdef USE_MMX

  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *YVal4;
  uint64_t *Dest;
  uint8_t* pEvenLines = psrc[0];
  uint8_t* pOddLines = psrc[0]+width;
  uint8_t* pPrevLines;

  int LineLength = width;
  int SourcePitch = width * 2;
  int IsOdd = 1;

  long TemporalTolerance = 300;
  long SpatialTolerance = 600;
  long SimilarityThreshold = 25;

  int n;

  uint64_t qwSpatialTolerance;
  uint64_t qwTemporalTolerance;
  uint64_t qwThreshold;

  // YMask keeps the low byte of every 16-bit word; Mask clears bit 0 of every
  // byte so a word-wide shift right by one does not leak between bytes.
  static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};


  // Make sure we have all the data we need.
  if ( psrc[0] == NULL || psrc[1] == NULL )
    return 0;

  // The previous-frame lines are taken from the same field position as the
  // lines being reconstructed.
  if (IsOdd)
    pPrevLines = psrc[1] + width;
  else
    pPrevLines = psrc[1];

  // Since the code uses MMX to process 4 pixels at a time, we need our constants
  // to be represented 4 times per quadword.
  qwSpatialTolerance = SpatialTolerance;
  qwSpatialTolerance += (qwSpatialTolerance << 48) + (qwSpatialTolerance << 32) + (qwSpatialTolerance << 16);
  qwTemporalTolerance = TemporalTolerance;
  qwTemporalTolerance += (qwTemporalTolerance << 48) + (qwTemporalTolerance << 32) + (qwTemporalTolerance << 16);
  qwThreshold = SimilarityThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

  // copy first even line no matter what, and the first odd line if we're
  // processing an even field.
  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (!IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  // From here on "height" counts lines per field, not per frame.
  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + Line * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that bIsOdd is true). The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc. So wherever you see "odd" or "even" below, keep in mind that
    // half the time this function is called, those words' meanings will invert.

    // Copy the even scanline below this one to the overlay buffer, since we'll be
    // adapting the current scanline to the even lines surrounding it. The scanline
    // above has already been copied by the previous pass through the loop.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    // One iteration per 8 bytes of the line.
    n = LineLength >> 3;
    while( n-- )
    {
      movq_m2r ( *YVal1++, mm0 ); // mm0 = E1
      movq_m2r ( *YVal2++, mm1 ); // mm1 = O
      movq_m2r ( *YVal3++, mm2 ); // mm2 = E2

      movq_r2r ( mm0, mm3 ); // mm3 = intensity(E1)
      movq_r2r ( mm1, mm4 ); // mm4 = intensity(O)
      movq_r2r ( mm2, mm6 ); // mm6 = intensity(E2)

      pand_m2r ( YMask, mm3 );
      pand_m2r ( YMask, mm4 );
      pand_m2r ( YMask, mm6 );

      // Average E1 and E2 for interpolated bobbing.
      // leave result in mm0
      pand_m2r ( Mask, mm0 ); // mm0 = E1 with lower chroma bit stripped off
      pand_m2r ( Mask, mm2 ); // mm2 = E2 with lower chroma bit stripped off
      psrlw_i2r ( 01, mm0 ); // mm0 = E1 / 2
      psrlw_i2r ( 01, mm2 ); // mm2 = E2 / 2
      paddb_r2r ( mm2, mm0 );

      // The meat of the work is done here. We want to see whether this pixel is
      // close in luminosity to ANY of: its top neighbor, its bottom neighbor,
      // or its predecessor. To do this without branching, we use MMX's
      // saturation feature, which gives us Z(x) = x if x>=0, or 0 if x<0.
      //
      // The formula we're computing here is
      // Z(ST - (E1 - O) ^ 2) + Z(ST - (E2 - O) ^ 2) + Z(TT - (Oold - O) ^ 2)
      // where ST is spatial tolerance and TT is temporal tolerance. The idea
      // is that if a pixel is similar to none of its neighbors, the resulting
      // value will be pretty low, probably zero. A high value therefore indicates
      // that the pixel had a similar neighbor. The pixel in the same position
      // in the field before last (Oold) is considered a neighbor since we want
      // to be able to display 1-pixel-high horizontal lines.
      //
      // Note that each difference is halved (psraw by 1) before it is squared.

      movq_m2r ( *&qwSpatialTolerance, mm7 );
      movq_r2r ( mm3, mm5 ); // mm5 = E1
      psubsw_r2r ( mm4, mm5 ); // mm5 = E1 - O
      psraw_i2r ( 1, mm5 );
      pmullw_r2r ( mm5, mm5 ); // mm5 = (E1 - O) ^ 2
      psubusw_r2r ( mm5, mm7 ); // mm7 = ST - (E1 - O) ^ 2, or 0 if that's negative

      movq_m2r ( *&qwSpatialTolerance, mm3 );
      movq_r2r ( mm6, mm5 ); // mm5 = E2
      psubsw_r2r ( mm4, mm5 ); // mm5 = E2 - O
      psraw_i2r ( 1, mm5 );
      pmullw_r2r ( mm5, mm5 ); // mm5 = (E2 - O) ^ 2
      psubusw_r2r ( mm5, mm3 ); // mm3 = ST - (E2 - O) ^ 2, or 0 if that's negative
      paddusw_r2r ( mm3, mm7 ); // mm7 = (ST - (E1 - O) ^ 2) + (ST - (E2 - O) ^ 2)

      movq_m2r ( *&qwTemporalTolerance, mm3 );
      movq_m2r ( *YVal4++, mm5 ); // mm5 = Oold
      pand_m2r ( YMask, mm5 );
      psubsw_r2r ( mm4, mm5 ); // mm5 = Oold - O
      psraw_i2r ( 1, mm5 ); // XXX
      pmullw_r2r ( mm5, mm5 ); // mm5 = (Oold - O) ^ 2
      psubusw_r2r ( mm5, mm3 ); /* mm3 = TT - (Oold - O) ^ 2, or 0 if that's negative */
      paddusw_r2r ( mm3, mm7 ); // mm7 = our magic number

      /*
       * Now compare the similarity totals against our threshold. The pcmpgtw
       * instruction will populate the target register with a bunch of mask bits,
       * filling words where the comparison is true with 1s and ones where it's
       * false with 0s. A few ANDs and NOTs and an OR later, we have bobbed
       * values for pixels under the similarity threshold and weaved ones for
       * pixels over the threshold.
       */

      pcmpgtw_m2r( *&qwThreshold, mm7 ); // mm7 = 0xffff where we're greater than the threshold, 0 elsewhere
      movq_r2r ( mm7, mm6 ); // mm6 = 0xffff where we're greater than the threshold, 0 elsewhere
      pand_r2r ( mm1, mm7 ); // mm7 = weaved data where we're greater than the threshold, 0 elsewhere
      pandn_r2r ( mm0, mm6 ); // mm6 = bobbed data where we're not greater than the threshold, 0 elsewhere
      por_r2r ( mm6, mm7 ); // mm7 = bobbed and weaved data

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an odd field.
  if (IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
        pOddLines + (height - 1) * SourcePitch,
        LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();

#endif

  return 1;
}
377
378
// This is a simple lightweight DeInterlace method that uses little CPU time
// but gives very good results for low or intermediate motion. (MORE CPU THAN BOB)
// It defers frames by one field, but that does not seem to produce noticeable
// lip sync problems.
//
// The method used is to take either the older or newer weave pixel depending
// upon which gives the smaller comb factor, and then clip to avoid large damage
// when wrong.
//
// I'd intended this to be part of a larger more elaborate method added to
// Blended Clip but this gives too good results for the CPU to ignore here.
//
// pdst   -> destination bitmap
// psrc   -> array of source bitmaps; psrc[0] is the current frame and
//           psrc[1] supplies the previous weave line (LP2)
// width  -> length of one line in bytes
// height -> number of lines in the frame
//
// Returns 0 without touching pdst when psrc[0] or psrc[1] is NULL (this check
// only exists when USE_MMX is defined), otherwise 1. When USE_MMX is not
// defined the function does no work and still returns 1.
//
// IsOdd is hard-coded to 1, so only the odd-field branches are taken.
// Only (width / 8) quadwords are processed per reconstructed line.
static int deinterlace_greedy_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
#ifdef USE_MMX
  int Line;
  int LoopCtr;
  uint64_t *L1; // ptr to Line1, of 3
  uint64_t *L2; // ptr to Line2, the weave line
  uint64_t *L3; // ptr to Line3
  uint64_t *LP2; // ptr to prev Line2
  uint64_t *Dest;
  uint8_t* pEvenLines = psrc[0];
  uint8_t* pOddLines = psrc[0]+width;
  uint8_t* pPrevLines;

  // Clears bit 0 of every byte so a word-wide shift right by one does not
  // leak a bit into the neighbouring byte.
  static mmx_t ShiftMask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  int LineLength = width;
  int SourcePitch = width * 2;
  int IsOdd = 1;
  long GreedyMaxComb = 15;
  // Static storage; refilled with GreedyMaxComb on every call (see below).
  static mmx_t MaxComb;
  int i;

  if ( psrc[0] == NULL || psrc[1] == NULL )
    return 0;

  if (IsOdd)
    pPrevLines = psrc[1] + width;
  else
    pPrevLines = psrc[1];


  for( i = 0; i < 8; i++ )
    MaxComb.ub[i] = GreedyMaxComb; // How badly do we let it weave? 0-255


  // copy first even line no matter what, and the first odd line if we're
  // processing an EVEN field. (note diff from other deint rtns.)
  xine_fast_memcpy(pdst, pEvenLines, LineLength); //DL0
  if (!IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength); //DL1

  // From here on "height" counts lines per field, not per frame.
  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    LoopCtr = LineLength / 8; // there are LineLength / 8 qwords per line

    if (IsOdd)
    {
      L1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      L2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      L3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      LP2 = (uint64_t *)(pPrevLines + Line * SourcePitch); // prev Odd lines
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }
    else
    {
      L1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      L2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      L3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      LP2 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch); //prev even lines
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }

    // The line below the one being reconstructed is copied through unchanged.
    xine_fast_memcpy((char *)Dest + LineLength, L3, LineLength);

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that info->IsOdd is true). Assume the obvious for even lines..

    while( LoopCtr-- )
    {
      movq_m2r ( *L1++, mm1 );
      movq_m2r ( *L2++, mm2 );
      movq_m2r ( *L3++, mm3 );
      movq_m2r ( *LP2++, mm0 );

      // average L1 and L3 leave result in mm4
      movq_r2r ( mm1, mm4 ); // L1

      pand_m2r ( ShiftMask, mm4 );
      psrlw_i2r ( 01, mm4 );
      movq_r2r ( mm3, mm5 ); // L3
      pand_m2r ( ShiftMask, mm5 );
      psrlw_i2r ( 01, mm5 );
      paddb_r2r ( mm5, mm4 ); // the average, for computing comb

      // get abs value of possible L2 comb
      // (two saturating subtractions OR-ed together: one of them is always 0)
      movq_r2r ( mm2, mm7 ); // L2
      psubusb_r2r ( mm4, mm7 ); // L2 - avg
      movq_r2r ( mm4, mm5 ); // avg
      psubusb_r2r ( mm2, mm5 ); // avg - L2
      por_r2r ( mm7, mm5 ); // abs(avg-L2)
      movq_r2r ( mm4, mm6 ); // copy of avg (mm6 is not read again in this loop)

      // get abs value of possible LP2 comb
      movq_r2r ( mm0, mm7 ); // LP2
      psubusb_r2r ( mm4, mm7 ); // LP2 - avg
      psubusb_r2r ( mm0, mm4 ); // avg - LP2
      por_r2r ( mm7, mm4 ); // abs(avg-LP2)

      // use L2 or LP2 depending upon which makes smaller comb
      psubusb_r2r ( mm5, mm4 ); // see if it goes to zero
      psubusb_r2r ( mm5, mm5 ); // 0
      pcmpeqb_r2r ( mm5, mm4 ); // if (mm4=0) then FF else 0
      pcmpeqb_r2r ( mm4, mm5 ); // opposite of mm4

      // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = ff
      pand_r2r ( mm2, mm5 ); // use L2 if mm5 == ff, else 0
      pand_r2r ( mm0, mm4 ); // use LP2 if mm4 = ff, else 0
      por_r2r ( mm5, mm4 ); // may the best win

      // Now lets clip our chosen value to be not outside of the range
      // of the high/low range L1-L3 by more than MaxComb.
      // This allows some comb but limits the damages and also allows more
      // detail than a boring oversmoothed clip.

      movq_r2r ( mm1, mm2 ); // copy L1
      psubusb_r2r ( mm3, mm2 ); // - L3, with saturation
      paddusb_r2r ( mm3, mm2 ); // now = Max(L1,L3)

      pcmpeqb_r2r ( mm7, mm7 ); // all ffffffff
      psubusb_r2r ( mm1, mm7 ); // - L1
      paddusb_r2r ( mm7, mm3 ); // add, may sat at fff..
      psubusb_r2r ( mm7, mm3 ); // now = Min(L1,L3)

      // allow the value to be above the high or below the low by amt of MaxComb
      paddusb_m2r ( MaxComb, mm2 ); // increase max by diff
      psubusb_m2r ( MaxComb, mm3 ); // lower min by diff

      psubusb_r2r ( mm3, mm4 ); // best - low
      paddusb_r2r ( mm3, mm4 ); // now = Max(best, low), low = Min(L1,L3) - MaxComb

      pcmpeqb_r2r ( mm7, mm7 ); // all ffffffff
      psubusb_r2r ( mm4, mm7 ); // - Max(best, low)
      paddusb_r2r ( mm7, mm2 ); // add may sat at FFF..
      psubusb_r2r ( mm7, mm2 ); // now = Min( Max(best, low), high ) = best clipped to [low, high]

      movq_r2m ( mm2, *Dest++ ); // move in our clipped best

    }
  }

  /* Copy last odd line if we're processing an Odd field. */
  if (IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
        pOddLines + (height - 1) * SourcePitch,
        LineLength);
  }

  /* clear out the MMX registers ready for doing floating point again */
  emms();

#endif

  return 1;
}
548
/* Use one field to interpolate the other (low cpu utilization)
   Will lose resolution but does not produce weaving effect
   (good for fast moving scenes) also known as "linear interpolation"

   pdst   -> destination bitmap
   psrc   -> array of source bitmaps; only psrc[0] is read by this routine
   width  -> length of one line in bytes
   height -> number of lines in the frame

   IsOdd is hard-coded to 1: the odd lines of psrc[0] are copied through
   and every line between two odd lines is replaced by their average.
   Only (width >> 3) quadwords are processed per interpolated line.

   The whole body is compiled only when USE_MMX is defined; otherwise the
   function does nothing.
*/
static void deinterlace_onefield_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
#ifdef USE_MMX
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal3;
  uint64_t *Dest;
  uint8_t* pEvenLines = psrc[0];
  uint8_t* pOddLines = psrc[0]+width;
  int LineLength = width;
  int SourcePitch = width * 2;
  int IsOdd = 1;

  int n;

  /* Clears bit 0 of every byte so the word-wide shift right by one below
   * cannot leak a bit into the neighbouring byte.
   */
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  /*
   * copy first even line no matter what, and the first odd line if we're
   * processing an odd field.
   */

  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  /* From here on "height" counts lines per field, not per frame. */
  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // Copy the odd line to the overlay verbatim.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    // One iteration per 8 bytes of the line.
    n = LineLength >> 3;
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);
      movq_m2r (*YVal3++, mm2);

      // get average in mm0
      // (each byte is halved first, so the per-word add cannot carry
      // from one byte into the next)
      pand_m2r ( Mask, mm0 );
      pand_m2r ( Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      movq_r2m ( mm0, *Dest++ );
    }
  }

  /* Copy last odd line if we're processing an even field. */
  if (! IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
        pOddLines + (height - 1) * SourcePitch,
        LineLength);
  }

  /* clear out the MMX registers ready for doing floating point
   * again
   */
  emms();
#endif
}
630
/* Linear Blend filter - does a kind of vertical blurring on the image.
   (idea borrowed from mplayer's sources)

   pdst   -> destination bitmap
   psrc   -> array of source bitmaps; only psrc[0] is read by this routine
   width  -> length of one line in bytes
   height -> number of lines in the frame

   The first and last lines are copied unchanged; every line in between is
   (previous + 2 * current + next) / 4 of the source, computed 8 bytes at a
   time. Only (width >> 3) quadwords are processed per blended line.

   The whole body is compiled only when USE_MMX is defined; otherwise the
   function does nothing.
*/
static void deinterlace_linearblend_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
#ifdef USE_MMX
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *Dest;
  int LineLength = width;

  int n;

  /* Copy first line */
  xine_fast_memmove(pdst, psrc[0], LineLength);

  for (Line = 1; Line < height - 1; ++Line)
  {
    YVal1 = (uint64_t *)(psrc[0] + (Line - 1) * LineLength);
    YVal2 = (uint64_t *)(psrc[0] + (Line) * LineLength);
    YVal3 = (uint64_t *)(psrc[0] + (Line + 1) * LineLength);
    Dest = (uint64_t *)(pdst + Line * LineLength);

    /* One iteration per 8 bytes of the line. */
    n = LineLength >> 3;
    while( n-- )
    {
      /* load data from 3 lines */
      movq_m2r (*YVal1++, mm0);
      movq_m2r (*YVal2++, mm1);
      movq_m2r (*YVal3++, mm2);

      /* expand bytes to words
       * NOTE(review): assuming the usual mmx.h (src, dst) operand order,
       * each source byte lands in the HIGH byte of a word; the low byte is
       * whatever the destination register held before (mm3-mm5 are not
       * cleared first). The shifts below push that low byte almost entirely
       * out again -- confirm against mmx.h.
       */
      punpckhbw_r2r (mm0, mm3);
      punpckhbw_r2r (mm1, mm4);
      punpckhbw_r2r (mm2, mm5);
      punpcklbw_r2r (mm0, mm0);
      punpcklbw_r2r (mm1, mm1);
      punpcklbw_r2r (mm2, mm2);

      /*
       * deinterlacing:
       * deint_line = (line0 + 2*line1 + line2) / 4
       *
       * The outer lines are shifted right by 7 and the centre line by 6,
       * which gives the centre line twice the weight; the sum is then
       * shifted right by 3.
       */
      psrlw_i2r (07, mm0);
      psrlw_i2r (06, mm1);
      psrlw_i2r (07, mm2);
      psrlw_i2r (07, mm3);
      psrlw_i2r (06, mm4);
      psrlw_i2r (07, mm5);
      paddw_r2r (mm1, mm0);
      paddw_r2r (mm2, mm0);
      paddw_r2r (mm4, mm3);
      paddw_r2r (mm5, mm3);
      psrlw_i2r (03, mm0);
      psrlw_i2r (03, mm3);

      /* pack 8 words to 8 bytes in mm0 */
      packuswb_r2r (mm3, mm0);

      movq_r2m ( mm0, *Dest++ );
    }
  }

  /* Copy last line (Line is the index the loop above stopped at) */
  xine_fast_memmove(pdst + Line * LineLength,
      psrc[0] + Line * LineLength, LineLength);

  /* clear out the MMX registers ready for doing floating point
   * again
   */
  emms();
#endif
}
707
/* Linear Blend filter - C version contributed by Rogerio Brito.
   This algorithm has the same interface as the other functions.

   The destination "screen" (pdst) is constructed from the source
   screen (psrc[0]) line by line.

   The i-th line of the destination screen is the average of 3 lines
   from the source screen: the (i-1)-th, i-th and (i+1)-th lines, with
   the i-th line having weight 2 in the computation.

   pdst   -> destination bitmap (width * height bytes are written)
   psrc   -> array of source bitmaps; only psrc[0] is read
   width  -> length of one line in bytes
   height -> number of lines in the frame

   Remarks:
   * each line on pdst doesn't depend on previous lines;
   * due to the way the algorithm is defined, the first & last lines of the
     screen aren't deinterlaced: they are copied from the source unchanged;
   * nothing is written when width or height is not positive, and a
     one-line frame is copied as-is.
*/
static void deinterlace_linearblend_yuv( uint8_t *pdst, uint8_t *psrc[],
    int width, int height )
{
  int x, y;
  uint8_t *l0, *l1, *l2, *l3;

  /* Nothing sensible can be done with an empty frame. */
  if (width <= 0 || height <= 0)
    return;

  l0 = pdst;       /* target line */
  l1 = psrc[0];    /* 1st source line */

  /* Copy the first line */
  xine_fast_memcpy(l0, l1, width);

  /* A single-line frame has no separate last line to copy. */
  if (height == 1)
    return;

  l0 += width;
  l2 = l1 + width; /* 2nd source line = line that follows l1 */
  l3 = l2 + width; /* 3rd source line = line that follows l2 */

  for (y = 1; y < height-1; ++y) {
    /* computes avg of: l1 + 2*l2 + l3 */

    for (x = 0; x < width; ++x) {
      l0[x] = (l1[x] + (l2[x]<<1) + l3[x]) >> 2;
    }

    /* updates the line pointers */
    l1 = l2; l2 = l3; l3 += width;
    l0 += width;
  }

  /* Copy the last line. After the loop l2 (not l1) is the final source
     line and l0 is the final target line. */
  xine_fast_memcpy(l0, l2, width);
}
754
/* Reports whether the MMX code paths may be used: 1 if so, 0 otherwise.
   Builds without USE_MMX always answer 0. With USE_MMX the result of
   xine_mm_accel() is queried once, cached in a function-local static, and
   tested for the MM_ACCEL_X86_MMX flag on every call.
*/
static int check_for_mmx(void)
{
#ifdef USE_MMX
  static int accel_flags = -1;   /* -1 = not queried yet */

  if ( accel_flags == -1 )
    accel_flags = xine_mm_accel();

  return (accel_flags & MM_ACCEL_X86_MMX) ? 1 : 0;
#else
  return 0;
#endif
}
769
770 /* generic YUV deinterlacer
771 pdst -> pointer to destination bitmap
772 psrc -> array of pointers to source bitmaps ([0] = most recent)
773 width,height -> dimension for bitmaps
774 method -> DEINTERLACE_xxx
775 */
776
777 void deinterlace_yuv( uint8_t *pdst, uint8_t *psrc[],
778 int width, int height, int method )
779 {
780 switch( method ) {
781 case DEINTERLACE_NONE:
782 xine_fast_memcpy(pdst,psrc[0],width*height);
783 break;
784 case DEINTERLACE_BOB:
785 if( check_for_mmx() )
786 deinterlace_bob_yuv_mmx(pdst,psrc,width,height);
787 else /* FIXME: provide an alternative? */
788 xine_fast_memcpy(pdst,psrc[0],width*height);
789 break;
790 case DEINTERLACE_WEAVE:
791 if( check_for_mmx() )
792 {
793 if( !deinterlace_weave_yuv_mmx(pdst,psrc,width,height) )
794 xine_fast_memcpy(pdst,psrc[0],width*height);
795 }
796 else /* FIXME: provide an alternative? */
797 xine_fast_memcpy(pdst,psrc[0],width*height);
798 break;
799 case DEINTERLACE_GREEDY:
800 if( check_for_mmx() )
801 {
802 if( !deinterlace_greedy_yuv_mmx(pdst,psrc,width,height) )
803 xine_fast_memcpy(pdst,psrc[0],width*height);
804 }
805 else /* FIXME: provide an alternative? */
806 xine_fast_memcpy(pdst,psrc[0],width*height);
807 break;
808 case DEINTERLACE_ONEFIELD:
809 if( check_for_mmx() )
810 deinterlace_onefield_yuv_mmx(pdst,psrc,width,height);
811 else /* FIXME: provide an alternative? */
812 xine_fast_memcpy(pdst,psrc[0],width*height);
813 break;
814 case DEINTERLACE_ONEFIELDXV:
815 lprintf("ONEFIELDXV must be handled by the video driver.\n");
816 break;
817 case DEINTERLACE_LINEARBLEND:
818 if( check_for_mmx() )
819 deinterlace_linearblend_yuv_mmx(pdst,psrc,width,height);
820 else
821 deinterlace_linearblend_yuv(pdst,psrc,width,height);
822 break;
823 default:
824 lprintf("unknown method %d.\n",method);
825 break;
826 }
827 }
828
829 int deinterlace_yuv_supported ( int method )
830 {
831 switch( method ) {
832 case DEINTERLACE_NONE:
833 return 1;
834 case DEINTERLACE_BOB:
835 case DEINTERLACE_WEAVE:
836 case DEINTERLACE_GREEDY:
837 case DEINTERLACE_ONEFIELD:
838 return check_for_mmx();
839 case DEINTERLACE_ONEFIELDXV:
840 lprintf ("ONEFIELDXV must be handled by the video driver.\n");
841 return 0;
842 case DEINTERLACE_LINEARBLEND:
843 return 1;
844 }
845
846 return 0;
847 }
848
/* NULL-terminated table of the deinterlace method names.
   NOTE(review): the order presumably mirrors the numeric values of the
   DEINTERLACE_* constants from deinterlace.h, so that a method value can be
   used as an index here -- confirm against the header before reordering.
*/
char *deinterlace_methods[] = {
  "none",
  "bob",
  "weave",
  "greedy",
  "onefield",
  "onefield_xv",
  "linearblend",
  NULL
};
859
860